In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
!pip install pymorphy2

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk import sent_tokenize, word_tokenize, regexp_tokenize
import os
import requests
from pathlib import Path
from nltk import sent_tokenize, word_tokenize, regexp_tokenize
from nltk.corpus import stopwords
import pymorphy2
from collections import Counter
# Fetch the NLTK 'punkt' resource at import time.
# Presumably this is what sent_tokenize (used in tokenize_n_lemmatize) needs — confirm against the NLTK docs.
nltk.download('punkt')
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy.random import default_rng
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [None]:
# Locations of the competition files: training set, test set and sample submission.
train_path, test_path, sample_submission_path = (
    '/kaggle/input/vk-cup-1/train.csv',
    '/kaggle/input/vk-cup-1/test.csv',
    '/kaggle/input/vk-cup-1/sample_submission.csv',
)

In [None]:
# Read the raw training table, the raw test table and the sample submission,
# in that order, into three DataFrames.
data, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (train_path, test_path, sample_submission_path)
)

In [None]:
# Raw-GitHub address of the Russian stop-word list from the stopwords-iso project.
# The notebook downloads it with get_text() and splits it with splitlines(),
# so it is presumably a plain-text file with one word per line — verify if the source changes.
url_stopwords_ru = "https://raw.githubusercontent.com/stopwords-iso/stopwords-ru/master/stopwords-ru.txt"


def get_text(url, encoding='utf-8', to_lower=True, timeout=30):
    """Return the text found at an HTTP(S) URL or in a local file.

    Parameters
    ----------
    url : str or path-like
        Either an ``http://`` / ``https://`` address or the path of an
        existing file. The value is converted with ``str()`` first, so
        ``pathlib.Path`` objects are accepted.
    encoding : str
        Encoding used to open a local file. It is not applied to HTTP
        responses, whose text is decoded by ``requests``.
    to_lower : bool
        When true (default) the text is lower-cased before it is returned.
    timeout : float or tuple
        Forwarded to ``requests.get`` so that an unresponsive server cannot
        block the notebook indefinitely.

    Returns
    -------
    str
        The downloaded or file content, lower-cased if requested.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If ``url`` is neither an HTTP(S) address nor an existing path.
    """
    url = str(url)
    # Match the scheme exactly: a bare 'http' prefix would also capture local
    # files whose name merely starts with "http".
    if url.startswith(('http://', 'https://')):
        response = requests.get(url, timeout=timeout)
        # raise_for_status() is a no-op for successful responses.
        response.raise_for_status()
        text = response.text
    elif os.path.exists(url):
        with open(url, encoding=encoding) as f:
            text = f.read()
    else:
        raise ValueError('parameter [url] can be either URL or a filename')
    return text.lower() if to_lower else text


def normalize_tokens(tokens):
    """Return the normal form (lemma) of every token, in input order.

    For each token the first parse returned by pymorphy2 is used and its
    ``normal_form`` is taken.

    The ``MorphAnalyzer`` is created on the first call and cached on the
    function object. Building a new analyzer per call would repeat its
    start-up work for every document when this is applied row by row.

    Parameters
    ----------
    tokens : iterable of str
        Words to lemmatize.

    Returns
    -------
    list of str
        One normal form per input token.
    """
    morph = getattr(normalize_tokens, '_morph', None)
    if morph is None:
        morph = pymorphy2.MorphAnalyzer()
        normalize_tokens._morph = morph
    return [morph.parse(tok)[0].normal_form for tok in tokens]


def remove_stopwords(tokens, stopwords=None, min_length=4):
    """Filter out stop words and tokens shorter than ``min_length``.

    When ``stopwords`` is empty or ``None`` the input is returned untouched
    (the very same object, with no length filtering). Otherwise a new list
    is built that keeps, in order, every token that is not a stop word and
    has at least ``min_length`` characters.
    """
    if stopwords:
        banned = set(stopwords)
        kept = []
        for word in tokens:
            # Same test order as a "not in ... and len(...) >= ..." filter.
            if word in banned or len(word) < min_length:
                continue
            kept.append(word)
        return kept
    return tokens


def tokenize_n_lemmatize(text, stopwords=None, normalize=True,
                         regexp=r'(?u)\b\w{4,}\b'):
    """Turn raw text into a list of (optionally lemmatized and filtered) words.

    Steps:
      1. split ``text`` into sentences with ``sent_tokenize``;
      2. collect from every sentence the substrings matching ``regexp``
         (by default, runs of four or more word characters);
      3. if ``normalize`` is true, pass the words through ``normalize_tokens``;
      4. if ``stopwords`` is non-empty, pass them through ``remove_stopwords``.

    Returns the resulting list of words.
    """
    tokens = []
    for sentence in sent_tokenize(text):
        tokens.extend(regexp_tokenize(sentence, regexp))
    if normalize:
        tokens = normalize_tokens(tokens)
    return remove_stopwords(tokens, stopwords) if stopwords else tokens

# Download the stop-word file (get_text lower-cases it by default) and split
# it into a list with one entry per line.
stopwords_text = get_text(url_stopwords_ru)
stopwords_ru = stopwords_text.splitlines()

In [None]:
def clean(doc):
    """Return ``doc`` as one space-separated string of processed words.

    The words come from ``tokenize_n_lemmatize`` called with the Russian
    stop-word list ``stopwords_ru``.
    """
    return " ".join(tokenize_n_lemmatize(doc, stopwords=stopwords_ru))

In [None]:
#%%time
#data['text'] = data['text'].apply(clean)

In [None]:
#data.to_csv('train_cleaned.csv')

In [None]:
#%%time
#test['text'] = test['text'].apply(clean)

In [None]:
#test.to_csv('test_cleaned.csv')

In [None]:
# Load the already-cleaned train and test tables; `data` and `test` are
# rebound to these frames from here on.
cleaned_paths = (
    '/kaggle/input/vk-cup-1/train_cleaned.csv',
    '/kaggle/input/vk-cup-1/test_cleaned.csv',
)
data, test = map(pd.read_csv, cleaned_paths)

In [None]:
# Discard every training row that contains a missing value.
data = data.dropna()

In [None]:
# Seeded generator, so the shuffle of the distinct oid values is reproducible.
rng = default_rng(seed=13)
data_oid = data['oid'].unique()
rng.shuffle(data_oid)  # shuffles data_oid in place

In [None]:
# The first 3000 shuffled oids form the training split; the rest are held out
# for validation. Splitting by oid keeps all rows of one oid in the same split.
train_val_ids = [data_oid[:3000], data_oid[3000:]]
train_ids, val_ids = (pd.Series(ids, name='oid') for ids in train_val_ids)

# Joining on 'oid' keeps, in each frame, only the rows of that split's oids.
train, val = (pd.merge(data, ids, on='oid') for ids in (train_ids, val_ids))

In [None]:
# Fit the TF-IDF vocabulary (at most 20000 features) on the training texts only.
tfidf_vectorizer = TfidfVectorizer(max_features=20000, use_idf=True)
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(list(train['text']))
# Dense copy of the sparse TF-IDF matrix; this is what the models below are fitted on.
train_posts = tfidf_vectorizer_vectors.toarray()

In [None]:
# Training features (dense TF-IDF matrix) and their category labels.
X_train, y_train = train_posts, train['category']
print(X_train.shape, y_train.shape)

In [None]:
# Vectorize the validation texts with the vocabulary fitted on the training split.
val_posts = [post for post in val['text']]
y_val = val['category']
val_matrix = tfidf_vectorizer.transform(val_posts)
X_val = val_matrix.toarray()
print(X_val.shape, y_val.shape)

In [None]:
gnb = GaussianNB() 
%time gnb.fit(X_train, y_train)

y_pred_train = gnb.predict(X_train)
y_pred_val = gnb.predict(X_val)
print("\nTraining Accuracy score:",accuracy_score(y_train, y_pred_train))
print("Testing Accuracy score:",accuracy_score(y_val, y_pred_val))

In [None]:
mnb = MultinomialNB() 
%time mnb.fit(X_train, y_train)

y_pred_mnb_train = mnb.predict(X_train)
y_pred_mnb_val = mnb.predict(X_val)
print("\nTraining Accuracy score:",accuracy_score(y_train, y_pred_mnb_train))
print("Testing Accuracy score:",accuracy_score(y_val, y_pred_mnb_val))

In [None]:
lr = LogisticRegression(random_state=13)
%time lr.fit(X_train, y_train)

y_pred_lr_train = lr.predict(X_train)
y_pred_lr_val = lr.predict(X_val)
print("\nTraining Accuracy score:",accuracy_score(y_train, y_pred_lr_train))
print("Testing Accuracy score:",accuracy_score(y_val, y_pred_lr_val))

In [None]:
svc =  LinearSVC(class_weight='balanced') 
%time svc.fit(X_train, y_train)

y_pred_svc_train = svc.predict(X_train)
y_pred_svc_val = svc.predict(X_val)
print("\nTraining Accuracy score:",accuracy_score(y_train, y_pred_svc_train))
print("Testing Accuracy score:",accuracy_score(y_val, y_pred_svc_val))

In [None]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=13)
%time dt.fit(X_train, y_train)

y_pred_dt_train = dt.predict(X_train)
y_pred_dt_val = dt.predict(X_val)
print("\nTraining Accuracy score:",accuracy_score(y_train, y_pred_dt_train))
print("Testing Accuracy score:",accuracy_score(y_val, y_pred_dt_val))

In [None]:
from sklearn.ensemble import VotingClassifier

# Combine the five models defined above into one voting ensemble
# (VotingClassifier is used with its default settings).
classifiers = list(zip(
    ('Decision Tree', 'Logistic Regression', 'Naive Bayes',
     'Multinomial Naive Bayes', 'Linear SVC'),
    (dt, lr, gnb, mnb, svc),
))
vc = VotingClassifier(estimators=classifiers)

vc.fit(X_train, y_train)
y_pred_vc_train, y_pred_vc_val = vc.predict(X_train), vc.predict(X_val)
train_acc = accuracy_score(y_train, y_pred_vc_train)
# The "Testing" figure is computed on the validation split (X_val / y_val).
val_acc = accuracy_score(y_val, y_pred_vc_val)
print("Training Accuracy score:", train_acc)
print("Testing Accuracy score:", val_acc)

In [None]:
# Raw test texts as a plain list. X_test is reassigned to the TF-IDF matrix
# in the next cell, so this value is only a temporary.
X_test = [post for post in test['text']]

In [None]:
# Vectorize the test texts with the vocabulary fitted on the training split.
# A text that is empty after cleaning is read back from CSV as NaN. The training
# frame is protected by dropna(), but test rows cannot be dropped, so any
# non-string value is replaced by an empty string before it reaches the
# vectorizer, which only accepts strings.
test_posts = [post if isinstance(post, str) else '' for post in test['text']]
X_test = tfidf_vectorizer.transform(test_posts).toarray()

In [None]:
# Predict a category for every row of the test feature matrix with the voting ensemble.
y_pred = vc.predict(X_test)

In [None]:
# Store the per-row predictions on the test frame as the 'category' column.
test['category'] = y_pred

In [None]:
# Spot check: show the test rows (with predicted category) of one particular oid.
test.loc[test['oid'] == 1622114]

In [None]:
# Majority vote per oid:
#   1. count the rows of every (oid, category) pair;
#   2. order the pairs by oid and, within an oid, by ascending 'text' count;
#   3. keep the last row of each oid, i.e. its most frequently predicted category.
category_counts = test.groupby(by=['oid', 'category']).count()
ranked_counts = category_counts.sort_values(by=['oid', 'text'])
grouped_test = ranked_counts.groupby(level=0).tail(1).reset_index()

In [None]:
# Keep only the two submission columns: the oid and its chosen category.
subm = grouped_test.loc[:, ['oid', 'category']]

In [None]:
# Write the submission table to 'submission3.csv' in the working directory.
# NOTE(review): to_csv is called with its defaults, so the DataFrame index is
# written as an extra leading column — confirm the expected submission format
# allows that (otherwise pass index=False).
subm.to_csv('submission3.csv')