In [180]:
import pandas as pd
import numpy as np
import sklearn as skl

In [181]:
train_raw = pd.read_csv("data/train_data.csv")
train_raw.columns = ["id", "other", "friend", "label"]
val_raw = pd.read_csv("data/val_data.csv")
val_raw.columns = ["id", "other", "friend", "label"]

In [182]:
print(train_raw.isnull().sum())
train_raw = train_raw.dropna()
train_raw.dropna().isnull().sum()

id        0
other     1
friend    0
label     0
dtype: int64


id        0
other     0
friend    0
label     0
dtype: int64

In [183]:
val_raw.isnull().sum()

id        0
other     0
friend    0
label     0
dtype: int64

In [184]:
train_raw_counts = train_raw.label.value_counts()
train_raw_counts

РОСС       4413
РЕЙЧЕЛ     4401
ЧЕНДЛЕР    4263
ДЖОУИ      4156
МОНИКА     4011
ФИБИ       3748
Name: label, dtype: int64

In [185]:
val_row_counts = val_raw.label.value_counts()
val_row_counts

РОСС       491
РЕЙЧЕЛ     489
ЧЕНДЛЕР    474
ДЖОУИ      462
МОНИКА     446
ФИБИ       416
Name: label, dtype: int64

In [186]:
for tr, vl in zip(train_raw_counts.items(), val_row_counts.items()):
    name, val_tr = tr
    name, val_vl = vl
    print("\t".join([name, "{:.2f}".format(val_tr / val_vl)]))



РОСС	8.99
РЕЙЧЕЛ	9.00
ЧЕНДЛЕР	8.99
ДЖОУИ	9.00
МОНИКА	8.99
ФИБИ	9.01


In [187]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import nltk
import re
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

In [188]:
# united_raw = train_raw.append(val_raw)
united_raw = train_raw

# united_raw.friend = united_raw.friend.str.replace("[^\w\s]", "", regex=True).str.lower()
# val_raw.friend = val_raw.friend.str.replace("[^\w\s]", "", regex=True).str.lower()
# le = LabelEncoder() #толку нет
# united_raw.label = le.fit(united_raw.label).transform(united_raw.label)
# val_raw.label = le.fit(val_raw.label).transform(val_raw.label)
# united_raw["concat"] = united_raw.other + ' ' + united_raw.friend

In [192]:
#Preprocess
import spacy
sp = spacy.load('ru_core_news_sm')

def preprocess(sentence):
    sentence=str(sentence)
    sentence = sentence.lower()
    sentence=sentence.replace('{html}',"")
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)
    rem_url=re.sub(r'http\S+', '',cleantext)
    rem_num = re.sub('[0-9]+', '', rem_url)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(rem_num)
    # filtered_words = [w for w in tokens if len(w) > 2 if not w in stopwords.words('russian')]
    # stem_words=[stemmer.stem(w) for w in filtered_words]
    lemma_words=[w.lemma_ for w in sp(" ".join(tokens))]
    return " ".join(lemma_words)

united_raw["clean_friend"] = united_raw.friend.map(lambda s: preprocess(s))
val_raw["clean_friend"] = val_raw.friend.map(lambda s: preprocess(s))

In [197]:
x_train, x_test, y_train, y_test = train_test_split(
    united_raw.clean_friend,
    united_raw.label,
    test_size=0.1,
    random_state=42
)
x_val, y_val = val_raw.clean_friend, val_raw.label

In [199]:
sgd_ppl_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('sgd_clf', SGDClassifier(random_state=42))])
sgd_ppl_clf.fit(x_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('sgd_clf', SGDClassifier(random_state=42))])

In [201]:
predicted_sgd = sgd_ppl_clf.predict(x_test)
print(metrics.classification_report(predicted_sgd, y_test))

              precision    recall  f1-score   support

       ДЖОУИ       0.38      0.28      0.32       538
      МОНИКА       0.25      0.26      0.25       388
      РЕЙЧЕЛ       0.26      0.28      0.27       417
        РОСС       0.34      0.30      0.32       486
        ФИБИ       0.23      0.27      0.25       324
     ЧЕНДЛЕР       0.19      0.24      0.21       347

    accuracy                           0.28      2500
   macro avg       0.28      0.27      0.27      2500
weighted avg       0.29      0.28      0.28      2500



In [205]:
X_test = pd.read_csv('data/test.csv')
X_test["clean_friend"] = X_test.friend_response.map(lambda s: preprocess(s))

In [207]:
%%time
result = sgd_ppl_clf.predict(X_test.clean_friend)

res_df = pd.DataFrame(result.T)
res_df.columns = ["Category"]
res_df.to_csv("submission.csv", index_label="Id")

Wall time: 47 ms
