In [2]:
import pandas as pd
import numpy as np
import sklearn as skl
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import SGDClassifier, LogisticRegression, LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import nltk
import re
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

In [3]:
# Read the training table from disk; later cells use its `friend` (text)
# and `label` (target) columns.
clean = pd.read_csv(filepath_or_buffer="data/clean.csv")

In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across kernel restarts.
# NOTE(review): the training text is read from the `friend` column, while the
# submission cell predicts on `clean_friend` -- confirm both columns hold text
# that went through the same preprocessing.
x_train, x_test, y_train, y_test = train_test_split(
    clean["friend"],
    clean["label"],
    shuffle=True,
    random_state=42,
    test_size=0.2,
)

In [5]:
# Unfitted baseline pipeline used as the estimator for the grid search:
# TF-IDF features feeding a logistic-regression classifier, both with
# default settings. The step names ("tfidf", "clf") are the prefixes used
# in the hyper-parameter grid keys.
# NOTE: despite the `sgd_` prefix in the variable name, the classifier is
# LogisticRegression, not SGDClassifier.
sgd_ppl_clf = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("clf", LogisticRegression()),
    ]
)

In [39]:
%%time
from time import time

parameters = {
    # "vect__max_df": (0.5, 1.0),
    # # 'vect__max_features': (None, 5000, 10000, 50000),
    # "vect__ngram_range": ((1, 1), (1, 2)),  # unigrams or bigrams
    # 'tfidf__use_idf': (True, False),
    'tfidf__norm': ['l2'],
    "clf__max_iter": (125, 250, 300),
    "clf__random_state": (0, 1, 2),
    # "clf__alpha": (0.00001, 0.000001),
    "clf__C": (0.5, 0.55, 0.625),
    "clf__penalty": ["l2"],
    # 'clf__max_iter': (10, 50, 80),
}
grid_search = GridSearchCV(sgd_ppl_clf, parameters, n_jobs=-1, verbose=1)
print("Performing grid search...")
print("pipeline:", [name for name, _ in sgd_ppl_clf.steps])
print("parameters:")
print(parameters)
t0 = time()
grid_search.fit(x_train, y_train)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['tfidf', 'clf']
parameters:
{'tfidf__norm': ['l2'], 'clf__max_iter': (125, 250, 300), 'clf__random_state': (0, 1, 2), 'clf__C': (0.5, 0.55, 0.625), 'clf__penalty': ['l2']}
Fitting 5 folds for each of 27 candidates, totalling 135 fits
done in 169.602s

Best score: 0.258
Best parameters set:
	clf__C: 0.55
	clf__max_iter: 250
	clf__penalty: 'l2'
	clf__random_state: 0
	tfidf__norm: 'l2'
Wall time: 2min 49s


In [7]:
%%time
sgd_ppl_clf = Pipeline([
    ('tfidf', TfidfVectorizer(norm='l2')),
    ('clf', LogisticRegression(C=0.55, max_iter=250, penalty='l2', random_state=0))
])
sgd_ppl_clf.fit(x_train, y_train)

predicted_sgd = sgd_ppl_clf.predict(x_test)
print(metrics.classification_report(predicted_sgd, y_test))

              precision    recall  f1-score   support

       ДЖОУИ       0.29      0.30      0.30       887
      МОНИКА       0.22      0.26      0.24       743
      РЕЙЧЕЛ       0.32      0.29      0.30      1099
        РОСС       0.29      0.24      0.26      1139
        ФИБИ       0.21      0.27      0.24       655
     ЧЕНДЛЕР       0.28      0.25      0.27      1031

    accuracy                           0.27      5554
   macro avg       0.27      0.27      0.27      5554
weighted avg       0.27      0.27      0.27      5554

Wall time: 4.9 s


In [9]:
%%time
X_test = pd.read_csv('data/test_clean.csv')
# X_test["clean_friend"] = X_test.friend_response.map(lambda s: preprocess(s))

Wall time: 33 ms


In [10]:
%%time
result = sgd_ppl_clf.predict(X_test.clean_friend)

res_df = pd.DataFrame(result.T)
res_df.columns = ["Category"]
res_df.to_csv("submission.csv", index_label="Id")

Wall time: 61 ms
