In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
from sklearn.model_selection import cross_val_score

In [4]:
from sklearn.pipeline import Pipeline

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
from sklearn.linear_model import LogisticRegression

In [8]:
from sklearn.svm import LinearSVC

In [9]:
from sklearn.linear_model import SGDClassifier

In [10]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [11]:
# Load the reviews table; the first CSV column is used as the row index.
data = pd.read_csv('reviews.csv', encoding='utf-8', index_col=0)

In [12]:
# Preview the first rows to check that the text columns were read correctly.
data.head()

Unnamed: 0,pos,neg
0,Всё устраивает.,Нет
1,"Дизайн, непонимаю почему людям так не нравится...","Хотя не знаю,для кого то может показаться и пл..."
2,"основная камера - шикарная. Хороший дисплей, с...",Явных нет.
3,Все шикарно,Нет
4,"Не знаю почему нет модели на 6128 Гб, но тут п...","Очень слабый аккумулятор, можно было хотя бы 4..."


In [20]:
# Training corpus: every 'pos' text first, followed by every 'neg' text.
pos_texts = data['pos']
neg_texts = data['neg']
X_train = np.hstack((pos_texts, neg_texts))

In [21]:
# (rows, columns) of the reviews table; each row supplies one 'pos' and one 'neg' text.
data.shape

(3070, 2)

In [22]:
# Labels follow the layout of X_train: one 'pos' per row of `data`, then one 'neg'
# per row. The count is taken from the data instead of a hard-coded 3070 so the
# labels stay aligned if reviews.csv changes.
y_train = np.array(['pos'] * len(data) + ['neg'] * len(data))

In [25]:
# Records in test.csv are separated by a blank line.
with open('test.csv', 'r', encoding='utf-8') as file:
    raw_records = file.read().split('\n\n')

# The first 8 and last 9 characters of every record are stripped -- presumably a
# fixed opening/closing wrapper around each review (TODO confirm against test.csv).
# The final chunk is discarded (presumably the remainder after the trailing blank
# line). Doing it in this cell keeps the step idempotent: re-running can never
# drop additional reviews.
X_test = np.array([record[8:-9] for record in raw_records])[:-1]

In [34]:
# Load the table holding the reference labels for the test reviews.
data_test = pd.read_csv('y_test.csv', encoding='utf-8')

In [35]:
# Reference test labels as a plain NumPy array, taken from the 'value' column.
y_test = np.array(data_test['value'])

In [36]:
# Texts and labels must line up one-to-one before any model is fitted or scored.
assert X_train.shape == y_train.shape, 'train texts/labels length mismatch'
assert X_test.shape == y_test.shape, 'test texts/labels length mismatch'
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((6140,), (6140,), (100,), (100,))

In [37]:
# Cross-validate every vectorizer/classifier pair. The result list is created in
# the same cell as the loop so that re-running it does not append duplicate rows.
estimators = []
for vectorizer_cls in [CountVectorizer, TfidfVectorizer]:
    for classifier_cls in [LogisticRegression, SGDClassifier, LinearSVC]:
        # Fixed seed so the reported scores are reproducible between runs.
        pipe = Pipeline([ ('vect', vectorizer_cls()), ('class', classifier_cls(random_state=42)) ])
        mean_score = cross_val_score(pipe, X_train, y_train).mean()
        estimators.append(( str(vectorizer_cls), str(classifier_cls), mean_score ))



In [40]:
# One line per (vectorizer, classifier, mean cross-validation score) triple.
for result in estimators:
    print(result)

("<class 'sklearn.feature_extraction.text.CountVectorizer'>", "<class 'sklearn.linear_model.logistic.LogisticRegression'>", 0.8903868256659336)
("<class 'sklearn.feature_extraction.text.CountVectorizer'>", "<class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'>", 0.8807785516454872)
("<class 'sklearn.feature_extraction.text.CountVectorizer'>", "<class 'sklearn.svm.classes.LinearSVC'>", 0.885502422155018)
("<class 'sklearn.feature_extraction.text.TfidfVectorizer'>", "<class 'sklearn.linear_model.logistic.LogisticRegression'>", 0.8949482539609809)
("<class 'sklearn.feature_extraction.text.TfidfVectorizer'>", "<class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'>", 0.8967406868585045)
("<class 'sklearn.feature_extraction.text.TfidfVectorizer'>", "<class 'sklearn.svm.classes.LinearSVC'>", 0.8986944482628707)


In [41]:
# Learn the vocabulary/IDF weights and vectorize the training texts in one pass;
# fit_transform is equivalent to fit followed by transform but avoids tokenizing
# the corpus twice.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)

In [43]:
# Fixed seed: SGD shuffles the training data while fitting, so without it the
# learned weights (and therefore the test accuracy and the pickled model) change
# from run to run.
classifier = SGDClassifier(class_weight='balanced', random_state=42).fit(X_train_vec, y_train)

In [44]:
# Vectorize the test texts with the already-fitted vectorizer (transform only, no refit).
X_test_vec = vectorizer.transform(X_test)

In [45]:
# Predicted label for every test review.
y_pred = classifier.predict(X_test_vec)

In [49]:
from sklearn.metrics import accuracy_score

In [50]:
# scikit-learn's signature is accuracy_score(y_true, y_pred): reference labels first.
accuracy_score(y_test, y_pred)

0.85

In [68]:
# One Id per prediction, derived from the number of predictions instead of a
# hard-coded 100 so the frame is always well-formed.
pd.DataFrame({'Id': list(range(len(y_pred))), 'y': y_pred}).to_csv(
    'submission.csv', encoding='utf-8', index=False
)

Сохраним обученные векторизатор и классификатор в файлы pickle для дальнейшего использования модели в веб-API:

In [69]:
import pickle

In [70]:
# Context manager guarantees the file is flushed and closed after the dump.
with open('vectorizer.pickle', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [71]:
# Context manager guarantees the file is flushed and closed after the dump.
with open('classifier.pickle', 'wb') as classifier_file:
    pickle.dump(classifier, classifier_file)