#### Импортируем необходимые библиотеки

In [158]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords

In [159]:
# Silence FutureWarning messages (these are warnings, not errors) so they do not clutter cell output
from warnings import simplefilter
simplefilter('ignore', FutureWarning)

#### Загружаем данные

In [160]:
# Read the tab-separated training set; header=None makes pandas label the columns 0 and 1
train = pd.read_csv('products_sentiment_train.tsv', sep='\t', header=None)
print(train.head())
print(train.shape)

                                                   0  1
0          2 . take around 10,000 640x480 pictures .  1
1  i downloaded a trial version of computer assoc...  1
2  the wrt54g plus the hga7t is a perfect solutio...  1
3  i dont especially like how music files are uns...  0
4  i was using the cheapie pail ... and it worked...  1
(2000, 2)


#### Создаем список из текстов всех имеющихся отзывов (texts), а также список с классами, которые будет использовать классификатор (labels) - 0 для негативных отзывов и 1 для позитивных.

In [161]:
# Column 0 holds the review text, column 1 the 0/1 sentiment label
texts = list(train[0])
labels = list(train[1])
print(texts[:5], labels[:5], sep='\n\n')

['2 . take around 10,000 640x480 pictures .', 'i downloaded a trial version of computer associates ez firewall and antivirus and fell in love with a computer security system all over again .', 'the wrt54g plus the hga7t is a perfect solution if you need wireless coverage in a wider area or for a hard-walled house as was my case .', 'i dont especially like how music files are unstructured ; basically they are just dumped into one folder with no organization , like you might have in windows explorer folders and subfolders .', 'i was using the cheapie pail ... and it worked ok until the opening device fell apart .']

[1, 1, 1, 0, 1]


In [162]:
# Dataset size and the share of reviews labelled 1
n_reviews = len(texts)
positive_share = sum(labels) / len(labels)
print('Количество отзывов =', n_reviews)
print('Доля класса 1 в выборке =', positive_share)

Количество отзывов = 2000
Доля класса 1 в выборке = 0.637


### Здесь и далее оценка качества будет выполняться с помощью cross_val_score.

In [163]:
# Helper used to evaluate different vectorizer/classifier combinations

def text_classifier(vectorizer, classifier):
    """Build a two-step Pipeline: ``vectorizer`` followed by ``classifier``.

    Both objects are placed into the Pipeline as given, under the step
    names "vectorizer" and "classifier".
    """
    steps = [("vectorizer", vectorizer), ("classifier", classifier)]
    return Pipeline(steps)

In [164]:
# Fixed seed so the comparison is reproducible between runs: SGDClassifier
# shuffles the training data while fitting, and LinearSVC / LogisticRegression
# also accept `random_state` for their randomized solvers.
RANDOM_STATE = 42

for vec in [CountVectorizer, TfidfVectorizer]:
    for clf in [LogisticRegression, LinearSVC, SGDClassifier]:
        model = text_classifier(vec(), clf(random_state=RANDOM_STATE))
        # 5-fold cross-validation with the estimator's default scorer
        scores = cross_val_score(model, texts, labels, cv=5)
        print("vectorizer -", vec)
        print("classifier -", clf)
        print('crossvalscore mean =', scores.mean())
        print()

vectorizer - <class 'sklearn.feature_extraction.text.CountVectorizer'>
classifier - <class 'sklearn.linear_model.logistic.LogisticRegression'>
crossvalscore mean = 0.7684956843480272

vectorizer - <class 'sklearn.feature_extraction.text.CountVectorizer'>
classifier - <class 'sklearn.svm.classes.LinearSVC'>
crossvalscore mean = 0.754000653129082

vectorizer - <class 'sklearn.feature_extraction.text.CountVectorizer'>
classifier - <class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'>
crossvalscore mean = 0.7404818436365227

vectorizer - <class 'sklearn.feature_extraction.text.TfidfVectorizer'>
classifier - <class 'sklearn.linear_model.logistic.LogisticRegression'>
crossvalscore mean = 0.7665031843949025

vectorizer - <class 'sklearn.feature_extraction.text.TfidfVectorizer'>
classifier - <class 'sklearn.svm.classes.LinearSVC'>
crossvalscore mean = 0.7684856717854487

vectorizer - <class 'sklearn.feature_extraction.text.TfidfVectorizer'>
classifier - <class 'sklearn.linear_model.

### Выбираем CountVectorizer и LogisticRegression

In [165]:
# Load the English stop-word list from nltk.
# CountVectorizer's `stop_words` parameter is documented as 'english', a list,
# or None; recent scikit-learn releases validate the type and reject a set,
# so keep a de-duplicated, sorted list instead.
stop_words = sorted(set(stopwords.words('english')))

In [166]:
# Use a dedicated name so the `text_classifier` helper function defined earlier
# in the notebook is not overwritten by a Pipeline instance.
nltk_stopwords_clf = Pipeline([("vectorizer", CountVectorizer(stop_words=stop_words)),
                               ("classifier", LogisticRegression())])

print(cross_val_score(nltk_stopwords_clf, texts, labels, cv=5).mean())

0.7540043687773048


In [167]:
# Same experiment with scikit-learn's built-in English stop-word list.
# A dedicated name keeps the `text_classifier` helper function intact.
builtin_stopwords_clf = Pipeline([("vectorizer", CountVectorizer(stop_words='english')),
                                  ("classifier", LogisticRegression())])

print(cross_val_score(builtin_stopwords_clf, texts, labels, cv=5).mean())

0.7469993812461327


#### Качество только ухудшилось (0.754 и 0.747 против 0.768 без фильтрации), поэтому стоп-слова использовать не будем

#### Попробуем в CountVectorizer добавить к словам биграммы и триграммы (параметр ngram_range) и измерить качество модели.

In [168]:
# Compare n-gram ranges (i, j): (1, 1), (1, 2), (1, 3), (2, 2), (2, 3)
for i in range(1, 3):
    for j in range(i, 4):
        print('i =', i)
        print('j =', j)
        # Dedicated name: avoids rebinding the `text_classifier` helper function
        ngram_clf = Pipeline([("vectorizer", CountVectorizer(ngram_range=(i, j))),
                              ("classifier", LogisticRegression())])

        print(cross_val_score(ngram_clf, texts, labels, cv=5).mean())
        print()

i = 1
j = 1
0.7684956843480272

i = 1
j = 2
0.7705044437777736

i = 1
j = 3
0.7665106656916605

i = 2
j = 2
0.7159955530972069

i = 2
j = 3
0.6969954905968162



#### Остановимся на ngram_range=(1, 2)

#### Загрузим в модель тестовые данные и составим таблицу с ответами.

In [169]:
# Read the tab-separated test set; its first row is used as the header (columns Id, text)
test = pd.read_csv('products_sentiment_test.tsv', sep='\t')
print(test.head())
print(test.shape, end='\n\n')
test_texts = list(test['text'])
print(test_texts[:5])

   Id                                               text
0   0  so , why the small digital elph , rather than ...
1   1  3/4 way through the first disk we played on it...
2   2  better for the zen micro is outlook compatibil...
3   3    6 . play gameboy color games on it with goboy .
4   4  likewise , i 've heard norton 2004 professiona...
(500, 2)

["so , why the small digital elph , rather than one of the other cameras with better resolution or picture quality ? size [ + 2 ] # # because , unless it 's small , i won 't cary it around .", '3/4 way through the first disk we played on it ( naturally on 31 days after purchase ) the dvd player froze . ', 'better for the zen micro is outlook compatibility .', '6 . play gameboy color games on it with goboy .', "likewise , i 've heard norton 2004 professional version is fine too ."]


In [219]:
# Final model: unigram + bigram counts fed to logistic regression,
# fitted on the whole training set and applied to the test texts
final_steps = [("vectorizer", CountVectorizer(ngram_range=(1, 2))),
               ("classifier", LogisticRegression())]
clf_pipeline = Pipeline(final_steps)
predict = clf_pipeline.fit(texts, labels).predict(test_texts)
print(predict)

[1 0 1 1 0 0 1 0 0 1 1 0 1 0 1 1 1 0 1 1 1 1 0 1 1 0 1 0 1 1 1 0 1 1 1 1 1
 1 1 0 0 1 1 1 1 0 1 1 1 0 1 0 1 1 0 1 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 0 1 0
 1 0 0 0 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 0 1 1 1 0 1 1 0 1 1 1 1 1
 1 0 1 1 1 1 1 1 0 0 1 0 1 1 0 1 1 1 1 1 0 1 1 1 1 0 1 0 1 1 0 1 0 1 1 0 0
 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 0 1 1 1 1 1 0 1 1 1 0 1 1 0 1 0 1 0 1 1
 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 0 1 0 1 1 1 0 1 1
 1 1 1 1 0 0 1 0 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 0
 1 1 0 0 1 0 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 1 1 1 1 0 1 1 0
 1 1 1 1 0 1 1 1 0 1 0 1 0 1 1 1 1 1 1 0 1 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 0
 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 0 1 0 1 0 1 1 1 1 0 1 1 0 1 1 0 1 1 1 1 0
 1 1 1 1 1 0 0 0 0 1 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1
 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1
 0 1 0 1 1 1 0 0 0 1 0 0 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 0 1
 1 1 0 1 1 1 1 1 1 1 0 1 

In [217]:
def write_answer_1(optimal_d, ids=None, path="clf_sentiment.csv"):
    """Write predictions to a two-column CSV file with the header ``Id,y``.

    Parameters
    ----------
    optimal_d : sequence
        Predicted labels, one per row. (The parameter name is kept for
        backward compatibility with existing callers.)
    ids : sequence, optional
        Row identifiers written to the ``Id`` column. When omitted, the
        ``Id`` column of the notebook-level ``test`` DataFrame is used.
    path : str, optional
        Output file path; defaults to ``clf_sentiment.csv``.

    Raises
    ------
    ValueError
        If the number of identifiers differs from the number of predictions.
    """
    if ids is None:
        ids = test['Id']
    ids = list(ids)
    predictions = list(optimal_d)
    # Validate before opening the file so a mismatch never leaves a partial CSV behind
    if len(ids) != len(predictions):
        raise ValueError(
            "got {} ids but {} predictions".format(len(ids), len(predictions))
        )
    with open(path, "w") as fout:
        fout.write('Id,y\n')
        for row_id, label in zip(ids, predictions):
            fout.write('{},{}\n'.format(row_id, label))

In [218]:
# Pass the test-set predictions stored in `predict`; `predict_list` is not
# defined anywhere in the visible notebook and fails on a fresh kernel.
write_answer_1(predict)