## Классификатор отзывов
В качестве основы выбрана модель LinearSVC с параметрами, подобранными на предыдущей неделе; модель обучена на корпусе отзывов на фильмы.

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from nltk.corpus import movie_reviews
from sklearn.metrics import accuracy_score
import joblib
import pickle

### Данные

In [2]:
# File ids of the reviews in the 'neg' and 'pos' categories of the corpus.
negids, posids = (movie_reviews.fileids(category) for category in ('neg', 'pos'))

In [3]:
def _review_text(fileid):
    """Return one review as a single string of space-joined corpus tokens."""
    return ' '.join(movie_reviews.words(fileids=[fileid]))


# Materialise every review as plain text, one string per corpus file.
negfeats = [_review_text(fileid) for fileid in negids]
posfeats = [_review_text(fileid) for fileid in posids]

# Negatives come first, then positives; labels follow the same order
# (0 for a review from negids, 1 for a review from posids).
texts = negfeats + posfeats
labels = [0 for _ in negfeats] + [1 for _ in posfeats]

### Настройка классификатора

In [9]:
def make_pipeline(vectorizer, transformer, classifier):
    """Assemble a three-step scikit-learn Pipeline.

    The steps are registered, in order, under the names "vectorizer",
    "transformer" and "classifier".

    Args:
        vectorizer: first step of the pipeline.
        transformer: second step of the pipeline.
        classifier: final step of the pipeline.

    Returns:
        The (unfitted) Pipeline built from the three steps.
    """
    steps = [
        ("vectorizer", vectorizer),
        ("transformer", transformer),
        ("classifier", classifier),
    ]
    return Pipeline(steps)

In [10]:
def make_classifier(text, label):
    """Build and fit the sentiment pipeline with fixed hyper-parameters.

    Args:
        text: training documents, passed as X to the pipeline's fit().
        label: training targets, passed as y to the pipeline's fit().

    Returns:
        The fitted pipeline.
    """
    vectorizer = TfidfVectorizer(ngram_range=(1, 7), min_df=1, max_df=0.72,
                                 stop_words=None)
    # NOTE(review): TfidfVectorizer already produces TF-IDF weights, so this
    # extra TfidfTransformer appears to re-weight them a second time. Confirm
    # this is intended before changing it, since the hyper-parameters were
    # presumably tuned with this exact pipeline.
    transformer = TfidfTransformer()
    svm = LinearSVC(max_iter=5000, tol=0.001, loss='hinge', C=1.9,
                    random_state=1)

    model = make_pipeline(vectorizer, transformer, svm)
    model.fit(text, label)
    return model

In [12]:
%%time
# Fit the sentiment pipeline on the whole corpus (all of `texts` / `labels`).
classifier = make_classifier(texts, labels)
# Bare expression: the notebook renders the fitted pipeline as the cell output.
classifier

Wall time: 1min 30s


Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.72, max_features=None,
                                 min_df=1, ngram_range=(1, 7), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pat...\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('transformer',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('

In [15]:
# NOTE: `classifier` was fitted on these same `texts`, so this value is the
# training accuracy, not an estimate of performance on unseen reviews.
accuracy_score(labels, classifier.predict(texts))

1.0

In [26]:
# Serialise the fitted pipeline to disk as a plain pickle file.
with open("sentement_classifier", 'wb') as model_file:
    pickle.dump(classifier, model_file)

### Тестирование

In [19]:
# The model file is written with pickle.dump, so read it back with pickle.load;
# joblib.load is only documented for files produced by joblib.dump.
# Unpickling can execute arbitrary code: only load model files you trust.
with open('sentement_classifier', 'rb') as model_file:
    cls = pickle.load(model_file)

In [20]:
# Bare expression: show the restored pipeline so it can be compared by eye
# with the one that was saved.
cls

Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.72, max_features=None,
                                 min_df=1, ngram_range=(1, 7), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pat...\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('transformer',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('

In [24]:
# Spot-check the restored model on the first positive review. `labels` lists
# all negatives first, so that review's label sits at index len(negfeats).
pos_sample = posfeats[0]
print('Отзыв:', pos_sample)
print('true label', labels[len(negfeats)])
print('predict label', cls.predict([pos_sample])[0])


Отзыв: films adapted from comic books have had plenty of success , whether they ' re about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there ' s never really been a comic book like from hell before . for starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid ' 80s with a 12 - part series called the watchmen . to say moore and campbell thoroughly researched the subject of jack the ripper would be like saying michael jackson is starting to look a little odd . the book ( or " graphic novel , " if you will ) is over 500 pages long and includes nearly 30 more that consist of nothing but footnotes . in other words , don ' t dismiss this film because of its source . if you can get past the whole comic book thing , you might find another stumbling block in from hell ' s directors , albert and allen hughes . getting the hughes brothers to direct this seem

AttributeError: 'LinearSVC' object has no attribute 'predict_proba'

In [23]:
# Spot-check the restored model on the first negative review, whose label is
# the first entry of `labels`.
neg_sample = negfeats[0]
print('Отзыв:', neg_sample)
print('true label', labels[0])
print('predict label', cls.predict([neg_sample])[0])

Отзыв: plot : two teen couples go to a church party , drink and then drive . they get into an accident . one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . what ' s the deal ? watch the movie and " sorta " find out . . . critique : a mind - fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . which is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just didn ' t snag this one correctly . they seem to have taken this pretty neat concept , but executed it terribly . so what are the problems with the movie ? well , its main problem is that it ' s simply too jumbled . it starts off " normal " but then downshifts into this " fantasy " world in which you , as an audience member , have 