In [1]:
import nltk
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/LeGatto53/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/LeGatto53/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
class LwClassifier:
    """Bag-of-words sentiment classifier over the NLTK ``movie_reviews`` corpus.

    On construction every negative and positive review is read, lower-cased,
    stripped of English stop words, stemmed, and vectorised with
    ``CountVectorizer``.

    Attributes set by ``__init__``:
        X: dense term-count matrix, one row per review (negative reviews
            first, then positive ones).
        labels: list of 0 (negative) / 1 (positive), aligned with the rows
            of ``X``.
    """

    def __init__(self):
        negids = movie_reviews.fileids('neg')
        posids = movie_reviews.fileids('pos')
        # Re-join each review's tokens into a single string so it can be
        # passed through stem() and then to CountVectorizer.
        negfeats = [" ".join(movie_reviews.words(fileids=[f])) for f in negids]
        posfeats = [" ".join(movie_reviews.words(fileids=[f])) for f in posids]

        texts = self.stem(negfeats + posfeats)
        self.labels = [0] * len(negfeats) + [1] * len(posfeats)
        cv = CountVectorizer()
        self.X = cv.fit_transform(texts).toarray()

    @staticmethod
    def stem(texts):
        """Lower-case, drop English stop words from, and stem each text.

        Args:
            texts: iterable of strings.

        Returns:
            A list with one string per input text: the stemmed, non-stop-word
            tokens (split on whitespace) joined by single spaces.
        """
        ps = PorterStemmer()
        # Load the stop-word list once. Building it inside the per-word filter
        # would re-read the list for every single token of every review.
        stop_words = set(stopwords.words('english'))
        return [
            ' '.join(
                ps.stem(word)
                for word in text.lower().split()
                if word not in stop_words
            )
            for text in texts
        ]

    def get_train_and_test(self, test_size=0.30, random_state=0):
        """Split ``X`` / ``labels`` into shuffled train and test parts.

        Args:
            test_size: value forwarded to ``train_test_split`` as ``test_size``.
            random_state: seed forwarded to ``train_test_split``.

        Returns:
            The ``train_test_split`` result, unpackable as
            ``X_train, X_test, y_train, y_test``.
        """
        return train_test_split(self.X,
                                self.labels,
                                test_size=test_size,
                                random_state=random_state,
                                shuffle=True)

    def pred(self):
        """Fit logistic regression on the train split and predict the test split.

        Prints the accuracy on the test split and returns the predicted labels.
        """
        # Reuse the shared helper so this method and external callers of
        # get_train_and_test() are guaranteed to see the same split.
        X_train, X_test, y_train, y_test = self.get_train_and_test()
        logreg = LogisticRegression()
        logreg.fit(X_train, y_train)
        y_pred = logreg.predict(X_test)
        print('accuracy_score = {0}'.format(accuracy_score(y_test, y_pred)))
        return y_pred

In [3]:
# Build the classifier: the constructor reads every review, stems it and
# fits the CountVectorizer, so all of the preprocessing happens in this cell.
obj = LwClassifier()

In [4]:
# Train logistic regression on the 70% split and evaluate on the held-out 30%.
# The accuracy is printed; the returned predictions are shown as the cell output.
obj.pred()

accuracy_score = 0.8566666666666667


array([0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       0, 1,

In [5]:
# Re-create the split used inside pred() (same test_size and random_state),
# so the models below are scored on the same held-out reviews.
X_train, X_test, y_train, y_test = obj.get_train_and_test()

Сравним с KNeighborsClassifier:

In [6]:
# Baseline 1: k-nearest neighbours (k = 5) on the same term-count features.
# fit() returns the fitted estimator, so construction and training are chained.
clf = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('accuracy_score = {0}'.format(accuracy_score(y_test, y_pred)))

accuracy_score = 0.6116666666666667


Сравним с MultinomialNB:

In [7]:
# Baseline 2: multinomial naive Bayes on the same split.
# Note: despite the variable name `gnb`, this is MultinomialNB, not a Gaussian model.
gnb = MultinomialNB().fit(X_train, y_train)
y_pred = gnb.predict(X_test)
print('accuracy_score = {0}'.format(accuracy_score(y_test, y_pred)))

accuracy_score = 0.8166666666666667
