In [25]:
from nltk.corpus import movie_reviews
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score, accuracy_score, make_scorer
from sklearn.model_selection import cross_val_score
import nltk
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Strix\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [26]:
# Pair every review's token list with the category it is filed under,
# walking the corpus one category at a time.
documents = [
    (list(movie_reviews.words(review_id)), label)
    for label in movie_reviews.categories()
    for review_id in movie_reviews.fileids(label)
]

In [27]:
# Working frame: one row per review holding its token list and its label.
data = pd.DataFrame(documents, columns=['text', 'category'])
# Re-join the tokens with single spaces so the vectorizers receive plain strings.
data['joined_text'] = data['text'].apply(' '.join)

# Encode the labels as integers (neg -> 0, pos -> 1).  `map` produces an
# integer column directly; `replace` on an object column depends on implicit
# downcasting, which pandas 2.2+ deprecates (FutureWarning) and which would
# otherwise leave object-dtype labels behind.
data['category'] = data['category'].map({'neg': 0, 'pos': 1})
# `map` turns any unexpected label into NaN -- fail loudly instead of training on it.
assert data['category'].notna().all(), 'unexpected category label in corpus'

print(data.shape)
data.head()

(2000, 3)


Unnamed: 0,text,category,joined_text
0,"[plot, :, two, teen, couples, go, to, a, churc...",0,"plot : two teen couples go to a church party ,..."
1,"[the, happy, bastard, ', s, quick, movie, revi...",0,the happy bastard ' s quick movie review damn ...
2,"[it, is, movies, like, these, that, make, a, j...",0,it is movies like these that make a jaded movi...
3,"["", quest, for, camelot, "", is, warner, bros, ...",0,""" quest for camelot "" is warner bros . ' first..."
4,"[synopsis, :, a, mentally, unstable, man, unde...",0,synopsis : a mentally unstable man undergoing ...


In [28]:
# Share of reviews labelled positive (category == 1) among all reviews.
positive_share = len(data[data.category == 1]) / len(data)
print('Доля позитивных отзывов - ', positive_share)

Доля позитивных отзывов -  0.5


In [29]:
# Fit a default bag-of-words vectorizer on the joined review texts and keep
# the resulting matrix around for inspection.
review_texts = data['joined_text']
transformer = CountVectorizer()
transformed_data = transformer.fit_transform(review_texts)

In [30]:
# Dimensions of the vectorized corpus: (number of reviews, number of extracted terms).
transformed_data.shape

(2000, 39659)

### CountVectorizer + LogisticRegression

In [31]:
# Baseline model: raw token counts fed into a logistic regression.
count_text_pipeline = make_pipeline(
    CountVectorizer(),
    LogisticRegression(max_iter=1000, random_state=0),
)

In [32]:
# 5-fold cross-validated accuracy of `count_text_pipeline`.
accuracy_cv_score = cross_val_score(
    count_text_pipeline,
    data['joined_text'],
    data['category'],
    scoring='accuracy',
    cv=5,
)
print(
    'Mean accuracy - ', accuracy_cv_score.mean(),
    ' with std - ', accuracy_cv_score.std(),
)

Mean accuracy -  0.8445  with std -  0.024617067250182322


In [33]:
# 5-fold cross-validated ROC AUC of `count_text_pipeline`.
roc_auc_cv_score = cross_val_score(
    count_text_pipeline,
    data['joined_text'],
    data['category'],
    scoring='roc_auc',
    cv=5,
)
print(
    'Mean roc_auc - ', roc_auc_cv_score.mean(),
    ' with std - ', roc_auc_cv_score.std(),
)

Mean roc_auc -  0.9162899999999998  with std -  0.011492177774469012


### TfidfVectorizer + LogisticRegression

In [34]:
# Same classifier as the baseline, but on TF-IDF weighted features instead of raw counts.
tfidf_text_pipeline = make_pipeline(
    TfidfVectorizer(),
    LogisticRegression(max_iter=1000, random_state=0),
)

In [35]:
# 5-fold cross-validated accuracy of `tfidf_text_pipeline`.
accuracy_cv_score = cross_val_score(
    tfidf_text_pipeline,
    data['joined_text'],
    data['category'],
    scoring='accuracy',
    cv=5,
)
print(
    'Mean accuracy - ', accuracy_cv_score.mean(),
    ' with std - ', accuracy_cv_score.std(),
)

Mean accuracy -  0.8205  with std -  0.003999999999999995


In [36]:
# 5-fold cross-validated ROC AUC of `tfidf_text_pipeline`.
roc_auc_cv_score = cross_val_score(
    tfidf_text_pipeline,
    data['joined_text'],
    data['category'],
    scoring='roc_auc',
    cv=5,
)
print(
    'Mean roc_auc - ', roc_auc_cv_score.mean(),
    ' with std - ', roc_auc_cv_score.std(),
)

Mean roc_auc -  0.9055949999999999  with std -  0.008439096515622961


### CountVectorizer with different min_df + LogisticRegression

In [37]:
# Count features restricted by min_df=10 (document-frequency cut-off for rare terms),
# followed by logistic regression.
count_text_pipeline = make_pipeline(
    CountVectorizer(min_df=10),
    LogisticRegression(max_iter=1000, random_state=0),
)

In [38]:
# 5-fold cross-validated accuracy of the min_df=10 pipeline.
accuracy_cv_score = cross_val_score(
    count_text_pipeline,
    data['joined_text'],
    data['category'],
    scoring='accuracy',
    cv=5,
)
print(
    'Mean accuracy - ', accuracy_cv_score.mean(),
    ' with std - ', accuracy_cv_score.std(),
)

Mean accuracy -  0.836  with std -  0.024423349483639615


In [39]:
# 5-fold cross-validated ROC AUC of the min_df=10 pipeline.
roc_auc_cv_score = cross_val_score(
    count_text_pipeline,
    data['joined_text'],
    data['category'],
    scoring='roc_auc',
    cv=5,
)
print(
    'Mean roc_auc - ', roc_auc_cv_score.mean(),
    ' with std - ', roc_auc_cv_score.std(),
)

Mean roc_auc -  0.913485  with std -  0.010770220517705277


In [40]:
# A stricter document-frequency cut-off (min_df=50) with the same classifier.
count_text_pipeline = make_pipeline(
    CountVectorizer(min_df=50),
    LogisticRegression(max_iter=1000, random_state=0),
)

In [41]:
# 5-fold cross-validated accuracy of the min_df=50 pipeline.
accuracy_cv_score = cross_val_score(
    count_text_pipeline,
    data['joined_text'],
    data['category'],
    scoring='accuracy',
    cv=5,
)
print(
    'Mean accuracy - ', accuracy_cv_score.mean(),
    ' with std - ', accuracy_cv_score.std(),
)

Mean accuracy -  0.8095000000000001  with std -  0.016837458240482737


In [42]:
# 5-fold cross-validated ROC AUC of the min_df=50 pipeline.
roc_auc_cv_score = cross_val_score(
    count_text_pipeline,
    data['joined_text'],
    data['category'],
    scoring='roc_auc',
    cv=5,
)
print(
    'Mean roc_auc - ', roc_auc_cv_score.mean(),
    ' with std - ', roc_auc_cv_score.std(),
)

Mean roc_auc -  0.8947899999999999  with std -  0.010612134563790693


### Different classifiers with CountVectorizer

In [46]:
# Default count features with an SGD-trained linear classifier instead of logistic regression.
count_text_pipeline = make_pipeline(
    CountVectorizer(),
    SGDClassifier(max_iter=1000, random_state=0),
)

In [47]:
# 5-fold cross-validated accuracy of the SGDClassifier pipeline.
accuracy_cv_score = cross_val_score(
    count_text_pipeline,
    data['joined_text'],
    data['category'],
    scoring='accuracy',
    cv=5,
)
print(
    'Mean accuracy - ', accuracy_cv_score.mean(),
    ' with std - ', accuracy_cv_score.std(),
)

Mean accuracy -  0.8300000000000001  with std -  0.017464249196573002


In [48]:
# 5-fold cross-validated ROC AUC of the SGDClassifier pipeline.
roc_auc_cv_score = cross_val_score(
    count_text_pipeline,
    data['joined_text'],
    data['category'],
    scoring='roc_auc',
    cv=5,
)
print(
    'Mean roc_auc - ', roc_auc_cv_score.mean(),
    ' with std - ', roc_auc_cv_score.std(),
)

Mean roc_auc -  0.9107049999999999  with std -  0.013803309385795888


In [57]:
# Default count features with a linear SVM; note the larger iteration budget (10000).
count_text_pipeline = make_pipeline(
    CountVectorizer(),
    LinearSVC(max_iter=10000, random_state=0),
)

In [58]:
# 5-fold cross-validated accuracy of the LinearSVC pipeline.
accuracy_cv_score = cross_val_score(
    count_text_pipeline,
    data['joined_text'],
    data['category'],
    scoring='accuracy',
    cv=5,
)
print(
    'Mean accuracy - ', accuracy_cv_score.mean(),
    ' with std - ', accuracy_cv_score.std(),
)

Mean accuracy -  0.8325000000000001  with std -  0.0162788205960997




In [59]:
# 5-fold cross-validated ROC AUC of the LinearSVC pipeline.
roc_auc_cv_score = cross_val_score(
    count_text_pipeline,
    data['joined_text'],
    data['category'],
    scoring='roc_auc',
    cv=5,
)
print(
    'Mean roc_auc - ', roc_auc_cv_score.mean(),
    ' with std - ', roc_auc_cv_score.std(),
)

Mean roc_auc -  0.907465  with std -  0.01067036316157985




### Add stopwords

In [60]:
# The stopword lists live in their own NLTK corpus, which the notebook never
# downloads (only `movie_reviews` is fetched).  Fetch it here so the lookup
# below does not raise LookupError in a fresh environment; `quiet=True` keeps
# the cell output-free, as before.
nltk.download('stopwords', quiet=True)
eng_stopwords = nltk.corpus.stopwords.words('english')

In [62]:
# Count features with the NLTK English stopword list removed, then logistic regression.
count_text_pipeline = make_pipeline(
    CountVectorizer(stop_words=eng_stopwords),
    LogisticRegression(max_iter=1000, random_state=0),
)

In [63]:
# 5-fold cross-validated accuracy with NLTK stopwords removed.
accuracy_cv_score = cross_val_score(
    count_text_pipeline,
    data['joined_text'],
    data['category'],
    scoring='accuracy',
    cv=5,
)
print(
    'Mean accuracy - ', accuracy_cv_score.mean(),
    ' with std - ', accuracy_cv_score.std(),
)

Mean accuracy -  0.8400000000000001  with std -  0.01129158979063624


In [64]:
# 5-fold cross-validated ROC AUC with NLTK stopwords removed.
roc_auc_cv_score = cross_val_score(
    count_text_pipeline,
    data['joined_text'],
    data['category'],
    scoring='roc_auc',
    cv=5,
)
print(
    'Mean roc_auc - ', roc_auc_cv_score.mean(),
    ' with std - ', roc_auc_cv_score.std(),
)

Mean roc_auc -  0.9231449999999999  with std -  0.009511406310320292


In [65]:
# Same setup, but using scikit-learn's built-in 'english' stopword list
# instead of the NLTK one.
count_text_pipeline = make_pipeline(
    CountVectorizer(stop_words='english'),
    LogisticRegression(max_iter=1000, random_state=0),
)

In [66]:
# 5-fold cross-validated accuracy with the built-in 'english' stopword list.
accuracy_cv_score = cross_val_score(
    count_text_pipeline,
    data['joined_text'],
    data['category'],
    scoring='accuracy',
    cv=5,
)
print(
    'Mean accuracy - ', accuracy_cv_score.mean(),
    ' with std - ', accuracy_cv_score.std(),
)

Mean accuracy -  0.8365  with std -  0.013472193585307468


In [67]:
# 5-fold cross-validated ROC AUC with the built-in 'english' stopword list.
roc_auc_cv_score = cross_val_score(
    count_text_pipeline,
    data['joined_text'],
    data['category'],
    scoring='roc_auc',
    cv=5,
)
print(
    'Mean roc_auc - ', roc_auc_cv_score.mean(),
    ' with std - ', roc_auc_cv_score.std(),
)

Mean roc_auc -  0.9165949999999998  with std -  0.010291469768696798


### Add n-grams

In [68]:
# Word unigrams and bigrams (ngram_range=(1, 2)) as count features, then logistic regression.
count_text_pipeline = make_pipeline(
    CountVectorizer(ngram_range=(1, 2)),
    LogisticRegression(max_iter=1000, random_state=0),
)

In [69]:
# 5-fold cross-validated accuracy of the unigram + bigram pipeline.
accuracy_cv_score = cross_val_score(
    count_text_pipeline,
    data['joined_text'],
    data['category'],
    scoring='accuracy',
    cv=5,
)
print(
    'Mean accuracy - ', accuracy_cv_score.mean(),
    ' with std - ', accuracy_cv_score.std(),
)

Mean accuracy -  0.8525  with std -  0.022416511771459906


In [70]:
# 5-fold cross-validated ROC AUC of the unigram + bigram pipeline.
roc_auc_cv_score = cross_val_score(
    count_text_pipeline,
    data['joined_text'],
    data['category'],
    scoring='roc_auc',
    cv=5,
)
print(
    'Mean roc_auc - ', roc_auc_cv_score.mean(),
    ' with std - ', roc_auc_cv_score.std(),
)

Mean roc_auc -  0.9229050000000001  with std -  0.009370957795230924


In [71]:
# Character n-grams of length 3 to 5 using the 'char_wb' analyzer, then logistic regression.
count_text_pipeline = make_pipeline(
    CountVectorizer(analyzer='char_wb', ngram_range=(3, 5)),
    LogisticRegression(max_iter=1000, random_state=0),
)

In [72]:
# 5-fold cross-validated accuracy of the character n-gram pipeline.
accuracy_cv_score = cross_val_score(
    count_text_pipeline,
    data['joined_text'],
    data['category'],
    scoring='accuracy',
    cv=5,
)
print(
    'Mean accuracy - ', accuracy_cv_score.mean(),
    ' with std - ', accuracy_cv_score.std(),
)

Mean accuracy -  0.819  with std -  0.008888194417315589


In [73]:
# 5-fold cross-validated ROC AUC of the character n-gram pipeline.
roc_auc_cv_score = cross_val_score(
    count_text_pipeline,
    data['joined_text'],
    data['category'],
    scoring='roc_auc',
    cv=5,
)
print(
    'Mean roc_auc - ', roc_auc_cv_score.mean(),
    ' with std - ', roc_auc_cv_score.std(),
)

Mean roc_auc -  0.899955  with std -  0.003491682975299961


### Feature importance

In [89]:
# `text_pipeline` is not defined anywhere earlier in the notebook, and
# `cross_val_score` only fits clones of the estimator it is given, so no fitted
# pipeline exists at this point.  Build and fit one explicitly on the full
# dataset so the cell survives Restart & Run All.
# NOTE(review): a default CountVectorizer + LogisticRegression is assumed to be
# the configuration that produced the recorded output -- confirm.
text_pipeline = make_pipeline(
    CountVectorizer(),
    LogisticRegression(random_state=0, max_iter=1000),
)
text_pipeline.fit(data.joined_text, data.category)

fitted_vectorizer = text_pipeline.named_steps['countvectorizer']
fitted_classifier = text_pipeline.named_steps['logisticregression']

# One row per vocabulary term with the magnitude of its coefficient.  The
# absolute value is used for ranking, so the sign (direction of the effect)
# is not shown in this table.
df_coef = pd.DataFrame({
    'feature': fitted_vectorizer.get_feature_names_out(),
    'coef': np.abs(fitted_classifier.coef_[0]),
})

# Five terms with the largest coefficient magnitude.
df_coef.sort_values(by='coef', ascending=False).head(5)

Unnamed: 0,feature,coef
2954,bad,0.792851
37056,unfortunately,0.628502
39195,worst,0.575192
14159,fun,0.561055
24063,nothing,0.518571
