In [1]:
import pandas as pd
import numpy as np

In [2]:
from nltk.corpus import movie_reviews 
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import RegexpTokenizer
import random

In [3]:
fn='rt-polarity.neg'
with open(fn, "r",encoding='utf-8', errors='ignore') as f: # some invalid symbols encountered 
    content = f.read()  
texts_neg=  content.splitlines()
print ('len of texts_neg = {:,}'.format (len(texts_neg)))
for review in texts_neg[:5]:
    print ( '\n', review)

len of texts_neg = 5,331

 simplistic , silly and tedious . 

 it's so laddish and juvenile , only teenage boys could possibly find it funny . 

 exploitative and largely devoid of the depth or sophistication that would make watching such a graphic treatment of the crimes bearable . 

 [garbus] discards the potential for pathological study , exhuming instead , the skewed melodrama of the circumstantial situation . 

 a visually flashy but narratively opaque and emotionally vapid exercise in style and mystification . 


In [4]:
fn='rt-polarity.pos'

with open(fn, "r",encoding='utf-8', errors='ignore') as f:
    content = f.read()
texts_pos=  content.splitlines()
print ('len of texts_pos = {:,}'.format (len(texts_pos)))
for review in texts_pos[:5]:
    print ('\n', review)

len of texts_pos = 5,331

 the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . 

 the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth . 

 effective but too-tepid biopic

 if you sometimes like to go to the movies to have fun , wasabi is a good place to start . 

 emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one . 


In [19]:
positive_df = pd.DataFrame({'text':texts_pos, 'label':np.ones(len(texts_pos), dtype=int)})
negative_df = pd.DataFrame({'text':texts_neg, 'label':np.zeros(len(texts_neg), dtype=int)})
general_df = pd.concat([positive_df, negative_df], ignore_index=True)

In [24]:
general_df

Unnamed: 0,text,label
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1
...,...,...
10657,a terrible movie that some people will neverth...,0
10658,there are many definitions of 'time waster' bu...,0
10659,"as it stands , crocodile hunter has the hurrie...",0
10660,the thing looks like a made-for-home-video qui...,0


In [25]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score


X_train, X_test, y_train, y_test = train_test_split(general_df['text'], general_df['label'], random_state=0)

In [33]:
vect = CountVectorizer(min_df=5, max_features=50000, ngram_range=(1,2)).fit(X_train) # Fit the CountVectorizer to the training data
print('features samples:\n{}'.format(vect.get_feature_names()[::2000])) # display each 2000-th feature 
print ('\nlen of features {:,}'.format(len(vect.get_feature_names())))

features samples:
['10', 'filmmaking', 'on an', 'tour de']

len of features 6,637


In [34]:
X_train_vectorized = vect.transform(X_train) # indeces of existing words from vocabulary and their count in current text
X_train_vectorized

<7996x6637 sparse matrix of type '<class 'numpy.int64'>'
	with 153101 stored elements in Compressed Sparse Row format>

In [36]:
clf = LogisticRegression(max_iter=2000).fit(X_train_vectorized, y_train) # Train the model

In [37]:
predictions = clf.predict(vect.transform(X_test)) # Predict the transformed test documents
print('f1: ', f1_score(y_test, predictions)) 
scores = clf.decision_function(vect.transform(X_test)) 
print('AUC: ', roc_auc_score(y_test, scores))

f1:  0.7507507507507508
AUC:  0.8297838556432501


In [38]:
feature_names = np.array(vect.get_feature_names())
sorted_coef_index = clf.coef_[0].argsort() # ascending  [0] is just squeeze from shape (1,n)
clf.coef_.shape, clf.coef_[0].shape, sorted(clf.coef_[0])[:10], sorted(clf.coef_[0])[-11:-1]

((1, 6637),
 (6637,),
 [-2.126507615515334,
  -1.7118468955443682,
  -1.6742039220589116,
  -1.6248761513916739,
  -1.6079779191482997,
  -1.6065869617048836,
  -1.5971901010318146,
  -1.539170632535987,
  -1.5390690413009651,
  -1.5204754204213107],
 [1.3923600940593244,
  1.399524092660697,
  1.4250090757216813,
  1.4345771883472216,
  1.4421737057980855,
  1.4522535625679653,
  1.4794854216795585,
  1.5392297920725193,
  1.5534451510195055,
  1.6044960938854536])

In [39]:
print('Smallest coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))
# model.coef_[0][sorted_coef_index[0]] the smallest 

Smallest coefs:
['dull' 'badly' 'boring' 'tedious' 'incoherent' 'disappointment' 'lacks'
 'tv' 'mediocre' 'stupid']

Largest Coefs: 
['better than' 'masterpiece' 'powerful' 'remarkable' 'works' 'solid'
 'enjoyable' 'warm' 'engrossing' 'unexpected']
