In [11]:
import pandas as pd
import numpy as np

import sklearn.model_selection as sklearn_model_selection
import sklearn.linear_model as sklearn_linear
import sklearn.feature_extraction.text as sklearn_text
import sklearn.metrics as sklearn_metrics

In [6]:
# Load the prepared IMDB review table from the working directory.
imdb_reviews = pd.read_csv("imdb_dataset_prepared.csv")

# Model input is taken from the "review" column, the target from "sentiment".
X, y = imdb_reviews["review"], imdb_reviews["sentiment"]

In [7]:
# Turn the review text into a sparse TF-IDF matrix.
print("vectorisation started")
# min_df=2 drops terms that occur in fewer than two reviews, max_df=0.5 drops
# terms that occur in more than half of them, and ngram_range=(1, 2) keeps
# both single words and two-word phrases as features.
tfidf = sklearn_text.TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2))

# Read the text from the DataFrame rather than from `X`: this cell rebinds `X`
# to the vectorised matrix, so vectorising `X` itself would fail as soon as
# the cell is executed a second time in the same kernel session.
X = tfidf.fit_transform(imdb_reviews["review"])
print("vectorisation finished")
print(X)
print(X.shape)

vectorisation started
vectorisation finished
  (0, 151692)	0.05746945406236768
  (0, 695611)	0.04099650902199992
  (0, 637966)	0.049319306965853855
  (0, 303056)	0.053272583456377906
  (0, 242752)	0.04094095228547884
  (0, 108858)	0.03752661699942309
  (0, 705966)	0.024901589802556162
  (0, 294645)	0.018648834207459476
  (0, 319913)	0.06691359288348196
  (0, 679242)	0.029924801945832673
  (0, 695508)	0.04135906705930563
  (0, 131335)	0.05689758557971741
  (0, 375359)	0.06209357594610185
  (0, 706456)	0.03730443150034943
  (0, 453177)	0.07255265120589836
  (0, 671922)	0.07255265120589836
  (0, 201598)	0.06773263426851825
  (0, 444267)	0.07455313898207028
  (0, 543120)	0.06249104495492012
  (0, 429943)	0.05727355900872175
  (0, 341905)	0.03092397425308388
  (0, 609624)	0.05689758557971741
  (0, 633282)	0.03521432097235336
  (0, 311611)	0.06491310510731005
  (0, 644479)	0.04341856917438987
  :	:
  (49999, 672487)	0.02966945214110952
  (49999, 700605)	0.08627131236159445
  (49999, 103257)	

In [14]:
# Train a logistic-regression sentiment classifier on an 80/20 split of the
# TF-IDF features and report accuracy plus the confusion matrix on the
# held-out 20 %.
print("logistic regression training started")

# Fixed seed so the shuffled split -- and therefore every metric printed
# below -- is identical on each run.
SPLIT_SEED = 42

# NOTE(review): `X` was produced by fitting the TF-IDF vectoriser on the whole
# dataset before this split, so the vocabulary and IDF weights have already
# seen the test reviews. The reported accuracy is likely slightly optimistic;
# consider splitting the raw text first and fitting the vectoriser on the
# training part only.
X_train, X_test, y_train, y_test = sklearn_model_selection.train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=SPLIT_SEED
)

# Allow up to 1000 solver iterations.
logistic_model = sklearn_linear.LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)

print("logistic regression training finished")

y_predicted = logistic_model.predict(X_test)

print("Accuracy: {:.2f} %".format(sklearn_metrics.accuracy_score(y_test, y_predicted) * 100))
# Rows are the true labels, columns are the predicted labels.
print(sklearn_metrics.confusion_matrix(y_test, y_predicted))

logistic regression training started
logistic regression training finished
Accuancy: 89.97 %
[[4420  591]
 [ 412 4577]]


In [15]:
# List the n-grams with the most negative and the most positive
# logistic-regression coefficients.
top_phrase_count = 20

# TfidfVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer its replacement and fall back to the old method only
# when running on a release that predates get_feature_names_out().
_get_feature_names = getattr(tfidf, "get_feature_names_out", None) or tfidf.get_feature_names
tfidf_feature_names = _get_feature_names()

# Sort the coefficients once; indexes are in ascending coefficient order.
# coef_[0] holds the weights in favour of logistic_model.classes_[1]; the
# "Negative"/"Positive" labels below assume that class is the positive
# sentiment -- TODO confirm against logistic_model.classes_.
coefficient_order = np.argsort(logistic_model.coef_[0])

print("Negative:")
# Lowest coefficients first, i.e. strongest negative phrase first.
top_negative_phrases_indexes = coefficient_order[:top_phrase_count]
top_negative_phrases = [tfidf_feature_names[z] for z in top_negative_phrases_indexes]
print(top_negative_phrases)

print("Positive:")
# Reverse before slicing so the strongest positive phrase comes first,
# matching the strongest-first ordering of the negative list.
top_positive_phrases_indexes = coefficient_order[::-1][:top_phrase_count]
top_positive_phrases = [tfidf_feature_names[z] for z in top_positive_phrases_indexes]
print(top_positive_phrases)

Negative:
