In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import movie_reviews, stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.svm import LinearSVC

import warnings
warnings.filterwarnings("ignore")

In [2]:
# import nltk
# nltk.download('movie_reviews')
# nltk.download("stopwords")

def write_ans(file, numb):
    with open("answer " + str(numb) + ".txt", "w+") as f:
        f.write(file)

In [3]:
neg_id = movie_reviews.fileids('neg')
pos_id = movie_reviews.fileids('pos')

In [4]:
neg_rev = [" ".join(list(movie_reviews.words(fileids=[f]))) for f in neg_id]
pos_rev = [" ".join(list(movie_reviews.words(fileids=[f]))) for f in pos_id]
sum_rev = neg_rev + pos_rev

In [5]:
print("Count of neg and pos reviews in movie reviews corpus:")
print("neg",len(neg_rev))
print("pos",len(pos_rev))
print("all", 2000)

Count of neg and pos reviews in movie reviews corpus:
neg 1000
pos 1000
all 2000


In [6]:
c = CountVectorizer()
X_train = c.fit_transform(sum_rev)
y_train = [0 for i in range(len(neg_rev))] + [1 for i in  range(len(pos_rev))]

In [7]:
print("Number of features(words)", len(c.get_feature_names()))

Number of features(words) 39659


In [8]:
model = LogisticRegression(random_state=42)
count_vec = CountVectorizer()
log_countvec = Pipeline([("count", count_vec),("log",model)])

In [10]:
acc_score = cross_val_score(log_countvec, sum_rev, y_train, scoring='accuracy')
roc_auc_score = cross_val_score(log_countvec, sum_rev, y_train, scoring='roc_auc')

In [11]:
np.mean(acc_score)

0.8360216503929078

In [12]:
np.mean(roc_auc_score)

0.9107825058014015

In [13]:
model_ = LogisticRegression(random_state=1)
coef = model_.fit(X_train, y_train).coef_.tolist()[0]

In [14]:
most_valuable_features = [coef.index(i) for i in sorted(coef)[:2]]
features = np.array(c.get_feature_names())
answer = " ".join(features[most_valuable_features].tolist())
answer

'bad unfortunately'

In [None]:
# write_ans(str(len(sum_rew))," rew_full")
# write_ans(str(len(pos_rew)/len(sum_rew)), " pos_coef")
# write_ans(str(len(c.get_feature_names())), " numb_features")
# write_ans(str(np.mean(acc_score)), " acc_score")
# write_ans(str(np.mean(roc_auc_score)), " roc_auc_score")
# write_ans(answer, " most_val")

**Hyperparameter Optimization**

In [15]:
cv = 5

acc_score = cross_val_score(log_countvec, sum_rev, y_train, scoring='accuracy', cv=cv)

a_mean = np.mean(acc_score)
a_std = np.std(acc_score)
print("mean,std accuracy_score of CountVectorizer_LogReg 5 fold cross val: ",a_mean, a_std)


log_tfidf = Pipeline([("count", TfidfVectorizer()), ("log", model)])
cv_logreg = cross_val_score(log_countvec, sum_rev, y_train, scoring="accuracy", cv=cv)

b_mean = np.mean(cv_logreg)
b_std = np.std(cv_logreg)
print("mean,std accuracy_score of TfidfVectorizer_LogReg 5 fold cross val: ",b_mean, b_std)

# write_ans(" ".join([str(i) for i in [a_mean, a_std, b_mean, b_std]]), "_ab_mean_std")

mean,std accuracy_score of CountVectorizer_LogReg 5 fold cross val:  0.8415000000000001 0.01677796173556255
mean,std accuracy_score of TfidfVectorizer_LogReg 5 fold cross val:  0.8415000000000001 0.01677796173556255


In [17]:
# Try to use min_df in CountVectorizer
min_df = [10,50]
results = list()

for min_df_ in min_df:
    pipe = Pipeline([("count", CountVectorizer(min_df=min_df_)), ("log", model)])
    results.append(cross_val_score(pipe, sum_rev, y_train, scoring="accuracy", cv=cv))
    
# write_ans(str(np.mean(results[0])) + " " + str(np.mean(results[1])), "min_df")
print(results)

[array([0.82  , 0.85  , 0.8325, 0.8525, 0.84  ]), array([0.7925, 0.825 , 0.8025, 0.8175, 0.8275])]


In [18]:
# finding worst model
models = [LogisticRegression(random_state=42), LinearSVC(random_state=42), 
          SGDClassifier(random_state=42)]

model_results = list()
for mod in models:
    pipe = Pipeline([("count", CountVectorizer()), ("mod", mod)])
    model_results.append(cross_val_score(pipe, sum_rev, y_train, scoring="accuracy", cv=cv))

In [19]:
model_results = [np.mean(i) for i in model_results]
print(model_results)
# write_ans(str(min(model_results)), "worst_model")

[0.8415000000000001, 0.8325000000000001, 0.74]


In [20]:
# Same models w/o stopwords in dictionary


mod = [stopwords.words("english"), "english"]

model_results_stopwords = list()
for sw in mod:
    pipe = Pipeline([("count", CountVectorizer(stop_words=sw)), ("mod", LogisticRegression())])
    model_results_stopwords.append(cross_val_score(pipe, sum_rev, y_train, scoring="accuracy", cv=cv))
model_results_stopwords = [np.mean(i) for i in model_results_stopwords]
model_results_stopwords    

[0.841, 0.8390000000000001]

Now all of the models passed barrier of .80;   
What's a good and predictible result - stopwords stoils dict

In [21]:
# added bigrams to CountVectorizer

model_results_bigram = list()
model_results_bigram.append(cross_val_score(Pipeline([("count", CountVectorizer(stop_words=mod[0], ngram_range=(1,2))), ("mod", LogisticRegression(random_state=42))]), sum_rev, y_train, scoring="accuracy", cv=cv))
model_results_bigram.append(cross_val_score(Pipeline([("count", CountVectorizer(stop_words=mod[0], ngram_range=(3,5), analyzer='char_wb')), ("mod", LogisticRegression(random_state=42))]), sum_rev, y_train, scoring="accuracy", cv=cv))

In [22]:
model_results_bigram

[array([0.8225, 0.8375, 0.8325, 0.865 , 0.86  ]),
 array([0.83  , 0.835 , 0.8125, 0.8075, 0.815 ])]

In [None]:
# write_ans(str(np.mean(model_results_bigram[0])) + " " + str(np.mean(model_results_bigram[1])), "ngramm")