# <h1 align= 'center'>Base Models</h1>

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier

from scipy.stats import uniform
from pprint import pprint


  import pandas.util.testing as tm


In [2]:
# for importing data to colab
from google.colab import drive 
drive.mount('/content/drive')

# read from local dataset
# movies = pd.read_csv('data/imdb_data.csv')
# movies.sample(7)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## <h2> <center>Load preprocessed data</center></h2>

In [3]:
movies = pd.read_csv('/content/drive/My Drive/Colab Notebooks/preprocessed_data.csv')
movies.sample(7)

Unnamed: 0.1,Unnamed: 0,review,sentiment
49805,49805,saw last night someon low expect begin still d...,0
44227,44227,german stand open get mow machin gun good guy ...,0
23806,23806,came across insomniac nightmar look offbeat i...,1
39261,39261,sublim way reach beauti sublim stuff made best...,1
11306,11306,fair good romant comedi think ever seen meg lo...,1
12772,12772,watch seagal expect good action expect fight l...,0
30454,30454,like make clear religi atheist could see richa...,0


## <h2> <center>Text feature extraction</center></h2>

In [4]:
# todo: normalize if overfit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif


top_k = 10000

tfidf = TfidfVectorizer(
                    ngram_range = (1,2),
                    dtype = 'int32',
                    decode_error = 'replace',
                    analyzer = 'word',
                    min_df = 2,
                    norm = 'l2'
                    )
tfidf_reviews = tfidf.fit_transform(movies['review'])

# Select best 10,000(k) features, with feature importance measured by f_classif
selector = SelectKBest(f_classif, k = min(top_k, tfidf_reviews.shape[1]))
selector.fit(tfidf_reviews, movies['sentiment'])
vect_reviews = selector.transform(tfidf_reviews).astype('float32')

vect_reviews.shape



(50000, 10000)

## <h2> <center>Split the data</center></h2>

In [5]:
X = vect_reviews.toarray()
y = (np.array(movies['sentiment']))

# Here we split data to training and testing parts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)
print(f"Train dataset shape: {X_train.shape}, \nTest dataset shape: {X_test.shape}")

Train dataset shape: (40000, 10000), 
Test dataset shape: (10000, 10000)


## <h2> <center>Modeling the data</center></h2>

### <h2> <center>Base Logistic Regression model</center></h2>

In [9]:
lr=LogisticRegression(max_iter=500 ,random_state=7, 
                      C = 3.4752058328312345, penalty = 'l2') # found from Random Search 
lr.fit(X_train, y_train)

LogisticRegression(C=3.4752058328312345, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=500, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=7, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
"""
save the model
file =  open('lrcv_model.sav', 'wb')
pickle.dump(lrcv, file)
file.close()
"""

"\nsave the model\nfile =  open('lrcv_model.sav', 'wb')\npickle.dump(lrcv, file)\nfile.close()\n"

In [0]:
"""
lr_clf = pickle.load(open('lrcv_model.sav', 'rb'))


pred = lr_clf.predict(Xtest)
print("Accuracy: ", accuracy_score(pred, ytest)*100)

print("Confusion Matrix:")
pd.DataFrame(confusion_matrix(pred, ytest))
"""

'\nlr_clf = pickle.load(open(\'lrcv_model.sav\', \'rb\'))\n\n\npred = lr_clf.predict(Xtest)\nprint("Accuracy: ", accuracy_score(pred, ytest)*100)\n\nprint("Confusion Matrix:")\npd.DataFrame(confusion_matrix(pred, ytest))\n'

In [10]:
pred = lr.predict(X_test)
pred_train = lr.predict(X_train)
print("Training set Accuracy: ", accuracy_score(pred_train, y_train)*100)
print("Test set Accuracy: ", accuracy_score(pred, y_test)*100)

print("Confusion Matrix:")
pd.DataFrame(confusion_matrix(pred, y_test))

Training set Accuracy:  91.55250000000001
Test set Accuracy:  89.96
Confusion Matrix:


Unnamed: 0,0,1
0,4436,455
1,549,4560


In [11]:
penalty = ['l1', 'l2']
# Create regularization hyperparameter distribution using uniform distribution
C = uniform(loc=0, scale=4)
logistic = LogisticRegression()
# Create hyperparameter options
hyperparameters = dict(C=C,penalty=penalty)
pprint(hyperparameters)
lr_best = RandomizedSearchCV(logistic, hyperparameters, random_state=7, n_iter=3, scoring = 'accuracy', cv=2, verbose=2, refit=True, return_train_score = True)

{'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fa4e84d75c0>,
 'penalty': ['l1', 'l2']}


In [12]:
# lr_best.fit(X_train, y_train)

Fitting 2 folds for each of 10 candidates, totalling 20 fits
[CV] C=0.3052331574958287, penalty=l2 ................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................. C=0.3052331574958287, penalty=l2, total=  13.8s
[CV] C=0.3052331574958287, penalty=l2 ................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   14.7s remaining:    0.0s


KeyboardInterrupt: ignored

In [0]:
lr_best_penalty = best_lr.best_estimator_.get_params()['penalty']
lr_best_c = best_lr.best_estimator_.get_params()['C']
print(f'Best Penalty: {lr_best_penalty}')
print(f'Best C: {lr_best_c}')

lr_best.best_params_

Best Penalty: l2
Best C: 3.4752058328312345


In [0]:
pred = lr_best.predict(X_test)
pred_train = lr_best.predict(X_train)
print("Training set Accuracy: ", accuracy_score(pred_train, y_train)*100)
print("Test set Accuracy: ", accuracy_score(pred, y_test)*100)

print("Confusion Matrix:")
pd.DataFrame(confusion_matrix(pred, y_test))

Training set Accuracy:  91.55250000000001
Test set Accuracy:  89.96
Confusion Matrix:


Unnamed: 0,0,1
0,4436,455
1,549,4560


### <h2> <center>SVM Classifier</center></h2>

In [5]:
svm = SGDClassifier(random_state = 7, penalty = 'l2', loss = 'hinge')

svm.fit(X_train,y_train)



SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='squared_hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=7, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [6]:
pred = svm.predict(X_test)
pred_train = svm.predict(X_train)
print("Training set Accuracy: ", accuracy_score(pred_train, y_train)*100)
print("Test set Accuracy: ", accuracy_score(pred, y_test)*100)

print("Confusion Matrix:")
pd.DataFrame(confusion_matrix(pred, y_test))

Training set Accuracy:  89.99000000000001
Test set Accuracy:  87.97
Confusion Matrix:


Unnamed: 0,0,1
0,4376,594
1,609,4421


### <h2> <center>Multinomial Bayes Classifier</center></h2>

In [15]:
#training the model
mnb = MultinomialNB(alpha = 0.001) # best alpha found from Grid Search
#fitting the svm for bag of words
mnb.fit(X_train, y_train)

MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True)

In [16]:
pred = mnb.predict(X_test)
pred_train = mnb.predict(X_train)
print("Training set Accuracy: ", accuracy_score(pred_train, y_train)*100)
print("Test set Accuracy: ", accuracy_score(pred, y_test)*100)

print("Confusion Matrix:")
pd.DataFrame(confusion_matrix(pred, y_test))

Training set Accuracy:  91.1825
Test set Accuracy:  90.25999999999999
Confusion Matrix:


Unnamed: 0,0,1
0,4458,447
1,527,4568


In [17]:
alpha =  [10, 1, 0.1, 0.001, 0.0001, 0.00001]
hyperparam = dict(alpha = alpha)
mnb_best = GridSearchCV(mnb, hyperparam, scoring = 'accuracy', cv =2, refit = True, verbose = 3, return_train_score = True)
pprint(hyperparam)

{'alpha': [10, 1, 0.1, 0.001, 0.0001, 1e-05]}


In [18]:
mnb_best.fit(X_train, y_train)

Fitting 2 folds for each of 6 candidates, totalling 12 fits
[CV] alpha=10 ........................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ........ alpha=10, score=(train=0.880, test=0.869), total=   5.8s
[CV] alpha=10 ........................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.0s remaining:    0.0s


[CV] ........ alpha=10, score=(train=0.876, test=0.872), total=   3.2s
[CV] alpha=1 .........................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   11.4s remaining:    0.0s


[CV] ......... alpha=1, score=(train=0.899, test=0.882), total=   3.1s
[CV] alpha=1 .........................................................
[CV] ......... alpha=1, score=(train=0.894, test=0.886), total=   3.1s
[CV] alpha=0.1 .......................................................
[CV] ....... alpha=0.1, score=(train=0.911, test=0.893), total=   3.2s
[CV] alpha=0.1 .......................................................
[CV] ....... alpha=0.1, score=(train=0.908, test=0.895), total=   3.1s
[CV] alpha=0.001 .....................................................
[CV] ..... alpha=0.001, score=(train=0.921, test=0.898), total=   3.1s
[CV] alpha=0.001 .....................................................
[CV] ..... alpha=0.001, score=(train=0.919, test=0.899), total=   3.2s
[CV] alpha=0.0001 ....................................................
[CV] .... alpha=0.0001, score=(train=0.923, test=0.897), total=   3.3s
[CV] alpha=0.0001 ....................................................
[CV] .

[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:   54.2s finished


GridSearchCV(cv=2, error_score=nan,
             estimator=MultinomialNB(alpha=0.001, class_prior=None,
                                     fit_prior=True),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [10, 1, 0.1, 0.001, 0.0001, 1e-05]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='accuracy', verbose=3)

In [19]:
mnb_best_alpha = mnb_best.best_estimator_.get_params()['alpha']
print(f'Best alpha: {mnb_best_alpha}')
mnb_best.best_params_

Best alpha: 0.001


{'alpha': 0.001}

In [20]:
pred = mnb_best.predict(X_test)
pred_train = mnb_best.predict(X_train)
print("Training set Accuracy: ", accuracy_score(pred_train, y_train)*100)
print("Test set Accuracy: ", accuracy_score(pred, y_test)*100)

print("Confusion Matrix:")
pd.DataFrame(confusion_matrix(pred, y_test))

Training set Accuracy:  91.1825
Test set Accuracy:  90.25999999999999
Confusion Matrix:


Unnamed: 0,0,1
0,4458,447
1,527,4568


### <h2> <center>Random Forest Classifier</center></h2>

In [21]:
# takes a lot of time ...
rf = RandomForestClassifier(max_depth = 100, 
                            n_estimators = 500,
                            min_samples_leaf = 5,
                            min_samples_split = 2,
                            max_features = 'sqrt',
                            random_state = 7)  # found from Random Search

rf.fit(X_train, y_train)

KeyboardInterrupt: ignored

In [0]:
pred = rf.predict(X_test)
pred_train = rf.predict(X_train)
print("Training set Accuracy: ", accuracy_score(pred_train, y_train)*100)
print("Test set Accuracy: ", accuracy_score(pred, y_test)*100)

print("Confusion Matrix:")
pd.DataFrame(confusion_matrix(pred, y_test))

In [12]:
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 3)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 100, num = 3)]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

hyperparam = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               }
pprint(hyperparam)
rf_temp = RandomForestClassifier()
rf_random = RandomizedSearchCV(rf_temp, hyperparam, cv = 2, n_iter = 10, verbose = 2,
                        refit = False, return_train_score = True)

{'max_depth': [10, 55, 100],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [100, 550, 1000]}


In [9]:
# takes a lot of time ...
# rf_random.fit(X_train, y_train)

Fitting 2 folds for each of 6 candidates, totalling 12 fits
[CV] max_depth=100, max_features=sqrt, min_samples_leaf=5, min_samples_split=2, n_estimators=500 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  max_depth=100, max_features=sqrt, min_samples_leaf=5, min_samples_split=2, n_estimators=500, total=  14.0s
[CV] max_depth=100, max_features=sqrt, min_samples_leaf=5, min_samples_split=2, n_estimators=500 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   14.4s remaining:    0.0s


[CV]  max_depth=100, max_features=sqrt, min_samples_leaf=5, min_samples_split=2, n_estimators=500, total=  14.0s
[CV] max_depth=100, max_features=sqrt, min_samples_leaf=5, min_samples_split=2, n_estimators=700 
[CV]  max_depth=100, max_features=sqrt, min_samples_leaf=5, min_samples_split=2, n_estimators=700, total=  20.2s
[CV] max_depth=100, max_features=sqrt, min_samples_leaf=5, min_samples_split=2, n_estimators=700 
[CV]  max_depth=100, max_features=sqrt, min_samples_leaf=5, min_samples_split=2, n_estimators=700, total=  19.6s
[CV] max_depth=100, max_features=sqrt, min_samples_leaf=5, min_samples_split=2, n_estimators=1000 
[CV]  max_depth=100, max_features=sqrt, min_samples_leaf=5, min_samples_split=2, n_estimators=1000, total=  29.2s
[CV] max_depth=100, max_features=sqrt, min_samples_leaf=5, min_samples_split=2, n_estimators=1000 
[CV]  max_depth=100, max_features=sqrt, min_samples_leaf=5, min_samples_split=2, n_estimators=1000, total=  28.7s
[CV] max_depth=150, max_features=sqrt, 

[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:  4.4min finished


GridSearchCV(cv=2, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [0]:
pprint(rf_grid.best_params_)

{'max_depth': 100,
 'max_features': 'sqrt',
 'min_samples_leaf': 5,
 'min_samples_split': 2,
 'n_estimators': 500}
