In [3]:
import glob, os
import random
import json
import sklearn
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import make_classification
from sklearn.pipeline import Pipeline
from time import time
import numpy as np

def loadimdb_data(subset='train'):
    if(subset == 'train'):
        pathneg= "./aclImdb/train/neg/"
        pathpos="./aclImdb/train/pos/"
    else:
        pathneg= "./aclImdb/test/neg/"
        pathpos="./aclImdb/test/pos/"
        
    negfiles=[]
    posfiles=[]
 
    for file in glob.glob(pathneg+"*.txt"):
        f = open(file, "r")
        negfiles.append(f.read())
    for file in glob.glob(pathpos+"*.txt"):
        f = open(file, "r")
        posfiles.append(f.read())
        
    neg=list(zip(negfiles,[0]*len(negfiles)))
    pos=list(zip(posfiles,[1]*len(posfiles)))
    sentiments = neg + pos
    
    random.shuffle(sentiments)
    
    dataset = {'data':[], 'target':[]}
    for s in sentiments:
        dataset['data'].append(s[0])
        dataset['target'].append(s[1])
        
    return np.array(dataset['data']), np.array(dataset['target'])

def build_pipeline_IMDB(model = LogisticRegression(random_state=0)):
    return Pipeline(([
        ('vect',CountVectorizer()),
        ('tfidf',TfidfTransformer()),
        ('clf',model),
    ]))

def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})"
                  .format(results['mean_test_score'][candidate],
                          results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
            
def run_pipeline_IMDB(train_data, train_target, model=LogisticRegression(random_state=0),
                 gridsearch = False,
                 params ={'clf__criterion':['gini','entropy'],
                          'clf__n_estimators':[100,200,400,800]
                         }):
     
    
    pl = build_pipeline_IMDB(model)
    pl.fit(train_data, train_target)
    if(gridsearch != None):
        if(gridsearch==True):
            search = GridSearchCV(pl, params, n_jobs=-1,verbose=1)
        else:
            search =RandomizedSearchCV(pl, param_distributions=params,
                                       n_iter=10)
        start = time()
        search.fit(train_data, train_target)
        print("SearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), 10))
        report(search.cv_results_)
        return(search)
    else:
        return(pl)

In [4]:
train_data, train_target = loadimdb_data(subset='train')
test_data, test_target = loadimdb_data(subset='test')

In [5]:
plainRF = run_pipeline_IMDB(train_data, train_target, model=RandomForestClassifier(), gridsearch=None)
plainRFPred = plainRF.predict(test_data)



In [6]:
bestRF=run_pipeline_IMDB(train_data, train_target, model=RandomForestClassifier(), gridsearch=True)



Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  9.9min finished


SearchCV took 947.39 seconds for 10 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.853 (std: 0.007)
Parameters: {'clf__criterion': 'entropy', 'clf__n_estimators': 800}

Model with rank: 2
Mean validation score: 0.851 (std: 0.006)
Parameters: {'clf__criterion': 'entropy', 'clf__n_estimators': 400}

Model with rank: 3
Mean validation score: 0.851 (std: 0.005)
Parameters: {'clf__criterion': 'gini', 'clf__n_estimators': 800}



In [8]:
from sklearn.metrics import confusion_matrix, accuracy_score
bestRFPred = bestRF.predict(test_data)
print("Plain Test Accuracy: ", accuracy_score(plainRFPred, test_target))
print("Best Test Accuracy: ", accuracy_score(bestRFPred, test_target))
print("Plain Train Accuracy: " ,cross_val_score(plainRF, train_data, train_target, cv=3, scoring='accuracy').mean())
print("Best Train Aaccuracy: ", cross_val_score(bestRF, train_data, train_target, cv=3, scoring='accuracy').mean())
bestRF.best_estimator_

Plain Test Accuracy:  0.7254
Best Test Accuracy:  0.85708
Plain Train Accuracy:  0.7332799645831503
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  5.1min finished


Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  5.1min finished


Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  5.1min finished


Best Train Aaccuracy:  0.8530803085441114


Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='entropy', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=

In [10]:
print(confusion_matrix(bestRFPred,test_target))
print(confusion_matrix(plainRFPred,test_target))

[[10663  1736]
 [ 1837 10764]]
[[10220  4585]
 [ 2280  7915]]


In [11]:
rf = build_pipeline_IMDB(model=RandomForestClassifier(criterion='entropy', n_estimators = 800))
rf.fit(train_data, train_target)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='entropy', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=

In [13]:
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

array = confusion_matrix(rf.predict(test_data),test_target)
C=array
array=C / C.astype(np.float).sum(axis=0)
df_cm = pd.DataFrame(array, index = data.target_names,
                  columns = data.target_names)
plt.figure(figsize = (20,10))
sn.heatmap(df_cm, annot=True)

NameError: name 'data' is not defined