In [1]:
import glob, os
import random
import json
import sklearn
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import make_classification
from sklearn.pipeline import Pipeline
from time import time
import numpy as np

def loadimdb_data(subset='train'):
    if(subset == 'train'):
        pathneg= "./aclImdb/train/neg/"
        pathpos="./aclImdb/train/pos/"
    else:
        pathneg= "./aclImdb/test/neg/"
        pathpos="./aclImdb/test/pos/"
        
    negfiles=[]
    posfiles=[]
 
    for file in glob.glob(pathneg+"*.txt"):
        f = open(file, "r")
        negfiles.append(f.read())
    for file in glob.glob(pathpos+"*.txt"):
        f = open(file, "r")
        posfiles.append(f.read())
        
    neg=list(zip(negfiles,[0]*len(negfiles)))
    pos=list(zip(posfiles,[1]*len(posfiles)))
    sentiments = neg + pos
    
    random.shuffle(sentiments)
    
    dataset = {'data':[], 'target':[]}
    for s in sentiments:
        dataset['data'].append(s[0])
        dataset['target'].append(s[1])
        
    return np.array(dataset['data']), np.array(dataset['target'])

def build_pipeline_IMDB(model = LogisticRegression(random_state=0)):
    return Pipeline(([
        ('vect',CountVectorizer()),
        ('tfidf',TfidfTransformer()),
        ('clf',model),
    ]))

In [2]:
train_data, train_target = loadimdb_data(subset='train')
test_data, test_target = loadimdb_data(subset='test')


In [3]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})"
                  .format(results['mean_test_score'][candidate],
                          results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
            
def run_pipeline_IMDB(train_data, train_target, model=LogisticRegression(random_state=0),
                 gridsearch = False,
                 params ={'clf__C': [0.01, 0.05, 0.1, 0.3, 1],
                          'tfidf__use_idf': (True, False),
                          'clf__solver':['newton-cg', 'lbfgs', 'sag', 'saga'],
                          'clf__class_weight': ['balanced',None]
                         }
                ):
    
    pl = build_pipeline_IMDB(model)
    pl.fit(train_data, train_target)
    if(gridsearch != None):
        if(gridsearch==True):
            search = GridSearchCV(pl, params, n_jobs=-1,verbose=1)
        else:
            search =RandomizedSearchCV(pl, param_distributions=params,
                                       n_iter=10)
        start = time()
        search.fit(train_data, train_target)
        print("SearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), 10))
        report(search.cv_results_)
        return(search)
    else:
        return(pl)

In [4]:
plainLR= run_pipeline_IMDB(train_data, train_target, model=LogisticRegression(), gridsearch=None)
plainLRPred = plainLR.predict(test_data)



In [6]:
bestLR=run_pipeline_IMDB(train_data, train_target, model=LogisticRegression(), gridsearch=True)



Fitting 3 folds for each of 80 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  9.3min finished


SearchCV took 563.77 seconds for 10 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.883 (std: 0.003)
Parameters: {'clf__C': 1, 'clf__class_weight': 'balanced', 'clf__solver': 'saga', 'tfidf__use_idf': True}

Model with rank: 2
Mean validation score: 0.883 (std: 0.003)
Parameters: {'clf__C': 1, 'clf__class_weight': None, 'clf__solver': 'saga', 'tfidf__use_idf': True}

Model with rank: 3
Mean validation score: 0.883 (std: 0.003)
Parameters: {'clf__C': 1, 'clf__class_weight': 'balanced', 'clf__solver': 'lbfgs', 'tfidf__use_idf': True}

Model with rank: 3
Mean validation score: 0.883 (std: 0.003)
Parameters: {'clf__C': 1, 'clf__class_weight': None, 'clf__solver': 'lbfgs', 'tfidf__use_idf': True}



In [9]:
from sklearn.metrics import confusion_matrix, accuracy_score
bestLRPred = bestLR.predict(test_data)
print("Plain Train Accuracy: " ,cross_val_score(plainLR, train_data, train_target, cv=3, scoring='accuracy').mean())
print("Best Train Aaccuracy: ", cross_val_score(bestLR, train_data, train_target, cv=3, scoring='accuracy').mean())



Plain Train Accuracy:  0.8832800433094493




Fitting 3 folds for each of 80 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   44.3s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  5.1min finished


Fitting 3 folds for each of 80 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   43.1s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  5.1min finished


Fitting 3 folds for each of 80 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   49.8s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  5.5min finished


Best Train Aaccuracy:  0.8832400465091933
