Load corpus

In [8]:
import glob
import os
# Root folder of the corpus. The original local path stays the default; set the
# MOVIE_REVIEW_CORPUS_PATH environment variable to point the notebook at a copy
# of the corpus elsewhere without editing this cell.
# os.path.join(..., '') appends a trailing separator only when one is missing,
# so code that concatenates sub-paths onto corpus_path keeps working.
corpus_path = os.path.join(
    os.environ.get('MOVIE_REVIEW_CORPUS_PATH', 'C:/Users/Rik/Documents/corpus/'),
    '',
)
# Parallel lists filled by read_files_for_folder(): reviews[i] is the raw text
# of one file and labels[i] is the label passed for that file's folder.
reviews, labels = [], []
def read_files_for_folder(folder_path, is_spam):
    """Append every ``*.txt`` file of one corpus sub-folder to the global lists.

    Files are read from ``<corpus_path>/movie_review_sentiment/<folder_path>``
    in sorted filename order. ``glob.glob`` on its own gives no ordering
    guarantee, so sorting keeps the contents of ``reviews``/``labels`` (and
    therefore any unshuffled train/test split built on them) reproducible
    across runs and machines.

    Parameters
    ----------
    folder_path : str
        Sub-folder name below ``movie_review_sentiment`` (the notebook calls
        this with ``'neg'`` and ``'pos'``).
    is_spam : int
        Label appended to ``labels`` once per file. Despite the name, the
        notebook passes a sentiment label here (0 for ``'neg'``, 1 for
        ``'pos'``); the parameter name is kept so existing calls still work.

    Returns
    -------
    None
        The module-level ``reviews`` and ``labels`` lists are extended in place.
    """
    # os.path.join tolerates a corpus_path with or without a trailing separator.
    file_path = os.path.join(corpus_path, 'movie_review_sentiment', folder_path)
    for filename in sorted(glob.glob(os.path.join(file_path, '*.txt'))):
        with open(filename, 'r', encoding="ISO-8859-1") as infile:
            text = infile.read()
        reviews.append(text)
        labels.append(is_spam)
# Load the 'neg' folder with label 0 first, then the 'pos' folder with label 1.
# After each folder, print the running length of both lists so a mismatch
# between texts and labels is visible immediately.
for folder_name, label in (('neg', 0), ('pos', 1)):
    read_files_for_folder(folder_name, label)
    for collected in (reviews, labels):
        print(len(collected))

1000
1000
2000
2000


Load spaCy

In [9]:
import spacy
# Load the 'en_core_web_lg' spaCy pipeline once; `nlp` is the module-level
# callable that clean_review() applies to each review text.
# NOTE(review): the model package presumably has to be installed separately
# before this cell can run — confirm for the target environment.
nlp = spacy.load('en_core_web_lg')

Clean reviews

In [10]:
def clean_review(review):
    """Reduce a raw review to a space-separated string of lower-cased lemmas.

    The text is passed through the module-level ``nlp`` pipeline. Tokens that
    are not alphabetic (``is_alpha`` false) or that are flagged as stop words
    (``is_stop`` true) are discarded; the lemma of every remaining token is
    lower-cased and the results are joined with single spaces.
    """
    kept_lemmas = []
    for token in nlp(review):
        # Skip punctuation, numbers and other non-alphabetic tokens, plus stop words.
        if not token.is_alpha or token.is_stop:
            continue
        kept_lemmas.append(token.lemma_.lower())
    return " ".join(kept_lemmas)

In [11]:
# Run every raw review through clean_review(), keeping the original order so
# that cleaned_reviews[i] still lines up with labels[i].
cleaned_reviews = list(map(clean_review, reviews))

Initialize a stratified 10-fold generator that splits the data into folds while preserving the class fractions

In [12]:
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Number of cross-validation folds.
k = 10
# No random_state here: it only has an effect together with shuffle=True.
# Without shuffling the split is already deterministic, and passing
# random_state while shuffle is False is ignored by older scikit-learn
# releases and rejected with a ValueError by newer ones.
k_fold = StratifiedKFold(n_splits=k)
# NumPy copies of the inputs so the fold index arrays can be used for
# fancy indexing (plain Python lists do not support that).
cleaned_reviews_np = np.array(cleaned_reviews)
labels_np = np.array(labels)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
def run_kfold_multinomial(max_features_option, smoothing_factor_option, fit_prior_option, min_df_option, max_df_option):
    """Grid-search CountVectorizer + MultinomialNB settings over the k folds.

    For every fold produced by the module-level ``k_fold`` splitter and every
    combination of the supplied option lists, a ``CountVectorizer`` is fitted
    on the training texts, a ``MultinomialNB`` classifier is trained on the
    resulting counts, and the ROC AUC of the probability column at index 1 is
    measured on the held-out fold.

    Parameters
    ----------
    max_features_option, min_df_option, max_df_option : iterable
        Candidate values for the matching ``CountVectorizer`` arguments.
    smoothing_factor_option : iterable
        Candidate values for ``MultinomialNB(alpha=...)``.
    fit_prior_option : iterable
        Candidate values for ``MultinomialNB(fit_prior=...)``.

    Returns
    -------
    list of dict
        One record per (fold, parameter combination) with the keys
        ``max_features``, ``min_df``, ``max_df``, ``smoothing``,
        ``fit_prior`` and ``auc``.
    """
    results = []
    for train_idx, test_idx in k_fold.split(cleaned_reviews, labels):
        train_texts = cleaned_reviews_np[train_idx]
        test_texts = cleaned_reviews_np[test_idx]
        train_labels = labels_np[train_idx]
        test_labels = labels_np[test_idx]
        for max_features in max_features_option:
            for min_df in min_df_option:
                for max_df in max_df_option:
                    # The vectorizer depends only on these three settings, so it is
                    # fitted once here and reused for every classifier setting below.
                    vectorizer = CountVectorizer(
                        stop_words='english',
                        max_features=max_features,
                        max_df=max_df,
                        min_df=min_df,
                    )
                    train_counts = vectorizer.fit_transform(train_texts)
                    test_counts = vectorizer.transform(test_texts)
                    for alpha in smoothing_factor_option:
                        for fit_prior in fit_prior_option:
                            model = MultinomialNB(alpha=alpha, fit_prior=fit_prior)
                            model.fit(train_counts, train_labels)
                            # Column 1 holds the probability of the second class.
                            positive_scores = model.predict_proba(test_counts)[:, 1]
                            record = {
                                'max_features': max_features,
                                'min_df': min_df,
                                'max_df': max_df,
                                'smoothing': alpha,
                                'fit_prior': fit_prior,
                                'auc': roc_auc_score(test_labels, positive_scores)
                            }
                            results.append(record)
    return results


In [14]:
# First, coarse hyper-parameter grid.
# CountVectorizer settings:
max_features_option = [2000, 8000, None]
min_df_option = [1, 2, 3, 4]
max_df_option = [0.4, 0.5, 0.6, 0.7]
# MultinomialNB settings:
smoothing_factor_option = [0.5, 1.0, 2.0, 4.0]
fit_prior_option = [True, False]
# Evaluate every combination on every fold.
auc_record = run_kfold_multinomial(
    max_features_option=max_features_option,
    smoothing_factor_option=smoothing_factor_option,
    fit_prior_option=fit_prior_option,
    min_df_option=min_df_option,
    max_df_option=max_df_option,
)

In [15]:
import pandas as pd

def print_results(auc_record, top_n=None):
    """Print the mean AUC per hyper-parameter combination, best first.

    Records are grouped by ``max_features``, ``min_df``, ``max_df``,
    ``smoothing`` and ``fit_prior``; the ``auc`` values of each group are
    averaged and the groups are listed in descending order of that mean.

    The whole table is written with a single ``print`` call rather than one
    call per row, and ``top_n`` can cap the number of rows. Large grids
    otherwise flood the notebook output stream.

    Parameters
    ----------
    auc_record : list of dict
        Records with the keys ``max_features``, ``min_df``, ``max_df``,
        ``smoothing``, ``fit_prior`` and ``auc``.
    top_n : int or None, optional
        When given, only the ``top_n`` best combinations are printed.
        The default (``None``) prints every combination.

    Returns
    -------
    None
    """
    header = 'max_features \t| min_df \t| max_df \t| smoothing \t| fit_prior \t| auc'
    if not auc_record:
        # Nothing to aggregate: grouping an empty frame would raise a KeyError.
        print(header)
        return

    group_columns = ['max_features', 'min_df', 'max_df', 'smoothing', 'fit_prior']
    # fillna(-1) turns a missing max_features (None) into -1 so that those
    # runs form a regular group key instead of a missing one.
    grouped = pd.DataFrame(auc_record).fillna(-1).groupby(group_columns)
    summary = [
        {
            'max_features': key[0],
            'min_df': key[1],
            'max_df': key[2],
            'smoothing': key[3],
            'fit_prior': key[4],
            'auc': np.mean(group['auc'])
        }
        for key, group in grouped
    ]
    # list.sort is stable, so ties keep their group order, as with sorted().
    summary.sort(key=lambda auc_run: auc_run['auc'], reverse=True)
    if top_n is not None:
        summary = summary[:top_n]

    lines = [header]
    for auc_run in summary:
        parts = (
            auc_run['max_features'], '\t\t|',
            auc_run['min_df'], '\t\t|',
            auc_run['max_df'], '\t\t|',
            auc_run['smoothing'], '\t\t|',
            auc_run['fit_prior'], '\t|',
            auc_run['auc'],
        )
        # Same text that print(*parts) would produce (space-separated str()).
        lines.append(' '.join(str(part) for part in parts))
    print('\n'.join(lines))
    
# Print the averaged AUC of every hyper-parameter combination from the grid
# search above, sorted from best to worst.
print_results(auc_record)

max_features 	| min_df 	| max_df 	| smoothing 	| fit_prior 	| auc
-1.0 		| 4 		| 0.5 		| 4.0 		| False 	| 0.8879199999999999
-1.0 		| 4 		| 0.5 		| 4.0 		| True 	| 0.8879199999999999
-1.0 		| 4 		| 0.7 		| 4.0 		| False 	| 0.88749
-1.0 		| 4 		| 0.7 		| 4.0 		| True 	| 0.88749
-1.0 		| 4 		| 0.6 		| 4.0 		| False 	| 0.8874650000000001
-1.0 		| 4 		| 0.6 		| 4.0 		| True 	| 0.8874650000000001
8000.0 		| 4 		| 0.5 		| 4.0 		| False 	| 0.8862499999999999
8000.0 		| 4 		| 0.5 		| 4.0 		| True 	| 0.8862499999999999
-1.0 		| 3 		| 0.5 		| 4.0 		| False 	| 0.88576
-1.0 		| 3 		| 0.5 		| 4.0 		| True 	| 0.88576
8000.0 		| 4 		| 0.7 		| 4.0 		| False 	| 0.885675
8000.0 		| 4 		| 0.7 		| 4.0 		| True 	| 0.885675
8000.0 		| 4 		| 0.6 		| 4.0 		| False 	| 0.8855999999999999
8000.0 		| 4 		| 0.6 		| 4.0 		| True 	| 0.8855999999999999
-1.0 		| 4 		| 0.5 		| 2.0 		| False 	| 0.8854349999999999
-1.0 		| 4 		| 0.5 		| 2.0 		| True 	| 0.8854349999999999
-1.0 		| 3 		| 0.7 		| 4.0 		| False 	| 0.88526500

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 0.87902
-1.0 		| 3 		| 0.7 		| 1.0 		| False 	| 0.87895
-1.0 		| 3 		| 0.7 		| 1.0 		| True 	| 0.87895
-1.0 		| 3 		| 0.6 		| 1.0 		| False 	| 0.878935
-1.0 		| 3 		| 0.6 		| 1.0 		| True 	| 0.878935
-1.0 		| 4 		| 0.4 		| 1.0 		| False 	| 0.87888
-1.0 		| 4 		| 0.4 		| 1.0 		| True 	| 0.87888
8000.0 		| 2 		| 0.4 		| 4.0 		| False 	| 0.8785299999999999
8000.0 		| 2 		| 0.4 		| 4.0 		| True 	| 0.8785299999999999
8000.0 		| 1 		| 0.4 		| 4.0 		| False 	| 0.878275
8000.0 		| 1 		| 0.4 		| 4.0 		| True 	| 0.878275
8000.0 		| 3 		| 0.6 		| 1.0 		| False 	| 0.87823
8000.0 		| 3 		| 0.6 		| 1.0 		| True 	| 0.87823
8000.0 		| 3 		| 0.7 		| 1.0 		| False 	| 0.87812
8000.0 		| 3 		| 0.7 		| 1.0 		| True 	| 0.87812
-1.0 		| 3 		| 0.4 		| 2.0 		| False 	| 0.878055
-1.0 		| 3 		| 0.4 		| 2.0 		| True 	| 0.878055
-1.0 		| 2 		| 0.5 		| 1.0 		| False 	| 0.877845
-1.0 		| 2 		| 0.5 		| 1.0 		| True 	| 0.877845
8000.0 		| 3 		| 0.4 		| 2.0 		| False 	| 0.87773
8000.0 		| 3 		| 0.4 		| 2.0 		| True 	|

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [19]:
# Second, narrower grid with larger smoothing factors and higher min_df values.
# CountVectorizer settings:
max_features_option = [None]
min_df_option = [5, 6]
max_df_option = [0.5]
# MultinomialNB settings:
smoothing_factor_option = [20.0, 30.0, 40.0, 50.0]
fit_prior_option = [True, False]
# Evaluate every combination on every fold, then print the averaged results.
auc_record = run_kfold_multinomial(
    max_features_option=max_features_option,
    smoothing_factor_option=smoothing_factor_option,
    fit_prior_option=fit_prior_option,
    min_df_option=min_df_option,
    max_df_option=max_df_option,
)
print_results(auc_record)

max_features 	| min_df 	| max_df 	| smoothing 	| fit_prior 	| auc
-1.0 		| 6 		| 0.5 		| 20.0 		| False 	| 0.893205
-1.0 		| 6 		| 0.5 		| 20.0 		| True 	| 0.893205
-1.0 		| 5 		| 0.5 		| 20.0 		| False 	| 0.8931799999999999
-1.0 		| 5 		| 0.5 		| 20.0 		| True 	| 0.8931799999999999
-1.0 		| 6 		| 0.5 		| 30.0 		| False 	| 0.8931350000000002
-1.0 		| 6 		| 0.5 		| 30.0 		| True 	| 0.8931350000000002
-1.0 		| 5 		| 0.5 		| 30.0 		| False 	| 0.893075
-1.0 		| 5 		| 0.5 		| 30.0 		| True 	| 0.893075
-1.0 		| 6 		| 0.5 		| 40.0 		| False 	| 0.8924949999999999
-1.0 		| 6 		| 0.5 		| 40.0 		| True 	| 0.8924949999999999
-1.0 		| 5 		| 0.5 		| 40.0 		| False 	| 0.892205
-1.0 		| 5 		| 0.5 		| 40.0 		| True 	| 0.892205
-1.0 		| 6 		| 0.5 		| 50.0 		| False 	| 0.89111
-1.0 		| 6 		| 0.5 		| 50.0 		| True 	| 0.89111
-1.0 		| 5 		| 0.5 		| 50.0 		| False 	| 0.8906899999999999
-1.0 		| 5 		| 0.5 		| 50.0 		| True 	| 0.8906899999999999
1200.0 		| 6 		| 0.5 		| 20.0 		| False 	| 0.8765050000000001
12