# Module and Data Importation

In [1]:
import pandas as pd
import pickle
import numpy as np
from scipy.sparse import hstack


## Utilities

In [2]:
import time
import matplotlib.pyplot as plt

## Helper functions:

In [3]:
def accuracy_calculator(confuse):
    accuracy = 0
    tp = confuse[1][1]
    fp = confuse[0][1]
    fn = confuse[1][0]
    tn = confuse[0][0]
    
    accuracy = (tp+tn) / (fn+fp)
    return accuracy

In [4]:
def precision_calculator(confuse):
    precision = 0
    print(confuse)
    tp = confuse[1][1]
    fp = confuse[0][1]
    if tp > 0 or tp ==1:
        precision = tp / (tp+fp)
    return precision

In [5]:
def recall_calculator(confuse):
    recall = 0
    tp = confuse[1][1]
    fn = confuse[1][0]
    if tp > 0 or tp ==1:
        recall = tp / (tp+fn)
    return recall

# Load our results

In [6]:
columns_list = ['pos_only','neg_only','high_ratio','topics','training_time','accuracy','precision', 'recall','ROC_AUC_Score']

In [7]:
df = pd.DataFrame(columns=columns_list)

In [8]:
pos_only_list= [100,50,100,50,50,50,25]
neg_only_list= [50,100,100,100,50,50,50]
high_ratio_list = [400,400,300,300,300,250,275]
topic_list= [50,50,100,150,200,250,250]

grid_space = {'pos_only' : pos_only_list,
             'neg_only':neg_only_list,
             'high_ratio':high_ratio_list,
            'topics':topic_list}
              
for i in range(len(grid_space['pos_only'])):
    num_topics = grid_space['topics'][i]  
    pos_only = grid_space['pos_only'][i]
    neg_only = grid_space['neg_only'][i]
    high_ratio = grid_space['high_ratio'][i]
    
    results_dict= pickle.load(
                    open(
                        'model_stats' +'_pos_' + str(pos_only) +'_neg_' + str(neg_only) + '_ratio_' + str(high_ratio) + '_topcis_' +str(num_topics) +'.p',
                        "rb" ) )
    results_dict['GradientBoostingClassifier']['topics'] = num_topics
    results_dict['GradientBoostingClassifier']['pos_only'] = pos_only
    results_dict['GradientBoostingClassifier']['neg_only'] = neg_only
    results_dict['GradientBoostingClassifier']['high_ratio'] = high_ratio
    
    print(results_dict['GradientBoostingClassifier']['pos_only'])
    #The three we have to calculate
    results_dict['GradientBoostingClassifier']['precision'] = precision_calculator(results_dict['GradientBoostingClassifier']['confuse'])
    results_dict['GradientBoostingClassifier']['recall'] = recall_calculator(results_dict['GradientBoostingClassifier']['confuse'])
    results_dict['GradientBoostingClassifier']['accuracy'] = accuracy_calculator(results_dict['GradientBoostingClassifier']['confuse'])
    del(results_dict['GradientBoostingClassifier']['confuse'])

    
    results_df = pd.DataFrame(results_dict['GradientBoostingClassifier'],index=[0])
    results_df = results_df
    
    df = pd.concat([df,results_df])
    
                                  

100
[[ 4176 10014]
 [ 2384 74231]]
50
[[ 4143 10039]
 [ 2385 74238]]
100
[[ 3428 10700]
 [ 2413 74264]]
50
[[ 4037 10312]
 [ 2572 73884]]
50
[[ 4124  9998]
 [ 2511 74172]]
50
[[ 4513  9554]
 [ 2496 74242]]
25
[[ 4619  9695]
 [ 2507 73984]]


In [9]:
df = df.reset_index(drop=True)

# Stats Examination

## Overview by evaluation metric

In [10]:
df.sort_values(by='precision', ascending=False)

Unnamed: 0,pos_only,neg_only,high_ratio,topics,training_time,accuracy,precision,recall,ROC_AUC_Score
5,50,50,250,250,1437.21523,6.535685,0.885985,0.967474,0.821302
6,25,50,275,250,1434.63588,6.441813,0.884141,0.967225,0.823141
4,50,50,300,200,1158.446701,6.259173,0.881217,0.967255,0.815181
0,100,50,400,50,307.313548,6.324165,0.881132,0.968883,0.825083
1,50,100,400,50,300.799314,6.308838,0.880881,0.968874,0.820027
3,50,100,300,150,888.333634,6.047889,0.877524,0.96636,0.813027
2,100,100,300,100,584.105247,5.924807,0.874064,0.96853,0.80629


In [11]:
df.sort_values(by='recall', ascending=False).head(3)

Unnamed: 0,pos_only,neg_only,high_ratio,topics,training_time,accuracy,precision,recall,ROC_AUC_Score
0,100,50,400,50,307.313548,6.324165,0.881132,0.968883,0.825083
1,50,100,400,50,300.799314,6.308838,0.880881,0.968874,0.820027
2,100,100,300,100,584.105247,5.924807,0.874064,0.96853,0.80629


In [12]:

df.sort_values(by='accuracy', ascending=False).head(3)

Unnamed: 0,pos_only,neg_only,high_ratio,topics,training_time,accuracy,precision,recall,ROC_AUC_Score
5,50,50,250,250,1437.21523,6.535685,0.885985,0.967474,0.821302
6,25,50,275,250,1434.63588,6.441813,0.884141,0.967225,0.823141
0,100,50,400,50,307.313548,6.324165,0.881132,0.968883,0.825083


In [13]:
df.sort_values(by='ROC_AUC_Score', ascending=False).head(3)

Unnamed: 0,pos_only,neg_only,high_ratio,topics,training_time,accuracy,precision,recall,ROC_AUC_Score
0,100,50,400,50,307.313548,6.324165,0.881132,0.968883,0.825083
6,25,50,275,250,1434.63588,6.441813,0.884141,0.967225,0.823141
5,50,50,250,250,1437.21523,6.535685,0.885985,0.967474,0.821302


In [14]:
df.sort_values(by='training_time').head(3)

Unnamed: 0,pos_only,neg_only,high_ratio,topics,training_time,accuracy,precision,recall,ROC_AUC_Score
1,50,100,400,50,300.799314,6.308838,0.880881,0.968874,0.820027
0,100,50,400,50,307.313548,6.324165,0.881132,0.968883,0.825083
2,100,100,300,100,584.105247,5.924807,0.874064,0.96853,0.80629


## Evaluation by input parameterm

In [15]:
df.sort_values(by='pos_only', ascending=False).head(10)

Unnamed: 0,pos_only,neg_only,high_ratio,topics,training_time,accuracy,precision,recall,ROC_AUC_Score
0,100,50,400,50,307.313548,6.324165,0.881132,0.968883,0.825083
2,100,100,300,100,584.105247,5.924807,0.874064,0.96853,0.80629
1,50,100,400,50,300.799314,6.308838,0.880881,0.968874,0.820027
3,50,100,300,150,888.333634,6.047889,0.877524,0.96636,0.813027
4,50,50,300,200,1158.446701,6.259173,0.881217,0.967255,0.815181
5,50,50,250,250,1437.21523,6.535685,0.885985,0.967474,0.821302
6,25,50,275,250,1434.63588,6.441813,0.884141,0.967225,0.823141


In [16]:
df.sort_values(by='neg_only', ascending=False).head(10)

Unnamed: 0,pos_only,neg_only,high_ratio,topics,training_time,accuracy,precision,recall,ROC_AUC_Score
1,50,100,400,50,300.799314,6.308838,0.880881,0.968874,0.820027
2,100,100,300,100,584.105247,5.924807,0.874064,0.96853,0.80629
3,50,100,300,150,888.333634,6.047889,0.877524,0.96636,0.813027
0,100,50,400,50,307.313548,6.324165,0.881132,0.968883,0.825083
4,50,50,300,200,1158.446701,6.259173,0.881217,0.967255,0.815181
5,50,50,250,250,1437.21523,6.535685,0.885985,0.967474,0.821302
6,25,50,275,250,1434.63588,6.441813,0.884141,0.967225,0.823141


In [17]:
df.sort_values(by='high_ratio', ascending=False).head(10)

Unnamed: 0,pos_only,neg_only,high_ratio,topics,training_time,accuracy,precision,recall,ROC_AUC_Score
0,100,50,400,50,307.313548,6.324165,0.881132,0.968883,0.825083
1,50,100,400,50,300.799314,6.308838,0.880881,0.968874,0.820027
2,100,100,300,100,584.105247,5.924807,0.874064,0.96853,0.80629
3,50,100,300,150,888.333634,6.047889,0.877524,0.96636,0.813027
4,50,50,300,200,1158.446701,6.259173,0.881217,0.967255,0.815181
6,25,50,275,250,1434.63588,6.441813,0.884141,0.967225,0.823141
5,50,50,250,250,1437.21523,6.535685,0.885985,0.967474,0.821302


In [18]:
df.sort_values(by='topics', ascending=False).head(10)

Unnamed: 0,pos_only,neg_only,high_ratio,topics,training_time,accuracy,precision,recall,ROC_AUC_Score
5,50,50,250,250,1437.21523,6.535685,0.885985,0.967474,0.821302
6,25,50,275,250,1434.63588,6.441813,0.884141,0.967225,0.823141
4,50,50,300,200,1158.446701,6.259173,0.881217,0.967255,0.815181
3,50,100,300,150,888.333634,6.047889,0.877524,0.96636,0.813027
2,100,100,300,100,584.105247,5.924807,0.874064,0.96853,0.80629
0,100,50,400,50,307.313548,6.324165,0.881132,0.968883,0.825083
1,50,100,400,50,300.799314,6.308838,0.880881,0.968874,0.820027


# Conclusions

Recall: pos(100), neg(50), ratio(400), topic(50), Precision_Score(0.881132),Recall_Score(0.968883)

precision: pos(50), neg(50), ratio(300), topic(200) Precision_Score(0.881217), Recall_Score(0.967255)

Above are the two primary sets of stats I care about. Though the best pure accuracy evaluation came out with exactly the same input parameters as the recall set.

Questions:

Do the polarity only inputs have a significant impact?
    Negative polarity: Non obvious from these trials
    Positive Polarity: Non obvious from these trials

Do the high_ratio inputs have a significant impact?
    All of our high scoring results have ratio-tokens in the three to four hundreds. This really seems to be the only entity worht spending our dimensionality on.

Does the number of topic inputs have a significant impact?
    I would have expected this to be a positive yes, but all of our most effective models (sans two scored for precision) are all using only 50 topics. I expected topics to be more powerful. Curious if it's just at this low dimensionality they can't keep up with our high ratio words? Unclear. Much more tesing required.

