In [1]:
#Copyright 2020 Vraj Shah, Arun Kumar
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.

import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import numpy as np

from Load_Predictions import *
from downstream_models import *
from Featurize import *
from Train_Test_Random_Forest import *

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Andrew\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Integer code for every feature-type label; a label's code is simply its
# position in the tuple below.
# String labels can be encoded with this mapping, e.g.:
#     y_true = [dict_label_true[str(i)] for i in y_true]
dict_label_true = {
    label: code
    for code, label in enumerate((
        'numeric',
        'categorical',
        'datetime',
        'sentence',
        'url',
        'embedded-number',
        'list',
        'not-generalizable',
        'context-specific',
    ))
}

In [3]:
# Column names for the featurized table, in order: identifying columns,
# descriptive statistics, pattern flags, text statistics, and finally the
# ten raw sample-value columns (sample_1 .. sample_10).
table_column_names = [
    # identifiers and label
    'Record_id', 'Attribute_name', 'y_act',
    # descriptive statistics
    'total_vals', 'num_nans', '%_nans', 'num_of_dist_val', '%_dist_val',
    'mean', 'std_dev', 'min_val', 'max_val',
    # pattern flags
    'has_delimiters', 'has_url', 'has_email', 'has_date',
    # text statistics
    'mean_word_count', 'std_dev_word_count',
    'mean_stopword_total', 'stdev_stopword_total',
    'mean_char_count', 'stdev_char_count',
    'mean_whitespace_count', 'stdev_whitespace_count',
    'mean_delim_count', 'stdev_delim_count',
    'is_list', 'is_long_sentence',
] + [f'sample_{k}' for k in range(1, 11)]

In [4]:
# Names of the aggregate text-statistic columns: the word-count spread, the
# stop-word statistics, then mean/stdev pairs for the character, whitespace
# and delimiter counts.
# NOTE(review): this list is not referenced in the visible cells.
removed_sample_statistics = [
    'std_dev_word_count',
    'mean_stopword_total',
    'stdev_stopword_total',
] + [
    f'{stat}_{unit}_count'
    for unit in ('char', 'whitespace', 'delim')
    for stat in ('mean', 'stdev')
]

In [5]:
# Number of sample values per column used to build each train/test CSV.
sample_size_list = [1, 2, 3, 4, 5, 10]
# NOTE(review): rand_seed is not referenced in the visible cells; kept in
# case other code reads it.
rand_seed = 100

# One [metrics DataFrame, confusion matrix] pair per entry of sample_size_list.
final_results = []


def _one_vs_rest_counts(cmat, class_idx):
    """Return (TP, FP, TN, FN) for one class of a square confusion matrix.

    Rows are treated as actual labels and columns as predicted labels, which
    is the layout produced by sklearn.metrics.confusion_matrix.
    """
    tp = cmat[class_idx, class_idx]
    fn = cmat[class_idx, :].sum() - tp   # actual == class, predicted != class
    fp = cmat[:, class_idx].sum() - tp   # actual != class, predicted == class
    tn = cmat.sum() - tp - fp - fn       # neither actual nor predicted == class
    return tp, fp, tn, fn


for sample_size in sample_size_list:

    xtrain = pd.read_csv(f"Sampled-Benchmark-Labeled-Data/{sample_size}_sample_data_train.csv")
    xtest = pd.read_csv(f"Sampled-Benchmark-Labeled-Data/{sample_size}_sample_data_test.csv")

    # Train_Test_Random_Forest comes from a star import; from its use below it
    # returns a frame with a 'y_act' column and the matching predictions.
    y_true, RF_results = Train_Test_Random_Forest(xtrain, xtest, truncated=True)

    print(y_true)
    print(RF_results)

    results = pd.DataFrame(
        {'actual': y_true['y_act'].values,
         'predicted': RF_results
        })

    cmat = confusion_matrix(results['actual'], results['predicted'])

    # The per-class loop indexes cmat with the codes in dict_label_true, so
    # every class must be present in this split.
    n_classes = len(dict_label_true)
    assert cmat.shape == (n_classes, n_classes), (
        f"expected a {n_classes}x{n_classes} confusion matrix, got {cmat.shape}"
    )

    accuracies = []
    for key, value in dict_label_true.items():
        # The previous condition `i == value & i == j` parsed as the chained
        # comparison `i == (value & i) == j` because `&` binds tighter than
        # `==`; it also labeled the row remainder FP and the column remainder
        # FN. The counts are now read directly from the matrix, so the printed
        # TP/FP/TN/FN are correctly labeled. Accuracy is numerically unchanged.
        TP, FP, TN, FN = _one_vs_rest_counts(cmat, value)

        print(TP, FP, TN, FN)
        # One-vs-rest accuracy for this class.
        accuracy = (TP + TN) / (TP + FP + TN + FN)
        print(accuracy)
        accuracies.append(accuracy)

    # Single-row frame: one column per class label, row named 'accuracy'.
    class_accuracy = pd.DataFrame(
        [accuracies], index=['accuracy'], columns=list(dict_label_true.keys())
    )

    classification_report_df = pd.DataFrame(
        classification_report(
            results['actual'],
            results['predicted'],
            target_names=list(dict_label_true.keys()),
            output_dict=True,
        )
    )
    # Keep the per-class metric rows and drop the last three columns, which
    # are the report's aggregate entries rather than per-class ones.
    classification_report_df = classification_report_df.loc[['precision', 'recall', 'f1-score']].iloc[:, :-3]

    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the same rows.
    result_df = pd.concat([class_accuracy, classification_report_df])
    final_results.append([result_df, cmat])



[n_estimator: 5, max_depth: 5, accuracy: 0.5122873345935728]
[n_estimator: 5, max_depth: 10, accuracy: 0.5834908632640201]
[n_estimator: 5, max_depth: 25, accuracy: 0.7403906742281033]
[n_estimator: 5, max_depth: 50, accuracy: 0.7908002520478891]
[n_estimator: 5, max_depth: 100, accuracy: 0.7958412098298677]
[n_estimator: 5, max_depth: 250, accuracy: 0.7952110901071203]
[n_estimator: 25, max_depth: 5, accuracy: 0.5293005671077504]
[n_estimator: 25, max_depth: 10, accuracy: 0.6868304977945809]
[n_estimator: 25, max_depth: 25, accuracy: 0.7914303717706365]
[n_estimator: 25, max_depth: 50, accuracy: 0.8185255198487713]
[n_estimator: 25, max_depth: 100, accuracy: 0.8229363579080026]
[n_estimator: 25, max_depth: 250, accuracy: 0.8298676748582231]
[n_estimator: 50, max_depth: 5, accuracy: 0.5122873345935728]
[n_estimator: 50, max_depth: 10, accuracy: 0.6906112161310649]
[n_estimator: 50, max_depth: 25, accuracy: 0.8034026465028355]
[n_estimator: 50, max_depth: 50, accuracy: 0.822936357908002

[n_estimator: 50, max_depth: 25, accuracy: 0.7909319899244333]
[n_estimator: 50, max_depth: 50, accuracy: 0.8243073047858942]
[n_estimator: 50, max_depth: 100, accuracy: 0.8350125944584383]
[n_estimator: 50, max_depth: 250, accuracy: 0.8343828715365239]
[n_estimator: 75, max_depth: 5, accuracy: 0.5906801007556675]
[n_estimator: 75, max_depth: 10, accuracy: 0.6630982367758187]
[n_estimator: 75, max_depth: 25, accuracy: 0.7940806045340051]
[n_estimator: 75, max_depth: 50, accuracy: 0.8243073047858942]
[n_estimator: 75, max_depth: 100, accuracy: 0.8381612090680101]
[n_estimator: 75, max_depth: 250, accuracy: 0.8394206549118388]
[n_estimator: 100, max_depth: 5, accuracy: 0.5661209068010076]
[n_estimator: 100, max_depth: 10, accuracy: 0.6706549118387909]
[n_estimator: 100, max_depth: 25, accuracy: 0.7965994962216625]
[n_estimator: 100, max_depth: 50, accuracy: 0.8287153652392947]
[n_estimator: 100, max_depth: 100, accuracy: 0.836272040302267]
[n_estimator: 100, max_depth: 250, accuracy: 0.8



[n_estimator: 5, max_depth: 5, accuracy: 0.448503937007874]
[n_estimator: 5, max_depth: 10, accuracy: 0.6]
[n_estimator: 5, max_depth: 25, accuracy: 0.7946456692913386]
[n_estimator: 5, max_depth: 50, accuracy: 0.8560629921259842]
[n_estimator: 5, max_depth: 100, accuracy: 0.8503937007874016]
[n_estimator: 5, max_depth: 250, accuracy: 0.8614173228346457]
[n_estimator: 25, max_depth: 5, accuracy: 0.5675590551181102]
[n_estimator: 25, max_depth: 10, accuracy: 0.6755905511811023]
[n_estimator: 25, max_depth: 25, accuracy: 0.8277165354330709]
[n_estimator: 25, max_depth: 50, accuracy: 0.8856692913385826]
[n_estimator: 25, max_depth: 100, accuracy: 0.8944881889763779]
[n_estimator: 25, max_depth: 250, accuracy: 0.9007874015748032]
[n_estimator: 50, max_depth: 5, accuracy: 0.5637795275590551]
[n_estimator: 50, max_depth: 10, accuracy: 0.6831496062992126]
[n_estimator: 50, max_depth: 25, accuracy: 0.8277165354330709]
[n_estimator: 50, max_depth: 50, accuracy: 0.8853543307086614]
[n_estimator:

[n_estimator: 50, max_depth: 25, accuracy: 0.8226771653543307]
[n_estimator: 50, max_depth: 50, accuracy: 0.8762204724409449]
[n_estimator: 50, max_depth: 100, accuracy: 0.8913385826771654]
[n_estimator: 50, max_depth: 250, accuracy: 0.8938582677165354]
[n_estimator: 75, max_depth: 5, accuracy: 0.5908661417322835]
[n_estimator: 75, max_depth: 10, accuracy: 0.68]
[n_estimator: 75, max_depth: 25, accuracy: 0.822992125984252]
[n_estimator: 75, max_depth: 50, accuracy: 0.87748031496063]
[n_estimator: 75, max_depth: 100, accuracy: 0.8910236220472441]
[n_estimator: 75, max_depth: 250, accuracy: 0.8919685039370079]
[n_estimator: 100, max_depth: 5, accuracy: 0.5921259842519685]
[n_estimator: 100, max_depth: 10, accuracy: 0.6834645669291338]
[n_estimator: 100, max_depth: 25, accuracy: 0.8270866141732284]
[n_estimator: 100, max_depth: 50, accuracy: 0.8806299212598425]
[n_estimator: 100, max_depth: 100, accuracy: 0.8925984251968504]
[n_estimator: 100, max_depth: 250, accuracy: 0.8910236220472441]



[n_estimator: 5, max_depth: 5, accuracy: 0.4260814783704326]
[n_estimator: 5, max_depth: 10, accuracy: 0.625577488450231]
[n_estimator: 5, max_depth: 25, accuracy: 0.8170936581268374]
[n_estimator: 5, max_depth: 50, accuracy: 0.8859722805543889]
[n_estimator: 5, max_depth: 100, accuracy: 0.8926921461570768]
[n_estimator: 5, max_depth: 250, accuracy: 0.8950020999580008]
[n_estimator: 25, max_depth: 5, accuracy: 0.5856782864342713]
[n_estimator: 25, max_depth: 10, accuracy: 0.6852162956740865]
[n_estimator: 25, max_depth: 25, accuracy: 0.8485930281394373]
[n_estimator: 25, max_depth: 50, accuracy: 0.9115917681646367]
[n_estimator: 25, max_depth: 100, accuracy: 0.9216715665686687]
[n_estimator: 25, max_depth: 250, accuracy: 0.9237715245695086]
[n_estimator: 50, max_depth: 5, accuracy: 0.5839983200335993]
[n_estimator: 50, max_depth: 10, accuracy: 0.6898362032759345]
[n_estimator: 50, max_depth: 25, accuracy: 0.8534229315413692]
[n_estimator: 50, max_depth: 50, accuracy: 0.9120117597648048

[n_estimator: 50, max_depth: 25, accuracy: 0.8485930281394373]
[n_estimator: 50, max_depth: 50, accuracy: 0.9130617387652247]
[n_estimator: 50, max_depth: 100, accuracy: 0.9210415791684167]
[n_estimator: 50, max_depth: 250, accuracy: 0.9229315413691727]
[n_estimator: 75, max_depth: 5, accuracy: 0.5949181016379672]
[n_estimator: 75, max_depth: 10, accuracy: 0.6864762704745905]
[n_estimator: 75, max_depth: 25, accuracy: 0.8509029819403612]
[n_estimator: 75, max_depth: 50, accuracy: 0.9141117177656447]
[n_estimator: 75, max_depth: 100, accuracy: 0.9223015539689207]
[n_estimator: 75, max_depth: 250, accuracy: 0.9248215035699286]
[n_estimator: 100, max_depth: 5, accuracy: 0.5949181016379672]
[n_estimator: 100, max_depth: 10, accuracy: 0.6864762704745905]
[n_estimator: 100, max_depth: 25, accuracy: 0.8511129777404451]
[n_estimator: 100, max_depth: 50, accuracy: 0.9134817303653927]
[n_estimator: 100, max_depth: 100, accuracy: 0.9231415371692566]
[n_estimator: 100, max_depth: 250, accuracy: 0.

  interactivity=interactivity, compiler=compiler, result=result)


[n_estimator: 5, max_depth: 5, accuracy: 0.527642148369822]
[n_estimator: 5, max_depth: 10, accuracy: 0.6711293117026303]
[n_estimator: 5, max_depth: 25, accuracy: 0.8248543077650023]
[n_estimator: 5, max_depth: 50, accuracy: 0.8979366829421956]
[n_estimator: 5, max_depth: 100, accuracy: 0.9204599149472358]
[n_estimator: 5, max_depth: 250, accuracy: 0.9160497716175776]
[n_estimator: 25, max_depth: 5, accuracy: 0.5873365884391243]
[n_estimator: 25, max_depth: 10, accuracy: 0.6879823594266814]
[n_estimator: 25, max_depth: 25, accuracy: 0.8561978264293589]
[n_estimator: 25, max_depth: 50, accuracy: 0.9220349661363995]
[n_estimator: 25, max_depth: 100, accuracy: 0.9360529217199559]
[n_estimator: 25, max_depth: 250, accuracy: 0.9363679319577887]
[n_estimator: 50, max_depth: 5, accuracy: 0.5810363836824697]
[n_estimator: 50, max_depth: 10, accuracy: 0.6920774925185068]
[n_estimator: 50, max_depth: 25, accuracy: 0.8628130414238463]
[n_estimator: 50, max_depth: 50, accuracy: 0.9223499763742322

[n_estimator: 50, max_depth: 25, accuracy: 0.864860607969759]
[n_estimator: 50, max_depth: 50, accuracy: 0.9303827374389667]
[n_estimator: 50, max_depth: 100, accuracy: 0.9414080957631124]
[n_estimator: 50, max_depth: 250, accuracy: 0.9407780752874468]
[n_estimator: 75, max_depth: 5, accuracy: 0.5884391242715388]
[n_estimator: 75, max_depth: 10, accuracy: 0.7015277996534888]
[n_estimator: 75, max_depth: 25, accuracy: 0.8676957001102535]
[n_estimator: 75, max_depth: 50, accuracy: 0.9316427783902976]
[n_estimator: 75, max_depth: 100, accuracy: 0.941723106000945]
[n_estimator: 75, max_depth: 250, accuracy: 0.9429831469522759]
[n_estimator: 100, max_depth: 5, accuracy: 0.581193888801386]
[n_estimator: 100, max_depth: 10, accuracy: 0.7038903764372342]
[n_estimator: 100, max_depth: 25, accuracy: 0.8703732871318318]
[n_estimator: 100, max_depth: 50, accuracy: 0.9316427783902976]
[n_estimator: 100, max_depth: 100, accuracy: 0.9423531264766105]
[n_estimator: 100, max_depth: 250, accuracy: 0.944

  interactivity=interactivity, compiler=compiler, result=result)


[n_estimator: 5, max_depth: 5, accuracy: 0.5840473790322581]
[n_estimator: 5, max_depth: 10, accuracy: 0.6446572580645161]
[n_estimator: 5, max_depth: 25, accuracy: 0.8181703629032258]
[n_estimator: 5, max_depth: 50, accuracy: 0.9213709677419355]
[n_estimator: 5, max_depth: 100, accuracy: 0.9319556451612904]
[n_estimator: 5, max_depth: 250, accuracy: 0.9291834677419355]
[n_estimator: 25, max_depth: 5, accuracy: 0.5995463709677419]
[n_estimator: 25, max_depth: 10, accuracy: 0.6731350806451613]
[n_estimator: 25, max_depth: 25, accuracy: 0.8608870967741935]
[n_estimator: 25, max_depth: 50, accuracy: 0.9340977822580645]
[n_estimator: 25, max_depth: 100, accuracy: 0.9488407258064516]
[n_estimator: 25, max_depth: 250, accuracy: 0.9501008064516129]
[n_estimator: 50, max_depth: 5, accuracy: 0.5811491935483871]
[n_estimator: 50, max_depth: 10, accuracy: 0.6950604838709677]
[n_estimator: 50, max_depth: 25, accuracy: 0.8623991935483871]
[n_estimator: 50, max_depth: 50, accuracy: 0.936491935483871

[n_estimator: 50, max_depth: 25, accuracy: 0.8694556451612904]
[n_estimator: 50, max_depth: 50, accuracy: 0.9351058467741935]
[n_estimator: 50, max_depth: 100, accuracy: 0.9474546370967742]
[n_estimator: 50, max_depth: 250, accuracy: 0.9478326612903226]
[n_estimator: 75, max_depth: 5, accuracy: 0.6049647177419355]
[n_estimator: 75, max_depth: 10, accuracy: 0.6873739919354839]
[n_estimator: 75, max_depth: 25, accuracy: 0.8712197580645161]
[n_estimator: 75, max_depth: 50, accuracy: 0.9369959677419355]
[n_estimator: 75, max_depth: 100, accuracy: 0.9478326612903226]
[n_estimator: 75, max_depth: 250, accuracy: 0.9493447580645161]
[n_estimator: 100, max_depth: 5, accuracy: 0.6096270161290323]
[n_estimator: 100, max_depth: 10, accuracy: 0.6896421370967742]
[n_estimator: 100, max_depth: 25, accuracy: 0.8727318548387096]
[n_estimator: 100, max_depth: 50, accuracy: 0.9368699596774194]
[n_estimator: 100, max_depth: 100, accuracy: 0.9484627016129032]
[n_estimator: 100, max_depth: 250, accuracy: 0.



[n_estimator: 5, max_depth: 5, accuracy: 0.59294265910523]
[n_estimator: 5, max_depth: 10, accuracy: 0.6742281033396346]
[n_estimator: 5, max_depth: 25, accuracy: 0.7933207309388783]
[n_estimator: 5, max_depth: 50, accuracy: 0.8223062381852552]
[n_estimator: 5, max_depth: 100, accuracy: 0.8267170762444864]
[n_estimator: 5, max_depth: 250, accuracy: 0.833648393194707]
[n_estimator: 25, max_depth: 5, accuracy: 0.6244486452425961]
[n_estimator: 25, max_depth: 10, accuracy: 0.7246376811594203]
[n_estimator: 25, max_depth: 25, accuracy: 0.8399495904221802]
[n_estimator: 25, max_depth: 50, accuracy: 0.8632640201638311]
[n_estimator: 25, max_depth: 100, accuracy: 0.872715816005041]
[n_estimator: 25, max_depth: 250, accuracy: 0.8676748582230623]
[n_estimator: 50, max_depth: 5, accuracy: 0.6156269691241336]
[n_estimator: 50, max_depth: 10, accuracy: 0.7340894770006301]
[n_estimator: 50, max_depth: 25, accuracy: 0.8292375551354757]
[n_estimator: 50, max_depth: 50, accuracy: 0.8645242596093258]
[

[n_estimator: 50, max_depth: 25, accuracy: 0.8261964735516373]
[n_estimator: 50, max_depth: 50, accuracy: 0.8595717884130982]
[n_estimator: 50, max_depth: 100, accuracy: 0.8589420654911839]
[n_estimator: 50, max_depth: 250, accuracy: 0.8589420654911839]
[n_estimator: 75, max_depth: 5, accuracy: 0.6190176322418136]
[n_estimator: 75, max_depth: 10, accuracy: 0.7304785894206549]
[n_estimator: 75, max_depth: 25, accuracy: 0.820528967254408]
[n_estimator: 75, max_depth: 50, accuracy: 0.8570528967254408]
[n_estimator: 75, max_depth: 100, accuracy: 0.860831234256927]
[n_estimator: 75, max_depth: 250, accuracy: 0.8602015113350125]
[n_estimator: 100, max_depth: 5, accuracy: 0.6064231738035264]
[n_estimator: 100, max_depth: 10, accuracy: 0.7311083123425692]
[n_estimator: 100, max_depth: 25, accuracy: 0.8249370277078085]
[n_estimator: 100, max_depth: 50, accuracy: 0.8551637279596978]
[n_estimator: 100, max_depth: 100, accuracy: 0.8602015113350125]
[n_estimator: 100, max_depth: 250, accuracy: 0.86

In [6]:
# Turn each per-sample-size metrics table into a single column indexed by
# (feature type, metric), ready to be placed side by side.
sample_size_results_list = []

for idx, size in enumerate(sample_size_list):
    per_class_metrics = final_results[idx][0]

    # Transpose so classes are rows, then stack the metric columns into a
    # second index level.
    stacked = per_class_metrics.T.stack().to_frame(name=f"Sample Size: {size}")
    stacked.index = stacked.index.set_names(['Feature Type', 'Metric'])

    sample_size_results_list.append(stacked)

In [7]:
# Put the per-sample-size columns side by side; rows align on the shared
# (Feature Type, Metric) index.
benchmark_df = pd.concat(sample_size_results_list, axis='columns')
benchmark_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Sample Size: 1,Sample Size: 2,Sample Size: 3,Sample Size: 4,Sample Size: 5,Sample Size: 10
Feature Type,Metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
numeric,accuracy,0.951134,0.957179,0.961713,0.96272,0.961511,0.961713
numeric,precision,0.906667,0.919703,0.925393,0.92655,0.923449,0.920107
numeric,recall,0.96181,0.963932,0.970769,0.972419,0.97256,0.977369
numeric,f1-score,0.933425,0.941298,0.947538,0.94893,0.947368,0.947874
categorical,accuracy,0.911839,0.922418,0.92309,0.923426,0.924937,0.921914
categorical,precision,0.773256,0.804217,0.799344,0.805611,0.80483,0.802
categorical,recall,0.873085,0.876368,0.889132,0.87965,0.889716,0.877462
categorical,f1-score,0.820144,0.838743,0.841851,0.841004,0.845147,0.838036
datetime,accuracy,0.992947,0.995466,0.996641,0.996474,0.997582,0.996474
datetime,precision,0.956835,0.958333,0.967517,0.971831,0.982979,0.971831


In [8]:
# Export the benchmark table as a LaTeX tabular in the working directory.
benchmark_df.to_latex(buf='Retrained Truncated Sample Size Benchmark.tex')

In [11]:
# Export the same benchmark table as CSV in the working directory.
benchmark_df.to_csv(path_or_buf='Retrained Truncated Sample Size Benchmark.csv')

In [10]:
# Overall accuracy per sample size: correctly classified examples (the
# confusion-matrix diagonal) divided by all examples.
for idx, _ in enumerate(sample_size_list):
    confusion = final_results[idx][1]
    overall_accuracy = confusion.diagonal().sum() / confusion.sum()
    print(overall_accuracy)

0.8357682619647355
0.8508816120906801
0.8609571788413098
0.8604534005037784
0.8615617128463476
0.8634760705289672
