In [1]:
#Copyright 2020 Vraj Shah, Arun Kumar
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.

import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import numpy as np

from Load_Predictions import *
from downstream_models import *
from Featurize import *
from Train_Test_Random_Forest import *

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Andrew\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Maps each feature-type label string to an integer id. Ids are assigned
# from the position of the label in the list below, starting at 0, so the
# list order is significant.
dict_label_true = {
    label: code
    for code, label in enumerate([
        'numeric',
        'categorical',
        'datetime',
        'sentence',
        'url',
        'embedded-number',
        'list',
        'not-generalizable',
        'context-specific',
    ])
}

# y_true = [dict_label_true[str(i)] for i in y_true]

In [3]:
# Ordered list of 38 column names: 28 fixed names followed by
# 'sample_1' .. 'sample_10'.
# NOTE(review): this list is not referenced by any other visible cell —
# confirm whether it is still needed.
table_column_names = (
    ['Record_id', 'Attribute_name', 'y_act']
    + ['total_vals', 'num_nans', '%_nans', 'num_of_dist_val', '%_dist_val']
    + ['mean', 'std_dev', 'min_val', 'max_val']
    + ['has_delimiters', 'has_url', 'has_email', 'has_date']
    + ['mean_word_count', 'std_dev_word_count']
    + ['mean_stopword_total', 'stdev_stopword_total']
    + ['mean_char_count', 'stdev_char_count']
    + ['mean_whitespace_count', 'stdev_whitespace_count']
    + ['mean_delim_count', 'stdev_delim_count']
    + ['is_list', 'is_long_sentence']
    + [f'sample_{k}' for k in range(1, 11)]
)

In [4]:
# For every sample size: load the pre-split train/test CSVs, train and
# evaluate the random forest, then collect per-class metrics and the
# confusion matrix into `final_results`.
sample_size_list = [1,2,3,4,5,10]
rand_seed = 100  # NOTE(review): not read anywhere in this cell — confirm it is still needed

# One entry per sample size, in `sample_size_list` order:
# [per-class metrics DataFrame, confusion matrix].
final_results = []


def _one_vs_rest_counts(cmat, class_id):
    """Return ``(TP, FP, TN, FN)`` for ``class_id`` treated as the positive class.

    ``cmat`` must be a square confusion matrix in the layout produced by
    ``sklearn.metrics.confusion_matrix``: rows are actual labels and
    columns are predicted labels.
    """
    TP = cmat[class_id, class_id]
    FN = cmat[class_id, :].sum() - TP  # actually class_id, predicted as something else
    FP = cmat[:, class_id].sum() - TP  # predicted class_id, actually something else
    TN = cmat.sum() - TP - FN - FP     # everything that involves class_id in neither role
    return TP, FP, TN, FN


for sample_size in sample_size_list:

    xtrain = pd.read_csv(f"Sampled-Benchmark-Labeled-Data/{sample_size}_sample_data_train.csv")
    xtest = pd.read_csv(f"Sampled-Benchmark-Labeled-Data/{sample_size}_sample_data_test.csv")

    # Project helper (brought in by the star imports at the top of the
    # notebook). It is used here as returning the test labels, indexable by
    # 'y_act', and the model's predictions.
    # NOTE(review): presumably this also runs the hyper-parameter sweep whose
    # log lines appear in the cell output — confirm against the helper.
    y_true, RF_results = Train_Test_Random_Forest(xtrain, xtest)

    print(y_true)
    print(RF_results)

    results = pd.DataFrame(
    {'actual': y_true['y_act'].values,
     'predicted': RF_results
    })

    cmat = confusion_matrix(results['actual'], results['predicted'])

    # The per-class loop below indexes `cmat` with the ids stored in
    # `dict_label_true`, so the matrix must contain exactly those classes.
    if cmat.shape[0] != len(dict_label_true):
        raise ValueError(
            f"confusion matrix has {cmat.shape[0]} classes but "
            f"{len(dict_label_true)} labels are defined in dict_label_true"
        )

    accuracies = []

    for key, value in dict_label_true.items():
        TP, FP, TN, FN = _one_vs_rest_counts(cmat, value)

        print(TP, FP, TN, FN)
        # One-vs-rest accuracy: the denominator is the total number of samples.
        accuracy = (TP + TN) / (TP + FP + TN + FN)
        print(accuracy)
        accuracies.append(accuracy)

    # Single-row frame: one column per label, indexed by the metric name.
    class_accuracy = pd.DataFrame([accuracies], index=['accuracy'], columns=list(dict_label_true.keys()))

    classification_report_df = pd.DataFrame(classification_report(results['actual'], results['predicted'], target_names=list(dict_label_true.keys()), output_dict=True))
    # Keep the per-class columns only: the last three columns of the report
    # ('accuracy', 'macro avg', 'weighted avg') are dropped.
    classification_report_df = classification_report_df.loc[['precision', 'recall', 'f1-score']].iloc[: , :-3]

    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way.
    result_df = pd.concat([class_accuracy, classification_report_df])
    final_results.append([result_df, cmat])



[n_estimator: 5, max_depth: 5, accuracy: 0.5274102079395085]
[n_estimator: 5, max_depth: 10, accuracy: 0.667296786389414]
[n_estimator: 5, max_depth: 25, accuracy: 0.7756773787019534]
[n_estimator: 5, max_depth: 50, accuracy: 0.794580970384373]
[n_estimator: 5, max_depth: 100, accuracy: 0.8109640831758034]
[n_estimator: 5, max_depth: 250, accuracy: 0.8097038437303088]
[n_estimator: 25, max_depth: 5, accuracy: 0.591052299936988]
[n_estimator: 25, max_depth: 10, accuracy: 0.700693131695022]
[n_estimator: 25, max_depth: 25, accuracy: 0.7977315689981096]
[n_estimator: 25, max_depth: 50, accuracy: 0.8279773156899811]
[n_estimator: 25, max_depth: 100, accuracy: 0.8304977945809704]
[n_estimator: 25, max_depth: 250, accuracy: 0.8311279143037177]
[n_estimator: 50, max_depth: 5, accuracy: 0.594833018273472]
[n_estimator: 50, max_depth: 10, accuracy: 0.7069943289224953]
[n_estimator: 50, max_depth: 25, accuracy: 0.8090737240075614]
[n_estimator: 50, max_depth: 50, accuracy: 0.8241965973534972]
[n

[n_estimator: 50, max_depth: 25, accuracy: 0.8123425692695214]
[n_estimator: 50, max_depth: 50, accuracy: 0.8369017632241813]
[n_estimator: 50, max_depth: 100, accuracy: 0.8400503778337531]
[n_estimator: 50, max_depth: 250, accuracy: 0.8394206549118388]
[n_estimator: 75, max_depth: 5, accuracy: 0.5969773299748111]
[n_estimator: 75, max_depth: 10, accuracy: 0.7052896725440806]
[n_estimator: 75, max_depth: 25, accuracy: 0.8104534005037783]
[n_estimator: 75, max_depth: 50, accuracy: 0.8387909319899244]
[n_estimator: 75, max_depth: 100, accuracy: 0.8425692695214105]
[n_estimator: 75, max_depth: 250, accuracy: 0.8438287153652393]
[n_estimator: 100, max_depth: 5, accuracy: 0.5856423173803527]
[n_estimator: 100, max_depth: 10, accuracy: 0.707808564231738]
[n_estimator: 100, max_depth: 25, accuracy: 0.8104534005037783]
[n_estimator: 100, max_depth: 50, accuracy: 0.8400503778337531]
[n_estimator: 100, max_depth: 100, accuracy: 0.8463476070528967]
[n_estimator: 100, max_depth: 250, accuracy: 0.8



[n_estimator: 5, max_depth: 5, accuracy: 0.5669291338582677]
[n_estimator: 5, max_depth: 10, accuracy: 0.6633070866141733]
[n_estimator: 5, max_depth: 25, accuracy: 0.8069291338582677]
[n_estimator: 5, max_depth: 50, accuracy: 0.8614173228346457]
[n_estimator: 5, max_depth: 100, accuracy: 0.864251968503937]
[n_estimator: 5, max_depth: 250, accuracy: 0.8614173228346457]
[n_estimator: 25, max_depth: 5, accuracy: 0.5319685039370079]
[n_estimator: 25, max_depth: 10, accuracy: 0.7067716535433071]
[n_estimator: 25, max_depth: 25, accuracy: 0.8374803149606299]
[n_estimator: 25, max_depth: 50, accuracy: 0.8850393700787401]
[n_estimator: 25, max_depth: 100, accuracy: 0.897007874015748]
[n_estimator: 25, max_depth: 250, accuracy: 0.897007874015748]
[n_estimator: 50, max_depth: 5, accuracy: 0.5193700787401575]
[n_estimator: 50, max_depth: 10, accuracy: 0.7064566929133859]
[n_estimator: 50, max_depth: 25, accuracy: 0.8377952755905512]
[n_estimator: 50, max_depth: 50, accuracy: 0.8929133858267716]


[n_estimator: 50, max_depth: 25, accuracy: 0.8346456692913385]
[n_estimator: 50, max_depth: 50, accuracy: 0.8815748031496063]
[n_estimator: 50, max_depth: 100, accuracy: 0.8948031496062993]
[n_estimator: 50, max_depth: 250, accuracy: 0.8948031496062993]
[n_estimator: 75, max_depth: 5, accuracy: 0.598740157480315]
[n_estimator: 75, max_depth: 10, accuracy: 0.7159055118110236]
[n_estimator: 75, max_depth: 25, accuracy: 0.8384251968503937]
[n_estimator: 75, max_depth: 50, accuracy: 0.8834645669291339]
[n_estimator: 75, max_depth: 100, accuracy: 0.8929133858267716]
[n_estimator: 75, max_depth: 250, accuracy: 0.8929133858267716]
[n_estimator: 100, max_depth: 5, accuracy: 0.5977952755905512]
[n_estimator: 100, max_depth: 10, accuracy: 0.710236220472441]
[n_estimator: 100, max_depth: 25, accuracy: 0.8384251968503937]
[n_estimator: 100, max_depth: 50, accuracy: 0.8840944881889764]
[n_estimator: 100, max_depth: 100, accuracy: 0.8922834645669291]
[n_estimator: 100, max_depth: 250, accuracy: 0.89



[n_estimator: 5, max_depth: 5, accuracy: 0.5384292314153717]
[n_estimator: 5, max_depth: 10, accuracy: 0.6990760184796304]
[n_estimator: 5, max_depth: 25, accuracy: 0.8284334313313734]
[n_estimator: 5, max_depth: 50, accuracy: 0.8931121377572449]
[n_estimator: 5, max_depth: 100, accuracy: 0.9017219655606887]
[n_estimator: 5, max_depth: 250, accuracy: 0.9031919361612768]
[n_estimator: 25, max_depth: 5, accuracy: 0.5302393952120957]
[n_estimator: 25, max_depth: 10, accuracy: 0.7062158756824863]
[n_estimator: 25, max_depth: 25, accuracy: 0.8628727425451491]
[n_estimator: 25, max_depth: 50, accuracy: 0.9153716925661487]
[n_estimator: 25, max_depth: 100, accuracy: 0.9202015959680806]
[n_estimator: 25, max_depth: 250, accuracy: 0.9218815623687526]
[n_estimator: 50, max_depth: 5, accuracy: 0.5533389332213355]
[n_estimator: 50, max_depth: 10, accuracy: 0.7032759344813104]
[n_estimator: 50, max_depth: 25, accuracy: 0.8597228055438891]
[n_estimator: 50, max_depth: 50, accuracy: 0.918101637967240

[n_estimator: 50, max_depth: 25, accuracy: 0.8513229735405292]
[n_estimator: 50, max_depth: 50, accuracy: 0.9111717765644687]
[n_estimator: 50, max_depth: 100, accuracy: 0.9258714825703486]
[n_estimator: 50, max_depth: 250, accuracy: 0.9246115077698446]
[n_estimator: 75, max_depth: 5, accuracy: 0.5428391432171357]
[n_estimator: 75, max_depth: 10, accuracy: 0.7062158756824863]
[n_estimator: 75, max_depth: 25, accuracy: 0.8519529609407812]
[n_estimator: 75, max_depth: 50, accuracy: 0.9130617387652247]
[n_estimator: 75, max_depth: 100, accuracy: 0.9269214615707686]
[n_estimator: 75, max_depth: 250, accuracy: 0.9244015119697606]
[n_estimator: 100, max_depth: 5, accuracy: 0.5600587988240235]
[n_estimator: 100, max_depth: 10, accuracy: 0.7032759344813104]
[n_estimator: 100, max_depth: 25, accuracy: 0.8565728685426292]
[n_estimator: 100, max_depth: 50, accuracy: 0.9149517009659807]
[n_estimator: 100, max_depth: 100, accuracy: 0.9273414531709366]
[n_estimator: 100, max_depth: 250, accuracy: 0.

  interactivity=interactivity, compiler=compiler, result=result)


[n_estimator: 5, max_depth: 5, accuracy: 0.4933060324460545]
[n_estimator: 5, max_depth: 10, accuracy: 0.6760119703890376]
[n_estimator: 5, max_depth: 25, accuracy: 0.842809891321468]
[n_estimator: 5, max_depth: 50, accuracy: 0.9026618365096866]
[n_estimator: 5, max_depth: 100, accuracy: 0.9072294849582612]
[n_estimator: 5, max_depth: 250, accuracy: 0.9077020003150102]
[n_estimator: 25, max_depth: 5, accuracy: 0.531107260985982]
[n_estimator: 25, max_depth: 10, accuracy: 0.6964876358481651]
[n_estimator: 25, max_depth: 25, accuracy: 0.86785320522917]
[n_estimator: 25, max_depth: 50, accuracy: 0.9267601197038904]
[n_estimator: 25, max_depth: 100, accuracy: 0.9340053551740432]
[n_estimator: 25, max_depth: 250, accuracy: 0.9335328398172941]
[n_estimator: 50, max_depth: 5, accuracy: 0.5558355646558513]
[n_estimator: 50, max_depth: 10, accuracy: 0.6971176563238305]
[n_estimator: 50, max_depth: 25, accuracy: 0.8746259253425737]
[n_estimator: 50, max_depth: 50, accuracy: 0.929595211844385]
[n

[n_estimator: 50, max_depth: 25, accuracy: 0.8773035123641518]
[n_estimator: 50, max_depth: 50, accuracy: 0.9329028193416286]
[n_estimator: 50, max_depth: 100, accuracy: 0.9385730036226178]
[n_estimator: 50, max_depth: 250, accuracy: 0.9431406520711924]
[n_estimator: 75, max_depth: 5, accuracy: 0.5512679162072768]
[n_estimator: 75, max_depth: 10, accuracy: 0.7035753661994015]
[n_estimator: 75, max_depth: 25, accuracy: 0.8754134509371555]
[n_estimator: 75, max_depth: 50, accuracy: 0.9332178295794613]
[n_estimator: 75, max_depth: 100, accuracy: 0.9415656008820287]
[n_estimator: 75, max_depth: 250, accuracy: 0.9447157032603559]
[n_estimator: 100, max_depth: 5, accuracy: 0.5460702472830367]
[n_estimator: 100, max_depth: 10, accuracy: 0.7049929122696488]
[n_estimator: 100, max_depth: 25, accuracy: 0.8732083792723263]
[n_estimator: 100, max_depth: 50, accuracy: 0.9336903449362104]
[n_estimator: 100, max_depth: 100, accuracy: 0.9410930855252796]
[n_estimator: 100, max_depth: 250, accuracy: 0.

  interactivity=interactivity, compiler=compiler, result=result)


[n_estimator: 5, max_depth: 5, accuracy: 0.46333165322580644]
[n_estimator: 5, max_depth: 10, accuracy: 0.6771673387096774]
[n_estimator: 5, max_depth: 25, accuracy: 0.8516885080645161]
[n_estimator: 5, max_depth: 50, accuracy: 0.920866935483871]
[n_estimator: 5, max_depth: 100, accuracy: 0.9311995967741935]
[n_estimator: 5, max_depth: 250, accuracy: 0.9261592741935484]
[n_estimator: 25, max_depth: 5, accuracy: 0.5200352822580645]
[n_estimator: 25, max_depth: 10, accuracy: 0.7106854838709677]
[n_estimator: 25, max_depth: 25, accuracy: 0.872101814516129]
[n_estimator: 25, max_depth: 50, accuracy: 0.9412802419354839]
[n_estimator: 25, max_depth: 100, accuracy: 0.9480846774193549]
[n_estimator: 25, max_depth: 250, accuracy: 0.9477066532258065]
[n_estimator: 50, max_depth: 5, accuracy: 0.5777469758064516]
[n_estimator: 50, max_depth: 10, accuracy: 0.7017389112903226]
[n_estimator: 50, max_depth: 25, accuracy: 0.8786542338709677]
[n_estimator: 50, max_depth: 50, accuracy: 0.9406502016129032

[n_estimator: 50, max_depth: 25, accuracy: 0.883820564516129]
[n_estimator: 50, max_depth: 50, accuracy: 0.9405241935483871]
[n_estimator: 50, max_depth: 100, accuracy: 0.9506048387096774]
[n_estimator: 50, max_depth: 250, accuracy: 0.9504788306451613]
[n_estimator: 75, max_depth: 5, accuracy: 0.6014364919354839]
[n_estimator: 75, max_depth: 10, accuracy: 0.7118195564516129]
[n_estimator: 75, max_depth: 25, accuracy: 0.8858366935483871]
[n_estimator: 75, max_depth: 50, accuracy: 0.9409022177419355]
[n_estimator: 75, max_depth: 100, accuracy: 0.9508568548387096]
[n_estimator: 75, max_depth: 250, accuracy: 0.9506048387096774]
[n_estimator: 100, max_depth: 5, accuracy: 0.5918598790322581]
[n_estimator: 100, max_depth: 10, accuracy: 0.7053931451612904]
[n_estimator: 100, max_depth: 25, accuracy: 0.885710685483871]
[n_estimator: 100, max_depth: 50, accuracy: 0.9409022177419355]
[n_estimator: 100, max_depth: 100, accuracy: 0.9507308467741935]
[n_estimator: 100, max_depth: 250, accuracy: 0.95



[n_estimator: 5, max_depth: 5, accuracy: 0.553875236294896]
[n_estimator: 5, max_depth: 10, accuracy: 0.7347195967233774]
[n_estimator: 5, max_depth: 25, accuracy: 0.8260869565217391]
[n_estimator: 5, max_depth: 50, accuracy: 0.8487712665406427]
[n_estimator: 5, max_depth: 100, accuracy: 0.8323881537492124]
[n_estimator: 5, max_depth: 250, accuracy: 0.8323881537492124]
[n_estimator: 25, max_depth: 5, accuracy: 0.6238185255198487]
[n_estimator: 25, max_depth: 10, accuracy: 0.7567737870195337]
[n_estimator: 25, max_depth: 25, accuracy: 0.8462507876496534]
[n_estimator: 25, max_depth: 50, accuracy: 0.8739760554505356]
[n_estimator: 25, max_depth: 100, accuracy: 0.8664146187775678]
[n_estimator: 25, max_depth: 250, accuracy: 0.8664146187775678]
[n_estimator: 50, max_depth: 5, accuracy: 0.6061751732829238]
[n_estimator: 50, max_depth: 10, accuracy: 0.7586641461877757]
[n_estimator: 50, max_depth: 25, accuracy: 0.8374291115311909]
[n_estimator: 50, max_depth: 50, accuracy: 0.8714555765595463

[n_estimator: 50, max_depth: 25, accuracy: 0.8400503778337531]
[n_estimator: 50, max_depth: 50, accuracy: 0.8620906801007556]
[n_estimator: 50, max_depth: 100, accuracy: 0.8646095717884131]
[n_estimator: 50, max_depth: 250, accuracy: 0.8646095717884131]
[n_estimator: 75, max_depth: 5, accuracy: 0.656801007556675]
[n_estimator: 75, max_depth: 10, accuracy: 0.7588161209068011]
[n_estimator: 75, max_depth: 25, accuracy: 0.8438287153652393]
[n_estimator: 75, max_depth: 50, accuracy: 0.8664987405541562]
[n_estimator: 75, max_depth: 100, accuracy: 0.8658690176322418]
[n_estimator: 75, max_depth: 250, accuracy: 0.8671284634760705]
[n_estimator: 100, max_depth: 5, accuracy: 0.6492443324937027]
[n_estimator: 100, max_depth: 10, accuracy: 0.7600755667506297]
[n_estimator: 100, max_depth: 25, accuracy: 0.843198992443325]
[n_estimator: 100, max_depth: 50, accuracy: 0.8652392947103275]
[n_estimator: 100, max_depth: 100, accuracy: 0.8652392947103275]
[n_estimator: 100, max_depth: 250, accuracy: 0.86

In [5]:
# Turn each per-sample-size metrics table into a one-column frame indexed by
# (Feature Type, Metric), so the columns can later be placed side by side.
sample_size_results_list = []

for idx, size in enumerate(sample_size_list):
    metrics_table = final_results[idx][0]

    # Transposing puts the labels on the rows; stack() then moves the metric
    # names into a second index level.
    stacked = metrics_table.T.stack().to_frame()
    stacked = stacked.rename(columns={0: f"Sample Size: {size}"})
    stacked.index.names = ['Feature Type', 'Metric']

    sample_size_results_list.append(stacked)

In [6]:
# Join the one-column frames side by side: one column per sample size.
benchmark_df = pd.concat(sample_size_results_list, axis="columns")
benchmark_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Sample Size: 1,Sample Size: 2,Sample Size: 3,Sample Size: 4,Sample Size: 5,Sample Size: 10
Feature Type,Metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
numeric,accuracy,0.95869,0.956423,0.963392,0.963602,0.963224,0.959698
numeric,precision,0.921727,0.916723,0.928025,0.92615,0.927224,0.917443
numeric,recall,0.966054,0.965347,0.972654,0.975601,0.973126,0.97454
numeric,f1-score,0.94337,0.940406,0.949816,0.950232,0.94962,0.94513
categorical,accuracy,0.91738,0.919647,0.922754,0.924181,0.922217,0.922922
categorical,precision,0.785575,0.801418,0.800264,0.809596,0.800318,0.800395
categorical,recall,0.881838,0.865427,0.885485,0.876915,0.882276,0.886214
categorical,f1-score,0.830928,0.832194,0.84072,0.841912,0.839301,0.841121
datetime,accuracy,0.99597,0.996977,0.997649,0.997229,0.997985,0.996474
datetime,precision,0.971631,0.96875,0.978923,0.983929,0.984441,0.978571


In [7]:
# Write the benchmark table to a LaTeX file in the working directory.
latex_path = 'Retrained Sample Size Benchmark.tex'
benchmark_df.to_latex(buf=latex_path)

In [8]:
# Write the benchmark table to a CSV file in the working directory.
csv_path = 'Retrained Sample Size Benchmark.csv'
benchmark_df.to_csv(path_or_buf=csv_path)

In [14]:
# Overall accuracy per sample size: the diagonal of the confusion matrix
# (correct predictions) divided by the sum of all its entries.
for position, _ in enumerate(sample_size_list):
    confusion = final_results[position][1]
    correct = np.trace(confusion)
    print(correct / np.sum(confusion))

0.856926952141058
0.8566750629722922
0.8666666666666667
0.8683879093198993
0.8679093198992444
0.8695214105793451
