In [1]:
#Copyright 2020 Vraj Shah, Arun Kumar
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.

import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import numpy as np

from Load_Predictions import *
from downstream_models import *
from Featurize import *
from Train_Test_Random_Forest import *

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Andrew\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Feature-type label name -> integer code (0 through 8, in the order listed).
dict_label_true = dict(
    zip(
        (
            'numeric',
            'categorical',
            'datetime',
            'sentence',
            'url',
            'embedded-number',
            'list',
            'not-generalizable',
            'context-specific',
        ),
        range(9),
    )
)

In [3]:
# Column names: record/attribute identifiers and label first, then the
# descriptive-statistic columns, then the raw sample columns sample_1 .. sample_10.
# NOTE(review): this list is not referenced by any cell visible in this notebook.
table_column_names = [
    'Record_id',
    'Attribute_name',
    'y_act',
    'total_vals',
    'num_nans',
    '%_nans',
    'num_of_dist_val',
    '%_dist_val',
    'mean',
    'std_dev',
    'min_val',
    'max_val',
    'has_delimiters',
    'has_url',
    'has_email',
    'has_date',
    'mean_word_count',
    'std_dev_word_count',
    'mean_stopword_total',
    'stdev_stopword_total',
    'mean_char_count',
    'stdev_char_count',
    'mean_whitespace_count',
    'stdev_whitespace_count',
    'mean_delim_count',
    'stdev_delim_count',
    'is_list',
    'is_long_sentence',
    *(f'sample_{n}' for n in range(1, 11)),
]

In [4]:
# Benchmark the Random Forest for several numbers of raw sample-value columns.
# For every sample size the model is trained/evaluated once and
# [per-class metrics table, confusion matrix] is appended to `final_results`.
sample_size_list = [1, 2, 3, 4, 5, 10]
rand_seed = 100  # NOTE(review): not used in this cell; kept in case another cell reads it
results = []

final_results = []

# Class names and ids in the order of dict_label_true. The ids are passed as
# `labels` below so the confusion matrix always has one row/column per class,
# in this order, even when a class is missing from a particular split.
class_names = list(dict_label_true.keys())
class_ids = list(dict_label_true.values())

for sample_size in sample_size_list:

    # The 10-sample experiment has its own CSVs; every other size starts from
    # the default CSVs and drops the sample columns beyond `sample_size`
    # (sample_2..sample_5 for size 1, ..., nothing for size 5).
    if sample_size == 10:
        xtrain = pd.read_csv("Benchmark-Labeled-Data/10_sample_data_train.csv")
        xtest = pd.read_csv("Benchmark-Labeled-Data/10_sample_data_test.csv")
    else:
        xtrain = pd.read_csv("Benchmark-Labeled-Data/data_train.csv")
        xtest = pd.read_csv("Benchmark-Labeled-Data/data_test.csv")
        surplus_samples = [f"sample_{k}" for k in range(sample_size + 1, 6)]
        if surplus_samples:
            xtrain = xtrain.drop(surplus_samples, axis=1)
            xtest = xtest.drop(surplus_samples, axis=1)

    y_true, RF_results = Train_Test_Random_Forest(xtrain, xtest)

    print(y_true)
    print(RF_results)

    results = pd.DataFrame(
        {
            'actual': y_true['y_act'].values,
            'predicted': RF_results,
        }
    )

    # sklearn convention: rows are actual classes, columns are predicted classes.
    cmat = confusion_matrix(results['actual'], results['predicted'], labels=class_ids)
    total = cmat.sum()
    accuracies = []

    # One-vs-rest accuracy for every class.
    for position in range(len(class_ids)):
        TP = cmat[position, position]
        FN = cmat[position, :].sum() - TP   # actual class, predicted as something else
        FP = cmat[:, position].sum() - TP   # other class, predicted as this class
        TN = total - TP - FP - FN           # everything not involving this class

        # The counts printed here follow the standard definitions above; the
        # resulting accuracy is (TP + TN) / all predictions.
        print(TP, FP, TN, FN)
        accuracy = (TP + TN) / total
        print(accuracy)
        accuracies.append(accuracy)

    # Single-row frame: index 'accuracy', one column per class name.
    class_accuracy = list(zip(class_names, accuracies))
    class_accuracy = pd.DataFrame(class_accuracy).set_index(0).T
    class_accuracy.index = ['accuracy']

    classification_report_df = pd.DataFrame(
        classification_report(
            results['actual'],
            results['predicted'],
            labels=class_ids,
            target_names=class_names,
            output_dict=True,
        )
    )
    # Keep the per-class precision/recall/f1 rows and drop the three trailing
    # summary columns of the report.
    classification_report_df = classification_report_df.loc[['precision', 'recall', 'f1-score']].iloc[:, :-3]

    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way.
    result_df = pd.concat([class_accuracy, classification_report_df])
    final_results.append([result_df, cmat])



[n_estimator: 5, max_depth: 5, accuracy: 0.6093257718966604]
[n_estimator: 5, max_depth: 10, accuracy: 0.8021424070573409]
[n_estimator: 5, max_depth: 25, accuracy: 0.8878386893509767]
[n_estimator: 5, max_depth: 50, accuracy: 0.9054820415879017]
[n_estimator: 5, max_depth: 100, accuracy: 0.9042218021424071]
[n_estimator: 5, max_depth: 250, accuracy: 0.9042218021424071]
[n_estimator: 25, max_depth: 5, accuracy: 0.7038437303087587]
[n_estimator: 25, max_depth: 10, accuracy: 0.8204158790170132]
[n_estimator: 25, max_depth: 25, accuracy: 0.9086326402016384]
[n_estimator: 25, max_depth: 50, accuracy: 0.9319470699432892]
[n_estimator: 25, max_depth: 100, accuracy: 0.9332073093887838]
[n_estimator: 25, max_depth: 250, accuracy: 0.9332073093887838]
[n_estimator: 50, max_depth: 5, accuracy: 0.7410207939508506]
[n_estimator: 50, max_depth: 10, accuracy: 0.8235664776307499]
[n_estimator: 50, max_depth: 25, accuracy: 0.9143037177063642]
[n_estimator: 50, max_depth: 50, accuracy: 0.930686830497794

[n_estimator: 50, max_depth: 25, accuracy: 0.9143576826196473]
[n_estimator: 50, max_depth: 50, accuracy: 0.9332493702770781]
[n_estimator: 50, max_depth: 100, accuracy: 0.9307304785894207]
[n_estimator: 50, max_depth: 250, accuracy: 0.9307304785894207]
[n_estimator: 75, max_depth: 5, accuracy: 0.7122166246851386]
[n_estimator: 75, max_depth: 10, accuracy: 0.8154911838790933]
[n_estimator: 75, max_depth: 25, accuracy: 0.9086901763224181]
[n_estimator: 75, max_depth: 50, accuracy: 0.9307304785894207]
[n_estimator: 75, max_depth: 100, accuracy: 0.9307304785894207]
[n_estimator: 75, max_depth: 250, accuracy: 0.9307304785894207]
[n_estimator: 100, max_depth: 5, accuracy: 0.7260705289672544]
[n_estimator: 100, max_depth: 10, accuracy: 0.8079345088161209]
[n_estimator: 100, max_depth: 25, accuracy: 0.9149874055415617]
[n_estimator: 100, max_depth: 50, accuracy: 0.9326196473551638]
[n_estimator: 100, max_depth: 100, accuracy: 0.931360201511335]
[n_estimator: 100, max_depth: 250, accuracy: 0.9



[n_estimator: 5, max_depth: 5, accuracy: 0.6093257718966604]
[n_estimator: 5, max_depth: 10, accuracy: 0.8021424070573409]
[n_estimator: 5, max_depth: 25, accuracy: 0.8878386893509767]
[n_estimator: 5, max_depth: 50, accuracy: 0.9054820415879017]
[n_estimator: 5, max_depth: 100, accuracy: 0.9042218021424071]
[n_estimator: 5, max_depth: 250, accuracy: 0.9042218021424071]
[n_estimator: 25, max_depth: 5, accuracy: 0.7038437303087587]
[n_estimator: 25, max_depth: 10, accuracy: 0.8204158790170132]
[n_estimator: 25, max_depth: 25, accuracy: 0.9086326402016384]
[n_estimator: 25, max_depth: 50, accuracy: 0.9319470699432892]
[n_estimator: 25, max_depth: 100, accuracy: 0.9332073093887838]
[n_estimator: 25, max_depth: 250, accuracy: 0.9332073093887838]
[n_estimator: 50, max_depth: 5, accuracy: 0.7410207939508506]
[n_estimator: 50, max_depth: 10, accuracy: 0.8235664776307499]
[n_estimator: 50, max_depth: 25, accuracy: 0.9143037177063642]
[n_estimator: 50, max_depth: 50, accuracy: 0.930686830497794

[n_estimator: 50, max_depth: 25, accuracy: 0.9143576826196473]
[n_estimator: 50, max_depth: 50, accuracy: 0.9332493702770781]
[n_estimator: 50, max_depth: 100, accuracy: 0.9307304785894207]
[n_estimator: 50, max_depth: 250, accuracy: 0.9307304785894207]
[n_estimator: 75, max_depth: 5, accuracy: 0.7122166246851386]
[n_estimator: 75, max_depth: 10, accuracy: 0.8154911838790933]
[n_estimator: 75, max_depth: 25, accuracy: 0.9086901763224181]
[n_estimator: 75, max_depth: 50, accuracy: 0.9307304785894207]
[n_estimator: 75, max_depth: 100, accuracy: 0.9307304785894207]
[n_estimator: 75, max_depth: 250, accuracy: 0.9307304785894207]
[n_estimator: 100, max_depth: 5, accuracy: 0.7260705289672544]
[n_estimator: 100, max_depth: 10, accuracy: 0.8079345088161209]
[n_estimator: 100, max_depth: 25, accuracy: 0.9149874055415617]
[n_estimator: 100, max_depth: 50, accuracy: 0.9326196473551638]
[n_estimator: 100, max_depth: 100, accuracy: 0.931360201511335]
[n_estimator: 100, max_depth: 250, accuracy: 0.9



[n_estimator: 5, max_depth: 5, accuracy: 0.6093257718966604]
[n_estimator: 5, max_depth: 10, accuracy: 0.8021424070573409]
[n_estimator: 5, max_depth: 25, accuracy: 0.8878386893509767]
[n_estimator: 5, max_depth: 50, accuracy: 0.9054820415879017]
[n_estimator: 5, max_depth: 100, accuracy: 0.9042218021424071]
[n_estimator: 5, max_depth: 250, accuracy: 0.9042218021424071]
[n_estimator: 25, max_depth: 5, accuracy: 0.7038437303087587]
[n_estimator: 25, max_depth: 10, accuracy: 0.8204158790170132]
[n_estimator: 25, max_depth: 25, accuracy: 0.9086326402016384]
[n_estimator: 25, max_depth: 50, accuracy: 0.9319470699432892]
[n_estimator: 25, max_depth: 100, accuracy: 0.9332073093887838]
[n_estimator: 25, max_depth: 250, accuracy: 0.9332073093887838]
[n_estimator: 50, max_depth: 5, accuracy: 0.7410207939508506]
[n_estimator: 50, max_depth: 10, accuracy: 0.8235664776307499]
[n_estimator: 50, max_depth: 25, accuracy: 0.9143037177063642]
[n_estimator: 50, max_depth: 50, accuracy: 0.930686830497794

[n_estimator: 50, max_depth: 25, accuracy: 0.9143576826196473]
[n_estimator: 50, max_depth: 50, accuracy: 0.9332493702770781]
[n_estimator: 50, max_depth: 100, accuracy: 0.9307304785894207]
[n_estimator: 50, max_depth: 250, accuracy: 0.9307304785894207]
[n_estimator: 75, max_depth: 5, accuracy: 0.7122166246851386]
[n_estimator: 75, max_depth: 10, accuracy: 0.8154911838790933]
[n_estimator: 75, max_depth: 25, accuracy: 0.9086901763224181]
[n_estimator: 75, max_depth: 50, accuracy: 0.9307304785894207]
[n_estimator: 75, max_depth: 100, accuracy: 0.9307304785894207]
[n_estimator: 75, max_depth: 250, accuracy: 0.9307304785894207]
[n_estimator: 100, max_depth: 5, accuracy: 0.7260705289672544]
[n_estimator: 100, max_depth: 10, accuracy: 0.8079345088161209]
[n_estimator: 100, max_depth: 25, accuracy: 0.9149874055415617]
[n_estimator: 100, max_depth: 50, accuracy: 0.9326196473551638]
[n_estimator: 100, max_depth: 100, accuracy: 0.931360201511335]
[n_estimator: 100, max_depth: 250, accuracy: 0.9



[n_estimator: 5, max_depth: 5, accuracy: 0.6093257718966604]
[n_estimator: 5, max_depth: 10, accuracy: 0.8021424070573409]
[n_estimator: 5, max_depth: 25, accuracy: 0.8878386893509767]
[n_estimator: 5, max_depth: 50, accuracy: 0.9054820415879017]
[n_estimator: 5, max_depth: 100, accuracy: 0.9042218021424071]
[n_estimator: 5, max_depth: 250, accuracy: 0.9042218021424071]
[n_estimator: 25, max_depth: 5, accuracy: 0.7038437303087587]
[n_estimator: 25, max_depth: 10, accuracy: 0.8204158790170132]
[n_estimator: 25, max_depth: 25, accuracy: 0.9086326402016384]
[n_estimator: 25, max_depth: 50, accuracy: 0.9319470699432892]
[n_estimator: 25, max_depth: 100, accuracy: 0.9332073093887838]
[n_estimator: 25, max_depth: 250, accuracy: 0.9332073093887838]
[n_estimator: 50, max_depth: 5, accuracy: 0.7410207939508506]
[n_estimator: 50, max_depth: 10, accuracy: 0.8235664776307499]
[n_estimator: 50, max_depth: 25, accuracy: 0.9143037177063642]
[n_estimator: 50, max_depth: 50, accuracy: 0.930686830497794

[n_estimator: 50, max_depth: 25, accuracy: 0.9143576826196473]
[n_estimator: 50, max_depth: 50, accuracy: 0.9332493702770781]
[n_estimator: 50, max_depth: 100, accuracy: 0.9307304785894207]
[n_estimator: 50, max_depth: 250, accuracy: 0.9307304785894207]
[n_estimator: 75, max_depth: 5, accuracy: 0.7122166246851386]
[n_estimator: 75, max_depth: 10, accuracy: 0.8154911838790933]
[n_estimator: 75, max_depth: 25, accuracy: 0.9086901763224181]
[n_estimator: 75, max_depth: 50, accuracy: 0.9307304785894207]
[n_estimator: 75, max_depth: 100, accuracy: 0.9307304785894207]
[n_estimator: 75, max_depth: 250, accuracy: 0.9307304785894207]
[n_estimator: 100, max_depth: 5, accuracy: 0.7260705289672544]
[n_estimator: 100, max_depth: 10, accuracy: 0.8079345088161209]
[n_estimator: 100, max_depth: 25, accuracy: 0.9149874055415617]
[n_estimator: 100, max_depth: 50, accuracy: 0.9326196473551638]
[n_estimator: 100, max_depth: 100, accuracy: 0.931360201511335]
[n_estimator: 100, max_depth: 250, accuracy: 0.9



[n_estimator: 5, max_depth: 5, accuracy: 0.6093257718966604]
[n_estimator: 5, max_depth: 10, accuracy: 0.8021424070573409]
[n_estimator: 5, max_depth: 25, accuracy: 0.8878386893509767]
[n_estimator: 5, max_depth: 50, accuracy: 0.9054820415879017]
[n_estimator: 5, max_depth: 100, accuracy: 0.9042218021424071]
[n_estimator: 5, max_depth: 250, accuracy: 0.9042218021424071]
[n_estimator: 25, max_depth: 5, accuracy: 0.7038437303087587]
[n_estimator: 25, max_depth: 10, accuracy: 0.8204158790170132]
[n_estimator: 25, max_depth: 25, accuracy: 0.9086326402016384]
[n_estimator: 25, max_depth: 50, accuracy: 0.9319470699432892]
[n_estimator: 25, max_depth: 100, accuracy: 0.9332073093887838]
[n_estimator: 25, max_depth: 250, accuracy: 0.9332073093887838]
[n_estimator: 50, max_depth: 5, accuracy: 0.7410207939508506]
[n_estimator: 50, max_depth: 10, accuracy: 0.8235664776307499]
[n_estimator: 50, max_depth: 25, accuracy: 0.9143037177063642]
[n_estimator: 50, max_depth: 50, accuracy: 0.930686830497794

[n_estimator: 50, max_depth: 25, accuracy: 0.9143576826196473]
[n_estimator: 50, max_depth: 50, accuracy: 0.9332493702770781]
[n_estimator: 50, max_depth: 100, accuracy: 0.9307304785894207]
[n_estimator: 50, max_depth: 250, accuracy: 0.9307304785894207]
[n_estimator: 75, max_depth: 5, accuracy: 0.7122166246851386]
[n_estimator: 75, max_depth: 10, accuracy: 0.8154911838790933]
[n_estimator: 75, max_depth: 25, accuracy: 0.9086901763224181]
[n_estimator: 75, max_depth: 50, accuracy: 0.9307304785894207]
[n_estimator: 75, max_depth: 100, accuracy: 0.9307304785894207]
[n_estimator: 75, max_depth: 250, accuracy: 0.9307304785894207]
[n_estimator: 100, max_depth: 5, accuracy: 0.7260705289672544]
[n_estimator: 100, max_depth: 10, accuracy: 0.8079345088161209]
[n_estimator: 100, max_depth: 25, accuracy: 0.9149874055415617]
[n_estimator: 100, max_depth: 50, accuracy: 0.9326196473551638]
[n_estimator: 100, max_depth: 100, accuracy: 0.931360201511335]
[n_estimator: 100, max_depth: 250, accuracy: 0.9



[n_estimator: 5, max_depth: 5, accuracy: 0.6786389413988658]
[n_estimator: 5, max_depth: 10, accuracy: 0.7674858223062382]
[n_estimator: 5, max_depth: 25, accuracy: 0.8796471329552615]
[n_estimator: 5, max_depth: 50, accuracy: 0.9004410838059231]
[n_estimator: 5, max_depth: 100, accuracy: 0.908002520478891]
[n_estimator: 5, max_depth: 250, accuracy: 0.908002520478891]
[n_estimator: 25, max_depth: 5, accuracy: 0.6969124133585382]
[n_estimator: 25, max_depth: 10, accuracy: 0.8160050409577819]
[n_estimator: 25, max_depth: 25, accuracy: 0.9124133585381222]
[n_estimator: 25, max_depth: 50, accuracy: 0.9224952741020794]
[n_estimator: 25, max_depth: 100, accuracy: 0.923755513547574]
[n_estimator: 25, max_depth: 250, accuracy: 0.923755513547574]
[n_estimator: 50, max_depth: 5, accuracy: 0.6811594202898551]
[n_estimator: 50, max_depth: 10, accuracy: 0.8166351606805293]
[n_estimator: 50, max_depth: 25, accuracy: 0.9149338374291115]
[n_estimator: 50, max_depth: 50, accuracy: 0.925645872715816]
[n

[n_estimator: 50, max_depth: 25, accuracy: 0.9118387909319899]
[n_estimator: 50, max_depth: 50, accuracy: 0.9319899244332494]
[n_estimator: 50, max_depth: 100, accuracy: 0.9288413098236776]
[n_estimator: 50, max_depth: 250, accuracy: 0.9288413098236776]
[n_estimator: 75, max_depth: 5, accuracy: 0.6939546599496221]
[n_estimator: 75, max_depth: 10, accuracy: 0.8123425692695214]
[n_estimator: 75, max_depth: 25, accuracy: 0.9175062972292192]
[n_estimator: 75, max_depth: 50, accuracy: 0.9319899244332494]
[n_estimator: 75, max_depth: 100, accuracy: 0.9351385390428212]
[n_estimator: 75, max_depth: 250, accuracy: 0.9351385390428212]
[n_estimator: 100, max_depth: 5, accuracy: 0.6914357682619647]
[n_estimator: 100, max_depth: 10, accuracy: 0.8085642317380353]
[n_estimator: 100, max_depth: 25, accuracy: 0.9149874055415617]
[n_estimator: 100, max_depth: 50, accuracy: 0.9307304785894207]
[n_estimator: 100, max_depth: 100, accuracy: 0.9332493702770781]
[n_estimator: 100, max_depth: 250, accuracy: 0.

In [5]:
# Turn each run's metrics table into a single column indexed by
# (feature type, metric) and named after the sample size of that run.
sample_size_results_list = []

for run_idx, size in enumerate(sample_size_list):
    stacked = pd.DataFrame(final_results[run_idx][0].T.stack())
    column = stacked.rename(columns={0: f"Sample Size: {size}"})
    column.index = column.index.set_names(['Feature Type', 'Metric'])
    sample_size_results_list.append(column)

In [6]:
# Place the per-sample-size columns side by side and show the combined table.
benchmark_df = pd.concat(sample_size_results_list, axis='columns')
benchmark_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Sample Size: 1,Sample Size: 2,Sample Size: 3,Sample Size: 4,Sample Size: 5,Sample Size: 10
Feature Type,Metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
numeric,accuracy,0.971788,0.971788,0.971788,0.971788,0.971788,0.969773
numeric,precision,0.936913,0.936913,0.936913,0.936913,0.936913,0.931909
numeric,recall,0.98727,0.98727,0.98727,0.98727,0.98727,0.98727
numeric,f1-score,0.961433,0.961433,0.961433,0.961433,0.961433,0.958791
categorical,accuracy,0.968262,0.968262,0.968262,0.968262,0.968262,0.966751
categorical,precision,0.913866,0.913866,0.913866,0.913866,0.913866,0.911579
categorical,recall,0.95186,0.95186,0.95186,0.95186,0.95186,0.947484
categorical,f1-score,0.932476,0.932476,0.932476,0.932476,0.932476,0.929185
datetime,accuracy,0.996977,0.996977,0.996977,0.996977,0.996977,0.996977
datetime,precision,0.985612,0.985612,0.985612,0.985612,0.985612,0.985612


In [16]:
# Write the combined benchmark table to a LaTeX file in the working directory.
benchmark_df.to_latex(buf='BaseFeatureSampleValues.tex')

In [17]:
# Write the combined benchmark table to a CSV file in the working directory.
benchmark_df.to_csv(path_or_buf='BaseFeatureSampleValues.csv')

In [9]:
# Overall accuracy per sample size: diagonal of the stored confusion matrix
# (matching actual/predicted pairs) divided by the sum of all its entries.
for run_idx, _ in enumerate(sample_size_list):
    confusion = final_results[run_idx][1]
    print(confusion.trace() / confusion.sum())

0.9284634760705289
0.9284634760705289
0.9284634760705289
0.9284634760705289
0.9284634760705289
0.9259445843828715
