In [1]:
#Copyright 2020 Vraj Shah, Arun Kumar
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.

import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import numpy as np

from Load_Predictions import *
from downstream_models import *
from Featurize import *
from Train_Test_Random_Forest import *

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Andrew\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Maps each feature-type label to its integer class code; the code is simply
# the label's position in the tuple below.
dict_label_true = {
    label: code
    for code, label in enumerate((
        'numeric',
        'categorical',
        'datetime',
        'sentence',
        'url',
        'embedded-number',
        'list',
        'not-generalizable',
        'context-specific',
    ))
}

In [3]:
# Training-set fractions to benchmark. Each value selects the matching
# `{proportion}_sample_data_train.csv` / `_test.csv` pair on disk.
subset_proportion_list = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
rand_seed = 100  # NOTE(review): not referenced in this cell — confirm whether another cell/module needs it.

# Class names in label-code order, used to label every per-class table below.
class_names = list(dict_label_true.keys())
report_metrics = ['precision', 'recall', 'f1-score']


def _one_vs_rest_counts(cmat):
    """Return per-class (TP, FP, TN, FN) arrays for a square confusion matrix.

    The matrix is expected in scikit-learn's layout: rows are actual classes,
    columns are predicted classes. Each returned array has one entry per class,
    treating that class as "positive" and all other classes as "negative".
    """
    cmat = np.asarray(cmat)
    tp = np.diag(cmat)
    fn = cmat.sum(axis=1) - tp      # actual == class, predicted something else
    fp = cmat.sum(axis=0) - tp      # predicted == class, actually something else
    tn = cmat.sum() - tp - fp - fn  # everything that does not involve the class
    return tp, fp, tn, fn


# One entry per subset proportion: [per-class metrics DataFrame, confusion matrix].
final_results = []

for subset_proportion in subset_proportion_list:

    xtrain = pd.read_csv(f"DescriptiveStatisticSubset-Data/{subset_proportion}_sample_data_train.csv")
    xtest = pd.read_csv(f"DescriptiveStatisticSubset-Data/{subset_proportion}_sample_data_test.csv")
    # Returns the ground-truth frame (with a 'y_act' column) and the predictions.
    y_true, RF_results = Train_Test_Random_Forest(xtrain, xtest)

    results = pd.DataFrame(
    {'actual': y_true['y_act'].values,
     'predicted': RF_results
    })

    cmat = confusion_matrix(results['actual'], results['predicted'])

    # One-vs-rest accuracy per class: (TP + TN) / total number of samples.
    tp, fp, tn, fn = _one_vs_rest_counts(cmat)
    accuracies = (tp + tn) / cmat.sum()

    for TP, FP, TN, FN, accuracy in zip(tp, fp, tn, fn, accuracies):
        print(TP, FP, TN, FN)
        print(accuracy)

    # Single-row frame: index 'accuracy', one column per class. Raises if the
    # confusion matrix does not have exactly one row per known class.
    class_accuracy = pd.DataFrame([accuracies], index=['accuracy'], columns=class_names)

    classification_report_df = pd.DataFrame(classification_report(results['actual'], results['predicted'], target_names=class_names, output_dict=True))
    # Keep only the per-class columns (drops the report's aggregate columns).
    classification_report_df = classification_report_df.loc[report_metrics, class_names]

    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows
    # in the same order (accuracy, precision, recall, f1-score).
    result_df = pd.concat([class_accuracy, classification_report_df])
    final_results.append([result_df, cmat])



[n_estimator: 5, max_depth: 5, accuracy: 0.6154864328259431]
[n_estimator: 5, max_depth: 10, accuracy: 0.7538054268696228]
[n_estimator: 5, max_depth: 25, accuracy: 0.8318994043679683]
[n_estimator: 5, max_depth: 50, accuracy: 0.8596955658504302]
[n_estimator: 5, max_depth: 100, accuracy: 0.8669755129053607]
[n_estimator: 5, max_depth: 250, accuracy: 0.8669755129053607]
[n_estimator: 25, max_depth: 5, accuracy: 0.6737260092653872]
[n_estimator: 25, max_depth: 10, accuracy: 0.771012574454004]
[n_estimator: 25, max_depth: 25, accuracy: 0.8696227663798809]
[n_estimator: 25, max_depth: 50, accuracy: 0.8815354070152217]
[n_estimator: 25, max_depth: 100, accuracy: 0.8894771674387822]
[n_estimator: 25, max_depth: 250, accuracy: 0.8894771674387822]
[n_estimator: 50, max_depth: 5, accuracy: 0.6307081403044341]
[n_estimator: 50, max_depth: 10, accuracy: 0.7643944407677035]
[n_estimator: 50, max_depth: 25, accuracy: 0.8663136995367307]
[n_estimator: 50, max_depth: 50, accuracy: 0.8874917273328922

[n_estimator: 50, max_depth: 25, accuracy: 0.8775645268034414]
[n_estimator: 50, max_depth: 50, accuracy: 0.8901389808074123]
[n_estimator: 50, max_depth: 100, accuracy: 0.8874917273328922]
[n_estimator: 50, max_depth: 250, accuracy: 0.8874917273328922]
[n_estimator: 75, max_depth: 5, accuracy: 0.6882859033752482]
[n_estimator: 75, max_depth: 10, accuracy: 0.7935142289874255]
[n_estimator: 75, max_depth: 25, accuracy: 0.8755790866975512]
[n_estimator: 75, max_depth: 50, accuracy: 0.886829913964262]
[n_estimator: 75, max_depth: 100, accuracy: 0.8894771674387822]
[n_estimator: 75, max_depth: 250, accuracy: 0.8894771674387822]
[n_estimator: 100, max_depth: 5, accuracy: 0.6968894771674388]
[n_estimator: 100, max_depth: 10, accuracy: 0.7941760423560555]
[n_estimator: 100, max_depth: 25, accuracy: 0.8802117802779617]
[n_estimator: 100, max_depth: 50, accuracy: 0.8881535407015222]
[n_estimator: 100, max_depth: 100, accuracy: 0.8881535407015222]
[n_estimator: 100, max_depth: 250, accuracy: 0.8



[n_estimator: 5, max_depth: 5, accuracy: 0.6612057667103539]
[n_estimator: 5, max_depth: 10, accuracy: 0.7293577981651376]
[n_estimator: 5, max_depth: 25, accuracy: 0.8348623853211009]
[n_estimator: 5, max_depth: 50, accuracy: 0.8591087811271297]
[n_estimator: 5, max_depth: 100, accuracy: 0.8492791612057667]
[n_estimator: 5, max_depth: 250, accuracy: 0.8492791612057667]
[n_estimator: 25, max_depth: 5, accuracy: 0.6677588466579292]
[n_estimator: 25, max_depth: 10, accuracy: 0.752948885976409]
[n_estimator: 25, max_depth: 25, accuracy: 0.8617300131061599]
[n_estimator: 25, max_depth: 50, accuracy: 0.8800786369593709]
[n_estimator: 25, max_depth: 100, accuracy: 0.8754914809960681]
[n_estimator: 25, max_depth: 250, accuracy: 0.8754914809960681]
[n_estimator: 50, max_depth: 5, accuracy: 0.6428571428571429]
[n_estimator: 50, max_depth: 10, accuracy: 0.7640891218872871]
[n_estimator: 50, max_depth: 25, accuracy: 0.8663171690694627]
[n_estimator: 50, max_depth: 50, accuracy: 0.8872870249017037

[n_estimator: 50, max_depth: 25, accuracy: 0.8663171690694627]
[n_estimator: 50, max_depth: 50, accuracy: 0.8866317169069463]
[n_estimator: 50, max_depth: 100, accuracy: 0.8879423328964613]
[n_estimator: 50, max_depth: 250, accuracy: 0.8879423328964613]
[n_estimator: 75, max_depth: 5, accuracy: 0.6317169069462647]
[n_estimator: 75, max_depth: 10, accuracy: 0.7844036697247706]
[n_estimator: 75, max_depth: 25, accuracy: 0.8702490170380078]
[n_estimator: 75, max_depth: 50, accuracy: 0.8859764089121888]
[n_estimator: 75, max_depth: 100, accuracy: 0.8879423328964613]
[n_estimator: 75, max_depth: 250, accuracy: 0.8879423328964613]
[n_estimator: 100, max_depth: 5, accuracy: 0.6225425950196593]
[n_estimator: 100, max_depth: 10, accuracy: 0.7876802096985583]
[n_estimator: 100, max_depth: 25, accuracy: 0.8669724770642202]
[n_estimator: 100, max_depth: 50, accuracy: 0.8866317169069463]
[n_estimator: 100, max_depth: 100, accuracy: 0.8872870249017037]
[n_estimator: 100, max_depth: 250, accuracy: 0.



[n_estimator: 5, max_depth: 5, accuracy: 0.6487549148099607]
[n_estimator: 5, max_depth: 10, accuracy: 0.7293577981651376]
[n_estimator: 5, max_depth: 25, accuracy: 0.8420707732634338]
[n_estimator: 5, max_depth: 50, accuracy: 0.8479685452162516]
[n_estimator: 5, max_depth: 100, accuracy: 0.8466579292267365]
[n_estimator: 5, max_depth: 250, accuracy: 0.8466579292267365]
[n_estimator: 25, max_depth: 5, accuracy: 0.6710353866317169]
[n_estimator: 25, max_depth: 10, accuracy: 0.752948885976409]
[n_estimator: 25, max_depth: 25, accuracy: 0.8702490170380078]
[n_estimator: 25, max_depth: 50, accuracy: 0.8833551769331586]
[n_estimator: 25, max_depth: 100, accuracy: 0.8820445609436435]
[n_estimator: 25, max_depth: 250, accuracy: 0.8820445609436435]
[n_estimator: 50, max_depth: 5, accuracy: 0.6644823066841415]
[n_estimator: 50, max_depth: 10, accuracy: 0.772608125819135]
[n_estimator: 50, max_depth: 25, accuracy: 0.8722149410222805]
[n_estimator: 50, max_depth: 50, accuracy: 0.8879423328964613]

[n_estimator: 50, max_depth: 25, accuracy: 0.8709043250327654]
[n_estimator: 50, max_depth: 50, accuracy: 0.8931847968545217]
[n_estimator: 50, max_depth: 100, accuracy: 0.8879423328964613]
[n_estimator: 50, max_depth: 250, accuracy: 0.8879423328964613]
[n_estimator: 75, max_depth: 5, accuracy: 0.6415465268676278]
[n_estimator: 75, max_depth: 10, accuracy: 0.7798165137614679]
[n_estimator: 75, max_depth: 25, accuracy: 0.8741808650065531]
[n_estimator: 75, max_depth: 50, accuracy: 0.8885976408912188]
[n_estimator: 75, max_depth: 100, accuracy: 0.8931847968545217]
[n_estimator: 75, max_depth: 250, accuracy: 0.8931847968545217]
[n_estimator: 100, max_depth: 5, accuracy: 0.6336828309305373]
[n_estimator: 100, max_depth: 10, accuracy: 0.7844036697247706]
[n_estimator: 100, max_depth: 25, accuracy: 0.8741808650065531]
[n_estimator: 100, max_depth: 50, accuracy: 0.8918741808650066]
[n_estimator: 100, max_depth: 100, accuracy: 0.8925294888597641]
[n_estimator: 100, max_depth: 250, accuracy: 0.



[n_estimator: 5, max_depth: 5, accuracy: 0.6644823066841415]
[n_estimator: 5, max_depth: 10, accuracy: 0.7536041939711664]
[n_estimator: 5, max_depth: 25, accuracy: 0.8446920052424639]
[n_estimator: 5, max_depth: 50, accuracy: 0.8525557011795544]
[n_estimator: 5, max_depth: 100, accuracy: 0.854521625163827]
[n_estimator: 5, max_depth: 250, accuracy: 0.854521625163827]
[n_estimator: 25, max_depth: 5, accuracy: 0.6625163826998689]
[n_estimator: 25, max_depth: 10, accuracy: 0.7634338138925295]
[n_estimator: 25, max_depth: 25, accuracy: 0.8702490170380078]
[n_estimator: 25, max_depth: 50, accuracy: 0.8866317169069463]
[n_estimator: 25, max_depth: 100, accuracy: 0.8866317169069463]
[n_estimator: 25, max_depth: 250, accuracy: 0.8866317169069463]
[n_estimator: 50, max_depth: 5, accuracy: 0.6500655307994757]
[n_estimator: 50, max_depth: 10, accuracy: 0.7706422018348624]
[n_estimator: 50, max_depth: 25, accuracy: 0.8735255570117956]
[n_estimator: 50, max_depth: 50, accuracy: 0.8938401048492791]

[n_estimator: 50, max_depth: 25, accuracy: 0.8787680209698558]
[n_estimator: 50, max_depth: 50, accuracy: 0.8885976408912188]
[n_estimator: 50, max_depth: 100, accuracy: 0.891218872870249]
[n_estimator: 50, max_depth: 250, accuracy: 0.891218872870249]
[n_estimator: 75, max_depth: 5, accuracy: 0.63564875491481]
[n_estimator: 75, max_depth: 10, accuracy: 0.7870249017038008]
[n_estimator: 75, max_depth: 25, accuracy: 0.8800786369593709]
[n_estimator: 75, max_depth: 50, accuracy: 0.891218872870249]
[n_estimator: 75, max_depth: 100, accuracy: 0.8931847968545217]
[n_estimator: 75, max_depth: 250, accuracy: 0.8931847968545217]
[n_estimator: 100, max_depth: 5, accuracy: 0.6258191349934469]
[n_estimator: 100, max_depth: 10, accuracy: 0.7863695937090432]
[n_estimator: 100, max_depth: 25, accuracy: 0.8781127129750983]
[n_estimator: 100, max_depth: 50, accuracy: 0.8885976408912188]
[n_estimator: 100, max_depth: 100, accuracy: 0.8951507208387942]
[n_estimator: 100, max_depth: 250, accuracy: 0.89515



[n_estimator: 5, max_depth: 5, accuracy: 0.6671035386631717]
[n_estimator: 5, max_depth: 10, accuracy: 0.7234600262123198]
[n_estimator: 5, max_depth: 25, accuracy: 0.8440366972477065]
[n_estimator: 5, max_depth: 50, accuracy: 0.8512450851900393]
[n_estimator: 5, max_depth: 100, accuracy: 0.8669724770642202]
[n_estimator: 5, max_depth: 250, accuracy: 0.8669724770642202]
[n_estimator: 25, max_depth: 5, accuracy: 0.6546526867627785]
[n_estimator: 25, max_depth: 10, accuracy: 0.7680209698558322]
[n_estimator: 25, max_depth: 25, accuracy: 0.8709043250327654]
[n_estimator: 25, max_depth: 50, accuracy: 0.8892529488859764]
[n_estimator: 25, max_depth: 100, accuracy: 0.891218872870249]
[n_estimator: 25, max_depth: 250, accuracy: 0.891218872870249]
[n_estimator: 50, max_depth: 5, accuracy: 0.6435124508519003]
[n_estimator: 50, max_depth: 10, accuracy: 0.7706422018348624]
[n_estimator: 50, max_depth: 25, accuracy: 0.8689384010484927]
[n_estimator: 50, max_depth: 50, accuracy: 0.8938401048492791]

[n_estimator: 50, max_depth: 25, accuracy: 0.8748361730013107]
[n_estimator: 50, max_depth: 50, accuracy: 0.8964613368283093]
[n_estimator: 50, max_depth: 100, accuracy: 0.8964613368283093]
[n_estimator: 50, max_depth: 250, accuracy: 0.8964613368283093]
[n_estimator: 75, max_depth: 5, accuracy: 0.6428571428571429]
[n_estimator: 75, max_depth: 10, accuracy: 0.7844036697247706]
[n_estimator: 75, max_depth: 25, accuracy: 0.8807339449541285]
[n_estimator: 75, max_depth: 50, accuracy: 0.898427260812582]
[n_estimator: 75, max_depth: 100, accuracy: 0.9017038007863696]
[n_estimator: 75, max_depth: 250, accuracy: 0.9017038007863696]
[n_estimator: 100, max_depth: 5, accuracy: 0.6317169069462647]
[n_estimator: 100, max_depth: 10, accuracy: 0.7850589777195282]
[n_estimator: 100, max_depth: 25, accuracy: 0.8768020969855832]
[n_estimator: 100, max_depth: 50, accuracy: 0.8977719528178244]
[n_estimator: 100, max_depth: 100, accuracy: 0.9003931847968545]
[n_estimator: 100, max_depth: 250, accuracy: 0.9



[n_estimator: 5, max_depth: 5, accuracy: 0.6572739187418086]
[n_estimator: 5, max_depth: 10, accuracy: 0.7300131061598951]
[n_estimator: 5, max_depth: 25, accuracy: 0.8446920052424639]
[n_estimator: 5, max_depth: 50, accuracy: 0.8623853211009175]
[n_estimator: 5, max_depth: 100, accuracy: 0.8669724770642202]
[n_estimator: 5, max_depth: 250, accuracy: 0.8669724770642202]
[n_estimator: 25, max_depth: 5, accuracy: 0.6644823066841415]
[n_estimator: 25, max_depth: 10, accuracy: 0.7621231979030144]
[n_estimator: 25, max_depth: 25, accuracy: 0.86435124508519]
[n_estimator: 25, max_depth: 50, accuracy: 0.8872870249017037]
[n_estimator: 25, max_depth: 100, accuracy: 0.8918741808650066]
[n_estimator: 25, max_depth: 250, accuracy: 0.8918741808650066]
[n_estimator: 50, max_depth: 5, accuracy: 0.6553079947575361]
[n_estimator: 50, max_depth: 10, accuracy: 0.7680209698558322]
[n_estimator: 50, max_depth: 25, accuracy: 0.8741808650065531]
[n_estimator: 50, max_depth: 50, accuracy: 0.8944954128440367]

[n_estimator: 50, max_depth: 25, accuracy: 0.8794233289646134]
[n_estimator: 50, max_depth: 50, accuracy: 0.8964613368283093]
[n_estimator: 50, max_depth: 100, accuracy: 0.8892529488859764]
[n_estimator: 50, max_depth: 250, accuracy: 0.8892529488859764]
[n_estimator: 75, max_depth: 5, accuracy: 0.6546526867627785]
[n_estimator: 75, max_depth: 10, accuracy: 0.7883355176933159]
[n_estimator: 75, max_depth: 25, accuracy: 0.8748361730013107]
[n_estimator: 75, max_depth: 50, accuracy: 0.8958060288335518]
[n_estimator: 75, max_depth: 100, accuracy: 0.8938401048492791]
[n_estimator: 75, max_depth: 250, accuracy: 0.8938401048492791]
[n_estimator: 100, max_depth: 5, accuracy: 0.6376146788990825]
[n_estimator: 100, max_depth: 10, accuracy: 0.7863695937090432]
[n_estimator: 100, max_depth: 25, accuracy: 0.8735255570117956]
[n_estimator: 100, max_depth: 50, accuracy: 0.8958060288335518]
[n_estimator: 100, max_depth: 100, accuracy: 0.8977719528178244]
[n_estimator: 100, max_depth: 250, accuracy: 0.



[n_estimator: 5, max_depth: 5, accuracy: 0.6782437745740498]
[n_estimator: 5, max_depth: 10, accuracy: 0.77129750982962]
[n_estimator: 5, max_depth: 25, accuracy: 0.8532110091743119]
[n_estimator: 5, max_depth: 50, accuracy: 0.8623853211009175]
[n_estimator: 5, max_depth: 100, accuracy: 0.8571428571428571]
[n_estimator: 5, max_depth: 250, accuracy: 0.8571428571428571]
[n_estimator: 25, max_depth: 5, accuracy: 0.6847968545216252]
[n_estimator: 25, max_depth: 10, accuracy: 0.7863695937090432]
[n_estimator: 25, max_depth: 25, accuracy: 0.872870249017038]
[n_estimator: 25, max_depth: 50, accuracy: 0.8879423328964613]
[n_estimator: 25, max_depth: 100, accuracy: 0.8846657929226737]
[n_estimator: 25, max_depth: 250, accuracy: 0.8846657929226737]
[n_estimator: 50, max_depth: 5, accuracy: 0.6598951507208388]
[n_estimator: 50, max_depth: 10, accuracy: 0.7686762778505898]
[n_estimator: 50, max_depth: 25, accuracy: 0.8715596330275229]
[n_estimator: 50, max_depth: 50, accuracy: 0.8925294888597641]


[n_estimator: 50, max_depth: 25, accuracy: 0.8840104849279161]
[n_estimator: 50, max_depth: 50, accuracy: 0.8944954128440367]
[n_estimator: 50, max_depth: 100, accuracy: 0.8944954128440367]
[n_estimator: 50, max_depth: 250, accuracy: 0.8944954128440367]
[n_estimator: 75, max_depth: 5, accuracy: 0.6520314547837484]
[n_estimator: 75, max_depth: 10, accuracy: 0.7857142857142857]
[n_estimator: 75, max_depth: 25, accuracy: 0.8813892529488859]
[n_estimator: 75, max_depth: 50, accuracy: 0.8977719528178244]
[n_estimator: 75, max_depth: 100, accuracy: 0.8977719528178244]
[n_estimator: 75, max_depth: 250, accuracy: 0.8977719528178244]
[n_estimator: 100, max_depth: 5, accuracy: 0.6480996068152032]
[n_estimator: 100, max_depth: 10, accuracy: 0.7889908256880734]
[n_estimator: 100, max_depth: 25, accuracy: 0.8813892529488859]
[n_estimator: 100, max_depth: 50, accuracy: 0.898427260812582]
[n_estimator: 100, max_depth: 100, accuracy: 0.898427260812582]
[n_estimator: 100, max_depth: 250, accuracy: 0.89



[n_estimator: 5, max_depth: 5, accuracy: 0.6703800786369594]
[n_estimator: 5, max_depth: 10, accuracy: 0.7693315858453473]
[n_estimator: 5, max_depth: 25, accuracy: 0.8525557011795544]
[n_estimator: 5, max_depth: 50, accuracy: 0.8656618610747051]
[n_estimator: 5, max_depth: 100, accuracy: 0.8754914809960681]
[n_estimator: 5, max_depth: 250, accuracy: 0.8754914809960681]
[n_estimator: 25, max_depth: 5, accuracy: 0.6657929226736566]
[n_estimator: 25, max_depth: 10, accuracy: 0.755570117955439]
[n_estimator: 25, max_depth: 25, accuracy: 0.8722149410222805]
[n_estimator: 25, max_depth: 50, accuracy: 0.8853211009174312]
[n_estimator: 25, max_depth: 100, accuracy: 0.8853211009174312]
[n_estimator: 25, max_depth: 250, accuracy: 0.8853211009174312]
[n_estimator: 50, max_depth: 5, accuracy: 0.6559633027522935]
[n_estimator: 50, max_depth: 10, accuracy: 0.7680209698558322]
[n_estimator: 50, max_depth: 25, accuracy: 0.8735255570117956]
[n_estimator: 50, max_depth: 50, accuracy: 0.8885976408912188

[n_estimator: 50, max_depth: 25, accuracy: 0.8794233289646134]
[n_estimator: 50, max_depth: 50, accuracy: 0.8997378768020969]
[n_estimator: 50, max_depth: 100, accuracy: 0.898427260812582]
[n_estimator: 50, max_depth: 250, accuracy: 0.898427260812582]
[n_estimator: 75, max_depth: 5, accuracy: 0.6467889908256881]
[n_estimator: 75, max_depth: 10, accuracy: 0.7850589777195282]
[n_estimator: 75, max_depth: 25, accuracy: 0.8813892529488859]
[n_estimator: 75, max_depth: 50, accuracy: 0.898427260812582]
[n_estimator: 75, max_depth: 100, accuracy: 0.8990825688073395]
[n_estimator: 75, max_depth: 250, accuracy: 0.8990825688073395]
[n_estimator: 100, max_depth: 5, accuracy: 0.6480996068152032]
[n_estimator: 100, max_depth: 10, accuracy: 0.7824377457404981]
[n_estimator: 100, max_depth: 25, accuracy: 0.8820445609436435]
[n_estimator: 100, max_depth: 50, accuracy: 0.8964613368283093]
[n_estimator: 100, max_depth: 100, accuracy: 0.9003931847968545]
[n_estimator: 100, max_depth: 250, accuracy: 0.900



[n_estimator: 5, max_depth: 5, accuracy: 0.6572739187418086]
[n_estimator: 5, max_depth: 10, accuracy: 0.7477064220183486]
[n_estimator: 5, max_depth: 25, accuracy: 0.8335517693315858]
[n_estimator: 5, max_depth: 50, accuracy: 0.8492791612057667]
[n_estimator: 5, max_depth: 100, accuracy: 0.8650065530799476]
[n_estimator: 5, max_depth: 250, accuracy: 0.8650065530799476]
[n_estimator: 25, max_depth: 5, accuracy: 0.6487549148099607]
[n_estimator: 25, max_depth: 10, accuracy: 0.7608125819134993]
[n_estimator: 25, max_depth: 25, accuracy: 0.8669724770642202]
[n_estimator: 25, max_depth: 50, accuracy: 0.8800786369593709]
[n_estimator: 25, max_depth: 100, accuracy: 0.8879423328964613]
[n_estimator: 25, max_depth: 250, accuracy: 0.8879423328964613]
[n_estimator: 50, max_depth: 5, accuracy: 0.6513761467889908]
[n_estimator: 50, max_depth: 10, accuracy: 0.7693315858453473]
[n_estimator: 50, max_depth: 25, accuracy: 0.8761467889908257]
[n_estimator: 50, max_depth: 50, accuracy: 0.887287024901703

[n_estimator: 50, max_depth: 25, accuracy: 0.8781127129750983]
[n_estimator: 50, max_depth: 50, accuracy: 0.8958060288335518]
[n_estimator: 50, max_depth: 100, accuracy: 0.9003931847968545]
[n_estimator: 50, max_depth: 250, accuracy: 0.9003931847968545]
[n_estimator: 75, max_depth: 5, accuracy: 0.6605504587155964]
[n_estimator: 75, max_depth: 10, accuracy: 0.783748361730013]
[n_estimator: 75, max_depth: 25, accuracy: 0.8761467889908257]
[n_estimator: 75, max_depth: 50, accuracy: 0.8990825688073395]
[n_estimator: 75, max_depth: 100, accuracy: 0.8990825688073395]
[n_estimator: 75, max_depth: 250, accuracy: 0.8990825688073395]
[n_estimator: 100, max_depth: 5, accuracy: 0.6618610747051114]
[n_estimator: 100, max_depth: 10, accuracy: 0.7857142857142857]
[n_estimator: 100, max_depth: 25, accuracy: 0.8768020969855832]
[n_estimator: 100, max_depth: 50, accuracy: 0.8990825688073395]
[n_estimator: 100, max_depth: 100, accuracy: 0.8997378768020969]
[n_estimator: 100, max_depth: 250, accuracy: 0.8

In [4]:
# Collect one single-column frame per subset proportion, indexed by
# (feature type, metric), ready to be concatenated side by side.
subset_proportion_results_list = []

for i in range(len(subset_proportion_list)):
    column_label = f"Subset Proportion: {subset_proportion_list[i]}"

    # Transpose so classes become rows, then stack the metric columns into a
    # second index level, leaving a single value column named after the subset.
    results_df = (
        final_results[i][0]
        .T
        .stack()
        .to_frame(name=column_label)
        .rename_axis(index=['Feature Type', 'Metric'])
    )

    subset_proportion_results_list.append(results_df)

In [5]:
# Align the per-proportion columns on their shared (Feature Type, Metric) index
# and display the combined benchmark table.
benchmark_df = pd.concat(subset_proportion_results_list, axis='columns')
benchmark_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Subset Proportion: 0.1,Subset Proportion: 0.2,Subset Proportion: 0.3,Subset Proportion: 0.4,Subset Proportion: 0.5,Subset Proportion: 0.6,Subset Proportion: 0.7,Subset Proportion: 0.8,Subset Proportion: 0.9
Feature Type,Metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
numeric,accuracy,0.965336,0.966736,0.965696,0.966736,0.962058,0.963098,0.965696,0.965696,0.966736
numeric,precision,0.932524,0.932705,0.927904,0.925828,0.919312,0.924,0.926764,0.927904,0.930388
numeric,recall,0.977369,0.980198,0.983027,0.988685,0.983027,0.980198,0.984441,0.983027,0.983027
numeric,f1-score,0.95442,0.955862,0.95467,0.956224,0.950103,0.95127,0.954733,0.95467,0.955983
categorical,accuracy,0.930672,0.93815,0.93815,0.943347,0.941268,0.939709,0.941788,0.943867,0.945426
categorical,precision,0.830612,0.841785,0.844581,0.865263,0.859539,0.854167,0.858333,0.8625,0.863354
categorical,recall,0.892544,0.910088,0.905702,0.901316,0.899123,0.899123,0.903509,0.907895,0.914474
categorical,f1-score,0.860465,0.874605,0.874074,0.882922,0.878885,0.876068,0.880342,0.884615,0.888179
datetime,accuracy,0.996849,0.994802,0.996362,0.995842,0.996881,0.996881,0.995322,0.995322,0.995842
datetime,precision,0.977099,0.964539,0.985507,0.965035,0.985612,0.985612,0.971429,0.978261,0.978417


In [6]:
# Export the benchmark table as LaTeX (written relative to the working directory).
benchmark_df.to_latex(buf='Results/DescriptiveStatisticSubsetModel.tex')

In [7]:
# Export the same benchmark table as CSV alongside the LaTeX version.
benchmark_df.to_csv(path_or_buf='Results/DescriptiveStatisticSubsetModel.csv')

In [8]:
# Overall accuracy per subset proportion: correctly classified samples (the
# confusion-matrix diagonal) divided by all samples.
for i, _ in enumerate(subset_proportion_list):
    cmat_df = final_results[i][1]
    correct = np.trace(cmat_df)
    total = np.sum(cmat_df)
    accuracy = correct / total
    print(accuracy)

0.8860294117647058
0.8924116424116424
0.8913721413721414
0.8986486486486487
0.893970893970894
0.8929313929313929
0.9002079002079002
0.8996881496881497
0.9022869022869023


In [12]:
# Confusion matrix of the first benchmark run (index 0 of final_results),
# relabelled from integer class codes to feature-type names on both axes.
label_dict = {code: label for label, code in dict_label_true.items()}
cmat_10 = pd.DataFrame(final_results[0][1])
cmat_df = cmat_10.rename(index=label_dict, columns=label_dict)
cmat_df.to_latex("Results/10_percent_cmat.tex")
cmat_df

Unnamed: 0,numeric,categorical,datetime,sentence,url,embedded-number,list,not-generalizable,context-specific
numeric,691,2,0,0,0,0,0,6,8
categorical,15,407,0,5,0,4,0,18,7
datetime,0,1,128,0,0,2,0,0,0
sentence,0,8,0,74,0,0,0,2,2
url,0,1,1,0,26,0,1,3,0
embedded-number,0,9,1,0,0,70,0,0,1
list,0,4,0,1,0,3,5,1,4
not-generalizable,5,41,1,1,0,0,0,156,4
context-specific,30,17,0,2,0,0,0,6,130


In [13]:
# Confusion matrix of the fifth benchmark run (index 4 of final_results),
# relabelled from integer class codes to feature-type names on both axes.
label_dict = {code: label for label, code in dict_label_true.items()}
cmat_50 = pd.DataFrame(final_results[4][1])
cmat_df = cmat_50.rename(index=label_dict, columns=label_dict)
cmat_df.to_latex("Results/50_percent_cmat.tex")
cmat_df

Unnamed: 0,numeric,categorical,datetime,sentence,url,embedded-number,list,not-generalizable,context-specific
numeric,695,3,1,0,0,0,0,2,6
categorical,18,410,0,6,0,1,0,16,5
datetime,1,2,137,0,0,0,0,0,1
sentence,0,5,0,78,0,1,0,4,2
url,0,0,0,0,30,0,1,1,0
embedded-number,0,8,1,0,0,71,0,0,2
list,0,4,0,3,0,4,4,1,3
not-generalizable,4,31,0,1,0,1,0,170,5
context-specific,38,14,0,2,0,0,0,6,125


In [15]:
# Confusion matrix of the ninth benchmark run (index 8 of final_results),
# relabelled from integer class codes to feature-type names on both axes.
label_dict = {code: label for label, code in dict_label_true.items()}
cmat_90 = pd.DataFrame(final_results[8][1])
cmat_df = cmat_90.rename(index=label_dict, columns=label_dict)
cmat_df.to_latex("Results/90_percent_cmat.tex")
cmat_df

Unnamed: 0,numeric,categorical,datetime,sentence,url,embedded-number,list,not-generalizable,context-specific
numeric,695,2,0,0,0,0,0,2,8
categorical,15,417,0,3,0,0,0,15,6
datetime,1,3,136,0,0,1,0,0,0
sentence,0,5,0,78,0,0,0,5,2
url,0,1,0,0,30,0,0,1,0
embedded-number,0,7,1,0,0,72,0,0,2
list,0,3,0,2,0,4,6,1,3
not-generalizable,4,32,2,1,0,1,0,168,4
context-specific,32,13,0,2,0,0,0,4,134
