In [1]:
#Copyright 2020 Vraj Shah, Arun Kumar
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.

import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import numpy as np

from Load_Predictions import *
from downstream_models import *
from Featurize import *

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Andrew\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Read the labeled test split and the metadata table from disk.
testdf = pd.read_csv('Benchmark-Labeled-Data/data_test.csv')
test_metadata = pd.read_csv('../RawCSV/Metadata/meta_data.csv')

# Row counts of both inputs, printed before they are joined.
print(testdf.shape[0], test_metadata.shape[0])

# Default (inner) join on the shared Record_id column.
test_merged = testdf.merge(test_metadata, on='Record_id')

# Ground-truth labels as a plain Python list, in merged-row order.
y_true = test_merged['y_act'].tolist()

1985 4233


In [3]:
# Show the merged frame (labels joined with metadata) as this cell's output.
test_merged

Unnamed: 0,Record_id,Attribute_name,y_act,total_vals,num_nans,%_nans,num_of_dist_val,%_dist_val,mean,std_dev,...,mean_char_count,stdev_char_count,mean_whitespace_count,stdev_whitespace_count,mean_delim_count,stdev_delim_count,is_list,is_long_sentence,name,link
0,285,Net Income-Cont. Operations,numeric,1781,0,0.000000,1693,95.058956,1.748059e+09,4.333643e+09,...,10.2,0.748331,0.0,0.000000,0.0,0.000000,False,False,fundamentals.csv,https://www.kaggle.com/dgawlik/nyse/data
1,285,Earnings Before Interest and Tax,numeric,1781,0,0.000000,1717,96.406513,2.710102e+09,6.143620e+09,...,10.2,0.748331,0.0,0.000000,0.0,0.000000,False,False,fundamentals.csv,https://www.kaggle.com/dgawlik/nyse/data
2,285,After Tax ROE,numeric,1781,0,0.000000,154,8.646828,4.360135e+01,2.338583e+02,...,2.4,0.489898,0.0,0.000000,0.0,0.000000,False,False,fundamentals.csv,https://www.kaggle.com/dgawlik/nyse/data
3,285,For Year,categorical,1781,173,9.713644,6,0.336889,2.013305e+03,1.994693e+01,...,4.0,0.000000,0.0,0.000000,0.0,0.000000,False,False,fundamentals.csv,https://www.kaggle.com/dgawlik/nyse/data
4,285,Equity Earnings/Loss Unconsolidated Subsidiary,numeric,1781,0,0.000000,467,26.221224,9.134297e+07,7.242807e+08,...,7.4,3.200000,0.0,0.000000,0.0,0.000000,False,False,fundamentals.csv,https://www.kaggle.com/dgawlik/nyse/data
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1980,328,item,categorical,227092,772,0.339950,49,0.021577,0.000000e+00,0.000000e+00,...,47.2,38.295692,6.0,5.403702,6.0,5.403702,False,False,official_country_data_table_4_1_total_economy_...,https://www.kaggle.com/unitednations/national-...
1981,343,load_weight,numeric,564403,59436,10.530773,5228,0.926288,1.157945e+04,7.774468e+03,...,4.0,0.632456,0.0,0.000000,0.0,0.000000,False,False,austin_waste_and_diversion.csv,https://www.kaggle.com/jboysen/austin-waste/data
1982,1970,SpecialNotes,sentence,232,93,40.086207,125,53.879310,0.000000e+00,0.000000e+00,...,314.4,339.556534,46.6,49.717603,46.6,49.717603,False,True,full_data.csv,https://www.kaggle.com/uddipta/world-bank-unem...
1983,2050,Website,url,331,133,40.181269,181,54.682779,0.000000e+00,0.000000e+00,...,17.2,7.909488,0.0,0.000000,0.0,0.000000,False,False,times-square-food-beverage-locations.csv,https://www.kaggle.com/new-york-city/ny-times-...


In [4]:
# Integer class code assigned to each ground-truth label string.
dict_label_true = {
    'numeric': 0,
    'categorical': 1,
    'datetime': 2,
    'sentence': 3,
    'url': 4,
    'embedded-number': 5,
    'list': 6,
    'not-generalizable': 7,
    'context-specific': 8
}

# Encode the labels straight from test_merged instead of from y_true itself.
# The previous form overwrote its own input, so running this cell a second
# time raised KeyError (the values were already integers). Reading from the
# merged frame yields the same list and keeps the cell re-runnable.
y_true = [dict_label_true[str(label)] for label in test_merged['y_act']]

In [5]:
# Column order used when each per-sample-size frame is written out.
table_names = [
    # identifying columns and the ground-truth label
    'Record_id', 'Attribute_name', 'y_act',
    # value / NaN / distinct-value counts and numeric summary
    'total_vals', 'num_nans', '%_nans',
    'num_of_dist_val', '%_dist_val',
    'mean', 'std_dev', 'min_val', 'max_val',
    # pattern flags
    'has_delimiters', 'has_url', 'has_email', 'has_date',
    # word / stopword / character / whitespace / delimiter statistics
    'mean_word_count', 'std_dev_word_count',
    'mean_stopword_total', 'stdev_stopword_total',
    'mean_char_count', 'stdev_char_count',
    'mean_whitespace_count', 'stdev_whitespace_count',
    'mean_delim_count', 'stdev_delim_count',
    'is_list', 'is_long_sentence',
    # sample_1 .. sample_5
    *(f"sample_{position}" for position in range(1, 6)),
]

In [6]:
# For every sampling fraction, re-featurize each labeled column from a random
# subset of its raw values and write one CSV per fraction.
sample_size_list = [0.9,0.8,0.7,0.6,0.5,0.4,0.3,0.2,0.1]
rand_seed = 99

results = []

for sample_percent in sample_size_list:

    # One-row frames are collected here and concatenated once after the inner
    # loop. Growing a DataFrame with DataFrame.append copied the whole frame
    # on every row, and that method no longer exists in pandas 2.x.
    labeled_rows = []

    prv_csv_name, csv_name = '', ''

    # Index labels of test_merged rows that could not be featurized at this
    # sampling fraction (kept so failures can be inspected afterwards).
    exception_indices = []
    error_count = 0

    for index, row in test_merged.iterrows():

        # Coarse progress indicator.
        if index%50==0:
            print(index)

        col = str(row['Attribute_name'])
        prv_csv_name = csv_name
        csv_name = '../RawCSV/RawCSVFiles/' + row['name']

        # Consecutive rows that point at the same raw file reuse the frame
        # that is already loaded.
        if prv_csv_name != csv_name:
            df = pd.read_csv(csv_name,encoding='latin1')

        try:
            column_df = df[[col]]
            total_vals = column_df.shape[0]
            sample_size = int(total_vals*sample_percent)

            df_sample = column_df.sample(n=sample_size, random_state=rand_seed)

            # Featurization helpers come from the notebook's star imports.
            dataFeaturized = FeaturizeFile(df_sample)
            dataFeaturized1 = ProcessStats(dataFeaturized)
            dataFeaturized2 = FeatureExtraction(dataFeaturized,dataFeaturized1,0)
            dataFeaturized2 = dataFeaturized2.fillna(0)

            dataFeaturized2.insert(0, 'y_act', row['y_act'])
            dataFeaturized2.insert(0, 'Attribute_name', row['Attribute_name'])
            dataFeaturized2.insert(0, 'Record_id', row['Record_id'])
            # Keep the first 28 columns (the three inserted above plus the
            # leading feature columns). Copy so the inserts below operate on
            # an independent frame rather than on a slice of dataFeaturized2.
            labeled_row = dataFeaturized2.iloc[:, :28].copy()

            # NOTE(review): the five example values are drawn from the full
            # column, not from df_sample - confirm this is intended.
            sample_value_df = column_df.sample(5, random_state=rand_seed)
            sample_values = sample_value_df[col].values
            for sample_value in range(len(sample_values)):
                col_name = f"sample_{sample_value+1}"
                # Position 12 is provisional; the final column order is set by
                # table_names after the loop.
                labeled_row.insert(12, col_name, sample_values[sample_value])

            labeled_rows.append(labeled_row)

        except Exception as e:
            # Best-effort: skip columns that cannot be processed, but say which
            # one failed and why instead of printing a bare 'Error'.
            print(f"Error at row {index} (file {csv_name!r}, column {col!r}): "
                  f"{type(e).__name__}: {e}")
            exception_indices.append(index)
            error_count += 1

    # Build the frame in one pass; fall back to an empty frame with the
    # expected columns when every row failed, so the selection below still works.
    if labeled_rows:
        df_data = pd.concat(labeled_rows, sort=False)
    else:
        df_data = pd.DataFrame(columns=table_names)

    df_data = df_data[table_names]
    print(df_data.columns)
    print(error_count)
    df_data.to_csv(f"DescriptiveStatisticSubset-Data/{sample_percent}_sample_data_test.csv", index=False)


0


  interactivity=interactivity, compiler=compiler, result=result)


50


  interactivity=interactivity, compiler=compiler, result=result)


100


  interactivity=interactivity, compiler=compiler, result=result)


150
200
250
300
350
400
450
500
550
600
650


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


700


  interactivity=interactivity, compiler=compiler, result=result)


750


  interactivity=interactivity, compiler=compiler, result=result)


800


  interactivity=interactivity, compiler=compiler, result=result)


850


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


900


  interactivity=interactivity, compiler=compiler, result=result)


950
1000
Error
1050


  interactivity=interactivity, compiler=compiler, result=result)


Error


  interactivity=interactivity, compiler=compiler, result=result)


1100
1150
1200


  interactivity=interactivity, compiler=compiler, result=result)


1250


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


1300


  interactivity=interactivity, compiler=compiler, result=result)


Error
1350
Error
Error


  interactivity=interactivity, compiler=compiler, result=result)


1400


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


1450
Error


  interactivity=interactivity, compiler=compiler, result=result)


1500
1550


  interactivity=interactivity, compiler=compiler, result=result)


1600
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
1650


  interactivity=interactivity, compiler=compiler, result=result)


1700
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
1750


  interactivity=interactivity, compiler=compiler, result=result)


1800
Error
Error


  interactivity=interactivity, compiler=compiler, result=result)


1850


  interactivity=interactivity, compiler=compiler, result=result)


1900
Error
1950


  interactivity=interactivity, compiler=compiler, result=result)


Error
Index(['Record_id', 'Attribute_name', 'y_act', 'total_vals', 'num_nans',
       '%_nans', 'num_of_dist_val', '%_dist_val', 'mean', 'std_dev', 'min_val',
       'max_val', 'has_delimiters', 'has_url', 'has_email', 'has_date',
       'mean_word_count', 'std_dev_word_count', 'mean_stopword_total',
       'stdev_stopword_total', 'mean_char_count', 'stdev_char_count',
       'mean_whitespace_count', 'stdev_whitespace_count', 'mean_delim_count',
       'stdev_delim_count', 'is_list', 'is_long_sentence', 'sample_1',
       'sample_2', 'sample_3', 'sample_4', 'sample_5'],
      dtype='object')
61
0


  interactivity=interactivity, compiler=compiler, result=result)


50


  interactivity=interactivity, compiler=compiler, result=result)


100


  interactivity=interactivity, compiler=compiler, result=result)


150
200
250
300
350
400
450
500
550
600
650


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


700


  interactivity=interactivity, compiler=compiler, result=result)


750


  interactivity=interactivity, compiler=compiler, result=result)


800


  interactivity=interactivity, compiler=compiler, result=result)


850


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


900


  interactivity=interactivity, compiler=compiler, result=result)


950
1000
Error
1050


  interactivity=interactivity, compiler=compiler, result=result)


Error


  interactivity=interactivity, compiler=compiler, result=result)


1100
1150
1200


  interactivity=interactivity, compiler=compiler, result=result)


1250


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


1300


  interactivity=interactivity, compiler=compiler, result=result)


Error
1350
Error
Error


  interactivity=interactivity, compiler=compiler, result=result)


1400


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


1450
Error


  interactivity=interactivity, compiler=compiler, result=result)


1500
1550


  interactivity=interactivity, compiler=compiler, result=result)


1600
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
1650


  interactivity=interactivity, compiler=compiler, result=result)


1700
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
1750


  interactivity=interactivity, compiler=compiler, result=result)


1800
Error
Error


  interactivity=interactivity, compiler=compiler, result=result)


1850


  interactivity=interactivity, compiler=compiler, result=result)


1900
Error
1950


  interactivity=interactivity, compiler=compiler, result=result)


Error
Index(['Record_id', 'Attribute_name', 'y_act', 'total_vals', 'num_nans',
       '%_nans', 'num_of_dist_val', '%_dist_val', 'mean', 'std_dev', 'min_val',
       'max_val', 'has_delimiters', 'has_url', 'has_email', 'has_date',
       'mean_word_count', 'std_dev_word_count', 'mean_stopword_total',
       'stdev_stopword_total', 'mean_char_count', 'stdev_char_count',
       'mean_whitespace_count', 'stdev_whitespace_count', 'mean_delim_count',
       'stdev_delim_count', 'is_list', 'is_long_sentence', 'sample_1',
       'sample_2', 'sample_3', 'sample_4', 'sample_5'],
      dtype='object')
61
0


  interactivity=interactivity, compiler=compiler, result=result)


50


  interactivity=interactivity, compiler=compiler, result=result)


100


  interactivity=interactivity, compiler=compiler, result=result)


150
200
250
300
350
400
450
500
550
600
650


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


700


  interactivity=interactivity, compiler=compiler, result=result)


750


  interactivity=interactivity, compiler=compiler, result=result)


800


  interactivity=interactivity, compiler=compiler, result=result)


850


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


900


  interactivity=interactivity, compiler=compiler, result=result)


950
1000
Error
1050


  interactivity=interactivity, compiler=compiler, result=result)


Error


  interactivity=interactivity, compiler=compiler, result=result)


1100
1150
1200


  interactivity=interactivity, compiler=compiler, result=result)


1250


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


1300


  interactivity=interactivity, compiler=compiler, result=result)


Error
1350
Error
Error


  interactivity=interactivity, compiler=compiler, result=result)


1400


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


1450
Error


  interactivity=interactivity, compiler=compiler, result=result)


1500
1550


  interactivity=interactivity, compiler=compiler, result=result)


1600
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
1650


  interactivity=interactivity, compiler=compiler, result=result)


1700
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
1750


  interactivity=interactivity, compiler=compiler, result=result)


1800
Error
Error


  interactivity=interactivity, compiler=compiler, result=result)


1850


  interactivity=interactivity, compiler=compiler, result=result)


1900
Error
1950


  interactivity=interactivity, compiler=compiler, result=result)


Error
Index(['Record_id', 'Attribute_name', 'y_act', 'total_vals', 'num_nans',
       '%_nans', 'num_of_dist_val', '%_dist_val', 'mean', 'std_dev', 'min_val',
       'max_val', 'has_delimiters', 'has_url', 'has_email', 'has_date',
       'mean_word_count', 'std_dev_word_count', 'mean_stopword_total',
       'stdev_stopword_total', 'mean_char_count', 'stdev_char_count',
       'mean_whitespace_count', 'stdev_whitespace_count', 'mean_delim_count',
       'stdev_delim_count', 'is_list', 'is_long_sentence', 'sample_1',
       'sample_2', 'sample_3', 'sample_4', 'sample_5'],
      dtype='object')
61
0


  interactivity=interactivity, compiler=compiler, result=result)


50


  interactivity=interactivity, compiler=compiler, result=result)


100


  interactivity=interactivity, compiler=compiler, result=result)


150
200
250
300
350
400
450
500
550
600
650


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


700


  interactivity=interactivity, compiler=compiler, result=result)


750


  interactivity=interactivity, compiler=compiler, result=result)


800


  interactivity=interactivity, compiler=compiler, result=result)


850


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


900


  interactivity=interactivity, compiler=compiler, result=result)


950
1000
Error
1050


  interactivity=interactivity, compiler=compiler, result=result)


Error


  interactivity=interactivity, compiler=compiler, result=result)


1100
1150
1200


  interactivity=interactivity, compiler=compiler, result=result)


1250


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


1300


  interactivity=interactivity, compiler=compiler, result=result)


Error
1350
Error
Error


  interactivity=interactivity, compiler=compiler, result=result)


1400


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


1450
Error


  interactivity=interactivity, compiler=compiler, result=result)


1500
1550


  interactivity=interactivity, compiler=compiler, result=result)


1600
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
1650


  interactivity=interactivity, compiler=compiler, result=result)


1700
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
1750


  interactivity=interactivity, compiler=compiler, result=result)


1800
Error
Error


  interactivity=interactivity, compiler=compiler, result=result)


1850


  interactivity=interactivity, compiler=compiler, result=result)


1900
Error
1950


  interactivity=interactivity, compiler=compiler, result=result)


Error
Index(['Record_id', 'Attribute_name', 'y_act', 'total_vals', 'num_nans',
       '%_nans', 'num_of_dist_val', '%_dist_val', 'mean', 'std_dev', 'min_val',
       'max_val', 'has_delimiters', 'has_url', 'has_email', 'has_date',
       'mean_word_count', 'std_dev_word_count', 'mean_stopword_total',
       'stdev_stopword_total', 'mean_char_count', 'stdev_char_count',
       'mean_whitespace_count', 'stdev_whitespace_count', 'mean_delim_count',
       'stdev_delim_count', 'is_list', 'is_long_sentence', 'sample_1',
       'sample_2', 'sample_3', 'sample_4', 'sample_5'],
      dtype='object')
61
0


  interactivity=interactivity, compiler=compiler, result=result)


50


  interactivity=interactivity, compiler=compiler, result=result)


100


  interactivity=interactivity, compiler=compiler, result=result)


150
200
250
300
350
400
450
500
550
600
650


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


700


  interactivity=interactivity, compiler=compiler, result=result)


750


  interactivity=interactivity, compiler=compiler, result=result)


800


  interactivity=interactivity, compiler=compiler, result=result)


850


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


900


  interactivity=interactivity, compiler=compiler, result=result)


950
1000
Error
1050


  interactivity=interactivity, compiler=compiler, result=result)


Error


  interactivity=interactivity, compiler=compiler, result=result)


1100
1150
1200


  interactivity=interactivity, compiler=compiler, result=result)


1250


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


1300


  interactivity=interactivity, compiler=compiler, result=result)


Error
1350
Error
Error


  interactivity=interactivity, compiler=compiler, result=result)


1400


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


1450
Error


  interactivity=interactivity, compiler=compiler, result=result)


1500
1550


  interactivity=interactivity, compiler=compiler, result=result)


1600
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
1650


  interactivity=interactivity, compiler=compiler, result=result)


1700
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
1750


  interactivity=interactivity, compiler=compiler, result=result)


1800
Error
Error


  interactivity=interactivity, compiler=compiler, result=result)


1850


  interactivity=interactivity, compiler=compiler, result=result)


1900
Error
1950


  interactivity=interactivity, compiler=compiler, result=result)


Error
Index(['Record_id', 'Attribute_name', 'y_act', 'total_vals', 'num_nans',
       '%_nans', 'num_of_dist_val', '%_dist_val', 'mean', 'std_dev', 'min_val',
       'max_val', 'has_delimiters', 'has_url', 'has_email', 'has_date',
       'mean_word_count', 'std_dev_word_count', 'mean_stopword_total',
       'stdev_stopword_total', 'mean_char_count', 'stdev_char_count',
       'mean_whitespace_count', 'stdev_whitespace_count', 'mean_delim_count',
       'stdev_delim_count', 'is_list', 'is_long_sentence', 'sample_1',
       'sample_2', 'sample_3', 'sample_4', 'sample_5'],
      dtype='object')
61
0


  interactivity=interactivity, compiler=compiler, result=result)


50


  interactivity=interactivity, compiler=compiler, result=result)


100


  interactivity=interactivity, compiler=compiler, result=result)


150
200
250
300
350
400
450
500
550
600
650


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


700


  interactivity=interactivity, compiler=compiler, result=result)


750


  interactivity=interactivity, compiler=compiler, result=result)


800


  interactivity=interactivity, compiler=compiler, result=result)


850


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


900


  interactivity=interactivity, compiler=compiler, result=result)


950
1000
Error
1050


  interactivity=interactivity, compiler=compiler, result=result)


Error


  interactivity=interactivity, compiler=compiler, result=result)


1100
1150
1200


  interactivity=interactivity, compiler=compiler, result=result)


1250


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


1300


  interactivity=interactivity, compiler=compiler, result=result)


Error
1350
Error
Error


  interactivity=interactivity, compiler=compiler, result=result)


1400


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


1450
Error


  interactivity=interactivity, compiler=compiler, result=result)


1500
1550


  interactivity=interactivity, compiler=compiler, result=result)


1600
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
1650


  interactivity=interactivity, compiler=compiler, result=result)


1700
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
1750


  interactivity=interactivity, compiler=compiler, result=result)


1800
Error
Error


  interactivity=interactivity, compiler=compiler, result=result)


1850


  interactivity=interactivity, compiler=compiler, result=result)


1900
Error
1950


  interactivity=interactivity, compiler=compiler, result=result)


Error
Index(['Record_id', 'Attribute_name', 'y_act', 'total_vals', 'num_nans',
       '%_nans', 'num_of_dist_val', '%_dist_val', 'mean', 'std_dev', 'min_val',
       'max_val', 'has_delimiters', 'has_url', 'has_email', 'has_date',
       'mean_word_count', 'std_dev_word_count', 'mean_stopword_total',
       'stdev_stopword_total', 'mean_char_count', 'stdev_char_count',
       'mean_whitespace_count', 'stdev_whitespace_count', 'mean_delim_count',
       'stdev_delim_count', 'is_list', 'is_long_sentence', 'sample_1',
       'sample_2', 'sample_3', 'sample_4', 'sample_5'],
      dtype='object')
61
0


  interactivity=interactivity, compiler=compiler, result=result)


50


  interactivity=interactivity, compiler=compiler, result=result)


100


  interactivity=interactivity, compiler=compiler, result=result)


150
200
250
300
350
400
450
500
550
600
650


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


700


  interactivity=interactivity, compiler=compiler, result=result)


750


  interactivity=interactivity, compiler=compiler, result=result)


800


  interactivity=interactivity, compiler=compiler, result=result)


850


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


900


  interactivity=interactivity, compiler=compiler, result=result)


950
1000
Error
1050


  interactivity=interactivity, compiler=compiler, result=result)


Error


  interactivity=interactivity, compiler=compiler, result=result)


1100
1150
1200


  interactivity=interactivity, compiler=compiler, result=result)


1250


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


1300


  interactivity=interactivity, compiler=compiler, result=result)


Error
1350
Error
Error


  interactivity=interactivity, compiler=compiler, result=result)


1400


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


1450
Error


  interactivity=interactivity, compiler=compiler, result=result)


1500
1550


  interactivity=interactivity, compiler=compiler, result=result)


1600
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
1650


  interactivity=interactivity, compiler=compiler, result=result)


1700
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
1750


  interactivity=interactivity, compiler=compiler, result=result)


1800
Error
Error


  interactivity=interactivity, compiler=compiler, result=result)


1850


  interactivity=interactivity, compiler=compiler, result=result)


1900
Error
1950


  interactivity=interactivity, compiler=compiler, result=result)


Error
Index(['Record_id', 'Attribute_name', 'y_act', 'total_vals', 'num_nans',
       '%_nans', 'num_of_dist_val', '%_dist_val', 'mean', 'std_dev', 'min_val',
       'max_val', 'has_delimiters', 'has_url', 'has_email', 'has_date',
       'mean_word_count', 'std_dev_word_count', 'mean_stopword_total',
       'stdev_stopword_total', 'mean_char_count', 'stdev_char_count',
       'mean_whitespace_count', 'stdev_whitespace_count', 'mean_delim_count',
       'stdev_delim_count', 'is_list', 'is_long_sentence', 'sample_1',
       'sample_2', 'sample_3', 'sample_4', 'sample_5'],
      dtype='object')
61
0


  interactivity=interactivity, compiler=compiler, result=result)


50


  interactivity=interactivity, compiler=compiler, result=result)


100


  interactivity=interactivity, compiler=compiler, result=result)


150
200
250
300
350
400
450
500
550
600
650


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


700


  interactivity=interactivity, compiler=compiler, result=result)


750


  interactivity=interactivity, compiler=compiler, result=result)


800


  interactivity=interactivity, compiler=compiler, result=result)


850


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


900


  interactivity=interactivity, compiler=compiler, result=result)


950
1000
Error
1050


  interactivity=interactivity, compiler=compiler, result=result)


Error


  interactivity=interactivity, compiler=compiler, result=result)


1100
1150
1200


  interactivity=interactivity, compiler=compiler, result=result)


1250


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


1300


  interactivity=interactivity, compiler=compiler, result=result)


Error
1350
Error
Error


  interactivity=interactivity, compiler=compiler, result=result)


1400


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


1450
Error


  interactivity=interactivity, compiler=compiler, result=result)


1500
1550


  interactivity=interactivity, compiler=compiler, result=result)


1600
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
1650


  interactivity=interactivity, compiler=compiler, result=result)


1700
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
1750


  interactivity=interactivity, compiler=compiler, result=result)


1800
Error
Error


  interactivity=interactivity, compiler=compiler, result=result)


1850


  interactivity=interactivity, compiler=compiler, result=result)


1900
Error
1950


  interactivity=interactivity, compiler=compiler, result=result)


Error
Index(['Record_id', 'Attribute_name', 'y_act', 'total_vals', 'num_nans',
       '%_nans', 'num_of_dist_val', '%_dist_val', 'mean', 'std_dev', 'min_val',
       'max_val', 'has_delimiters', 'has_url', 'has_email', 'has_date',
       'mean_word_count', 'std_dev_word_count', 'mean_stopword_total',
       'stdev_stopword_total', 'mean_char_count', 'stdev_char_count',
       'mean_whitespace_count', 'stdev_whitespace_count', 'mean_delim_count',
       'stdev_delim_count', 'is_list', 'is_long_sentence', 'sample_1',
       'sample_2', 'sample_3', 'sample_4', 'sample_5'],
      dtype='object')
61
0


  interactivity=interactivity, compiler=compiler, result=result)


50


  interactivity=interactivity, compiler=compiler, result=result)


100


  interactivity=interactivity, compiler=compiler, result=result)


150
200
250
300
350
400
450
500
550
600
650


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


700


  interactivity=interactivity, compiler=compiler, result=result)


750


  interactivity=interactivity, compiler=compiler, result=result)


800


  interactivity=interactivity, compiler=compiler, result=result)


850


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


900


  interactivity=interactivity, compiler=compiler, result=result)


950
1000
Error
1050


  interactivity=interactivity, compiler=compiler, result=result)


Error


  interactivity=interactivity, compiler=compiler, result=result)


1100
1150
1200


  interactivity=interactivity, compiler=compiler, result=result)


1250


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


1300


  interactivity=interactivity, compiler=compiler, result=result)


Error
1350
Error
Error


  interactivity=interactivity, compiler=compiler, result=result)


1400


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


1450
Error


  interactivity=interactivity, compiler=compiler, result=result)


1500
1550
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error


  interactivity=interactivity, compiler=compiler, result=result)


1600
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
1650
Error


  interactivity=interactivity, compiler=compiler, result=result)


1700
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
1750


  interactivity=interactivity, compiler=compiler, result=result)


1800
Error
Error
Error
Error


  interactivity=interactivity, compiler=compiler, result=result)


1850
Error


  interactivity=interactivity, compiler=compiler, result=result)


1900
Error
Error
Error
1950


  interactivity=interactivity, compiler=compiler, result=result)


Error
Index(['Record_id', 'Attribute_name', 'y_act', 'total_vals', 'num_nans',
       '%_nans', 'num_of_dist_val', '%_dist_val', 'mean', 'std_dev', 'min_val',
       'max_val', 'has_delimiters', 'has_url', 'has_email', 'has_date',
       'mean_word_count', 'std_dev_word_count', 'mean_stopword_total',
       'stdev_stopword_total', 'mean_char_count', 'stdev_char_count',
       'mean_whitespace_count', 'stdev_whitespace_count', 'mean_delim_count',
       'stdev_delim_count', 'is_list', 'is_long_sentence', 'sample_1',
       'sample_2', 'sample_3', 'sample_4', 'sample_5'],
      dtype='object')
81


In [7]:
# Show df_data as left by the final iteration of the sampling loop
# (the last entry of sample_size_list).
df_data

Unnamed: 0,Record_id,Attribute_name,y_act,total_vals,num_nans,%_nans,num_of_dist_val,%_dist_val,mean,std_dev,...,stdev_whitespace_count,mean_delim_count,stdev_delim_count,is_list,is_long_sentence,sample_1,sample_2,sample_3,sample_4,sample_5
0,285,Net Income-Cont. Operations,numeric,178,0,0.000000,178,100.000000,1.790829e+09,3.216023e+09,...,0.000000,0.0,0.000000,False,False,-1.182e+09,1.41957e+09,3.81195e+08,2.6464e+08,2.417e+08
0,285,Earnings Before Interest and Tax,numeric,178,0,0.000000,178,100.000000,2.786374e+09,5.176141e+09,...,0.000000,0.0,0.000000,False,False,-9.118e+08,1.82393e+09,6.5243e+08,5.54224e+08,4.269e+08
0,285,After Tax ROE,numeric,178,0,0.000000,61,34.269663,2.823596e+01,4.733351e+01,...,0.000000,0.0,0.000000,False,False,60,36,14,13,14
0,285,For Year,categorical,178,20,11.235955,5,2.808989,2.013835e+03,1.118661e+00,...,0.000000,0.0,0.000000,False,False,2013,2012,2014,,2013
0,285,Equity Earnings/Loss Unconsolidated Subsidiary,numeric,178,0,0.000000,51,28.651685,5.644310e+07,3.449883e+08,...,0.000000,0.0,0.000000,False,False,0,0,-2.105e+06,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,327,sub_group,categorical,10004,533,5.327869,3,0.029988,0.000000e+00,0.000000e+00,...,3.072458,3.4,3.072458,False,False,Individual consumption expenditure of households,Individual consumption expenditure of households,Individual consumption expenditure of households,Individual consumption expenditure of non-prof...,Individual consumption expenditure of households
0,328,item,categorical,22709,88,0.387512,48,0.211370,0.000000e+00,0.000000e+00,...,1.788854,3.0,1.788854,False,False,NET LENDING (+) / NET BORROWING (-),Insurance technical reserves,Taxes on production and imports,NET DOMESTIC PRODUCT,Social contributions
0,343,load_weight,numeric,56440,5945,10.533310,2236,3.961729,1.153968e+04,7.429942e+03,...,0.000000,0.0,0.000000,False,False,10640,12900,24940,12920,12400
0,1970,SpecialNotes,sentence,23,6,26.086957,17,73.913043,0.000000e+00,0.000000e+00,...,7.657676,9.6,7.657676,False,True,,Europe and Central Asia regional aggregate (do...,Fiscal year end: March 31; reporting period fo...,European Union aggregate.,Lower middle income group aggregate. Lower-mid...
