In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from os import listdir
import seaborn as sns
import category_encoders as ce
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [26]:
# Directory holding one comma-separated, header-less label file per document.
# Forward slashes are used because Python accepts them on Windows as well as
# on POSIX systems, whereas backslashes only work as separators on Windows.
TRAIN_LABELS_DIR = "dataset/dataset/train/boxes_transcripts_labels/"
LABEL_COLUMNS = ['start_index', 'end_index', 'x_top_left', 'y_top_left',
                 'x_bottom_right', 'y_bottom_right', 'transcript', 'field']

# Read every file first and concatenate once at the end: calling pd.concat
# inside the loop re-copies all previously loaded rows on every iteration.
frames = []
for filename in sorted(listdir(TRAIN_LABELS_DIR)):  # sorted -> reproducible row order
    df_temp = pd.read_csv(TRAIN_LABELS_DIR + filename, sep = ",", header = None)
    # Assigning the names (rather than passing names=...) keeps the length
    # check: a file with a different number of columns raises immediately.
    df_temp.columns = LABEL_COLUMNS
    frames.append(df_temp)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# directory contains no files.
df = pd.concat(frames, ignore_index = True) if frames else pd.DataFrame()

In [27]:
# Now we build the model and the preprocessing functions based on our analysis
def Feature_Engineering(df, type = 'train'):
    """Derive the model features from the raw bounding-box columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'start_index', 'end_index', 'x_top_left', 'y_top_left',
        'x_bottom_right', 'y_bottom_right' and 'field'.
    type : str, optional
        Accepted for backward compatibility; the function body does not read
        it. (The name shadows the builtin but is kept so keyword callers
        continue to work.)

    Returns
    -------
    pandas.DataFrame
        A new, independent frame with the columns 'start_index', 'end_index',
        'x_center', 'y_center', 'index_len' and 'field', sharing the index of
        ``df``. The input frame is left unmodified.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df``.
    """
    # .assign builds a new frame, so the caller's DataFrame does not silently
    # gain the derived columns as a side effect.
    engineered = df.assign(
        x_center=df[['x_top_left', 'x_bottom_right']].mean(axis=1),
        y_center=df[['y_top_left', 'y_bottom_right']].mean(axis=1),
        index_len=df['end_index'] - df['start_index'],
    )

    # We select 5 features told to us by RFECV and the label.
    selected = ['start_index', 'end_index', 'x_center', 'y_center', 'index_len', 'field']

    # Explicit copy: callers relabel 'field' on the result, which would
    # otherwise emit SettingWithCopyWarning on a selection of another frame.
    return engineered[selected].copy()

def X_y_split(df):
    """Separate an engineered frame into its feature part and its label part.

    Returns a tuple ``(X, y)`` where ``X`` holds the five feature columns and
    ``y`` is a single-column DataFrame (not a Series) containing 'field'.
    """
    label_columns = ['field']
    feature_columns = ['start_index', 'end_index', 'x_center', 'y_center', 'index_len']

    # Labels are selected first, then features, matching the lookup order
    # callers have always seen when a column is missing.
    labels = df[label_columns]
    features = df[feature_columns]
    return features, labels


In [28]:
def Hierarchical_Training(df, model1, model2, type = "Not Complete"):
    """Train the two models of the hierarchical classifier.

    ``model1`` is trained on a binary target ('OTHER' vs 'NOT_OTHER');
    ``model2`` is trained on the original labels using only the rows whose
    label is not 'OTHER'.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame accepted by ``Feature_Engineering`` (must include 'field').
    model1, model2 : estimator
        Objects handed to ``Train_Model`` for the first and second stage.
    type : str, optional
        "Final" fits both models on all available rows. Any other value makes
        ``Train_Model`` perform a hold-out split and return predictions.

    Returns
    -------
    tuple
        ``(model1, model2)`` when ``type == "Final"``; otherwise
        ``(model1, model2, y_pred_1, y_pred_2, y_test_1, y_test_2)``.
    """
    engineered = Feature_Engineering(df)

    # Build the mask from the engineered frame itself so it is always aligned
    # with the rows being written, instead of relying on the caller's frame
    # sharing the same index.
    not_other = engineered['field'] != 'OTHER'

    # For 2nd model, we simply drop all rows with 'OTHER' as label.
    # Taken before the relabelling below so the original labels are kept.
    df2 = engineered.loc[not_other].copy()

    # For 1st model, every label other than 'OTHER' is collapsed to 'NOT_OTHER'.
    # Work on an explicit copy so the assignment never targets a selection of
    # another frame (avoids SettingWithCopyWarning).
    df1 = engineered.copy()
    df1.loc[not_other, "field"] = 'NOT_OTHER'

    if type == "Final":
        model1 = Train_Model(df1, model1, type)
        model2 = Train_Model(df2, model2, type)
        return model1, model2

    model1, y_pred_1, y_test_1 = Train_Model(df1, model1)
    model2, y_pred_2, y_test_2 = Train_Model(df2, model2)

    return model1, model2, y_pred_1, y_pred_2, y_test_1, y_test_2


def Train_Model(df, model, type = "Not Complete", test_size = 0.2, random_state = None):
    """Fit ``model`` on an engineered frame, optionally with a hold-out split.

    Parameters
    ----------
    df : pandas.DataFrame
        Engineered frame accepted by ``X_y_split``.
    model : estimator
        Object exposing ``fit`` and ``predict``.
    type : str, optional
        "Final" fits on every row and returns only the model. Any other value
        splits the data, fits on the training part and predicts the test part.
    test_size : float, optional
        Share of rows held out for testing (default 0.2, as before).
    random_state : int or None, optional
        Seed forwarded to ``train_test_split``. The default ``None`` keeps the
        previous unseeded behaviour; pass an int for a reproducible split.

    Returns
    -------
    model
        When ``type == "Final"``.
    tuple
        ``(model, y_pred, y_test)`` otherwise; ``y_test`` is the single-column
        DataFrame produced by the split.
    """
    X,y = X_y_split(df)

    if type == "Final":
        # y is a one-column DataFrame; flatten it to 1-D for fitting, the same
        # way Cross_Validation does, so sklearn does not warn about a
        # column-vector target.
        model.fit(X, y.values.ravel())
        return model

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    model.fit(X_train, y_train.values.ravel())
    y_pred = model.predict(X_test)

    # y_test is returned unflattened so existing callers get the same object type.
    return model, y_pred, y_test





def Cross_Validation(df, model, n_splits = 10, random_state = 1):
    """Score ``model`` with shuffled stratified k-fold cross-validation.

    Parameters
    ----------
    df : pandas.DataFrame
        Engineered frame accepted by ``X_y_split``.
    model : estimator
        Object exposing ``fit`` and ``score``. The same instance is re-fitted
        on every fold, so on return it holds the fit from the last fold.
    n_splits : int, optional
        Number of folds (default 10, the previously hard-coded value).
    random_state : int or None, optional
        Seed for the fold shuffling (default 1, the previously hard-coded value).

    Returns
    -------
    list
        One ``model.score`` result per fold, in fold order.
    """
    X,y = X_y_split(df)

    # Convert once, outside the loop. Labels are flattened to 1-D so that fit
    # and score both receive the same target shape.
    X_values = X.values
    y_values = y.values.ravel()

    # Create StratifiedKFold object.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    accu_stratified = []

    for train_index, test_index in skf.split(X_values, y_values):
        model.fit(X_values[train_index], y_values[train_index])
        accu_stratified.append(model.score(X_values[test_index], y_values[test_index]))

    return accu_stratified

In [29]:
# This will help us choose the hyperparameters for our models using Cross Validation.
# Only the 1st-stage model (OTHER vs NOT_OTHER) is scored in this cell.
new_df = df.copy()

# The engineered frames do not depend on the hyperparameter being swept, so
# they are built once instead of on every loop iteration.
engineered = Feature_Engineering(new_df)
not_other = engineered['field'] != 'OTHER'  # mask from the frame being modified

# For 2nd model, we simply drop all rows with 'OTHER' as label.
# Not used by the sweep below; kept because the cell has always defined it.
df2 = engineered.loc[not_other].copy()

# For 1st model, every label other than 'OTHER' is collapsed to 'NOT_OTHER'.
df1 = engineered.copy()
df1.loc[not_other, "field"] = 'NOT_OTHER'

for i in [1,2,4,8,16,32,64,100,200,300]:
    # random_state makes the sweep reproducible; n_jobs=-1 builds the trees on
    # all cores, since 10-fold CV with hundreds of trees is slow single-threaded.
    model = RandomForestClassifier(n_estimators = i, random_state = 1, n_jobs = -1)
    cv_score = Cross_Validation(df1, model)

    # Print the output.
    print('iteration value: ', i)
    print('\nAverage Accuracy That can be obtained from this model is:',
          np.mean(cv_score)*100, '%')

iteration value:  1

Average Accuracy That can be obtained from this model is: 97.5426726992226 %
iteration value:  2

Average Accuracy That can be obtained from this model is: 97.20431209936227 %
iteration value:  4

Average Accuracy That can be obtained from this model is: 98.29346482013014 %
iteration value:  8

Average Accuracy That can be obtained from this model is: 98.62382986741606 %
iteration value:  16

Average Accuracy That can be obtained from this model is: 98.71389090191464 %
iteration value:  32

Average Accuracy That can be obtained from this model is: 98.74755905805958 %
iteration value:  64

Average Accuracy That can be obtained from this model is: 98.76397197733067 %


KeyboardInterrupt: 