In [1]:
# Import all the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import pycaret
from pycaret.classification import setup,compare_models,save_model,evaluate_model,predict_model,create_model
import warnings
warnings.filterwarnings('ignore')

import os
import pickle

In [2]:
# Directory holding the labelled training annotation files
tsv_directory = "./dataset/train/boxes_transcripts_labels"
# os.listdir returns entries in arbitrary order; sorting keeps the row order of the
# combined training frame (and therefore the train/test split) stable between runs.
dirlist = sorted(os.listdir(tsv_directory))

# Total no of files
print(len(dirlist))

600


In [3]:
# Column layout applied to every training annotation file (read with header=None)
train_column_names = ['start_index','end_index','x_top_left','y_top_left',
                      'x_bottom_right','y_bottom_right', 'transcript','field']

# Read every file first and concatenate once: calling pd.concat inside the loop
# re-copies all rows accumulated so far on each iteration (quadratic cost).
train_frames = [
    pd.read_csv(os.path.join(tsv_directory, file), header=None, names=train_column_names)
    for file in dirlist
]

# pd.concat raises on an empty list, so fall back to an empty frame with the expected columns
if train_frames:
    traindf = pd.concat(train_frames, ignore_index=True)
else:
    traindf = pd.DataFrame(columns=train_column_names)

# Preview the combined DataFrame
traindf.head()

Unnamed: 0,start_index,end_index,x_top_left,y_top_left,x_bottom_right,y_bottom_right,transcript,field
0,33,33,215,4,227,21,a,OTHER
1,35,44,235,3,308,21,Employee's,OTHER
2,46,51,311,3,349,20,social,OTHER
3,53,60,352,3,401,20,security,OTHER
4,62,67,404,3,457,21,number,OTHER


In [4]:
# Summarise column dtypes and non-null counts of the combined training frame
traindf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 237616 entries, 0 to 237615
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   start_index     237616 non-null  int64 
 1   end_index       237616 non-null  int64 
 2   x_top_left      237616 non-null  int64 
 3   y_top_left      237616 non-null  int64 
 4   x_bottom_right  237616 non-null  int64 
 5   y_bottom_right  237616 non-null  int64 
 6   transcript      237558 non-null  object
 7   field           237616 non-null  object
dtypes: int64(6), object(2)
memory usage: 14.5+ MB


In [5]:

# Drop the 'transcript' column from the training frame.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this cell
# re-runnable: executing it a second time no longer fails with a KeyError.
traindf = traindf.drop(columns=['transcript'], errors='ignore')
traindf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 237616 entries, 0 to 237615
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   start_index     237616 non-null  int64 
 1   end_index       237616 non-null  int64 
 2   x_top_left      237616 non-null  int64 
 3   y_top_left      237616 non-null  int64 
 4   x_bottom_right  237616 non-null  int64 
 5   y_bottom_right  237616 non-null  int64 
 6   field           237616 non-null  object
dtypes: int64(6), object(1)
memory usage: 12.7+ MB


In [6]:
# Learn the label vocabulary of the 'field' column first ...
label_encoder = LabelEncoder().fit(traindf['field'])

# ... then replace the text labels with their integer codes
traindf['field'] = label_encoder.transform(traindf['field'])

# Preview the encoded DataFrame
traindf.head()

Unnamed: 0,start_index,end_index,x_top_left,y_top_left,x_bottom_right,y_bottom_right,field
0,33,33,215,4,227,21,0
1,35,44,235,3,308,21,0
2,46,51,311,3,349,20,0
3,53,60,352,3,401,20,0
4,62,67,404,3,457,21,0


In [7]:
# Pin the session seed so the train/test split and every fitted model are reproducible.
# 3913 is the session id recorded in this notebook's setup summary, so results match that run.
s = setup(traindf, target='field', session_id=3913)


Unnamed: 0,Description,Value
0,Session id,3913
1,Target,field
2,Target type,Multiclass
3,Original data shape,"(237616, 7)"
4,Transformed data shape,"(237616, 7)"
5,Transformed train set shape,"(166331, 7)"
6,Transformed test set shape,"(71285, 7)"
7,Numeric features,6
8,Preprocess,True
9,Imputation type,simple


In [8]:
# Compare the candidate classifiers (see the table below) and keep the model that compare_models returns
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9869,0.997,0.9869,0.9866,0.9864,0.8864,0.8886,1.491
xgboost,Extreme Gradient Boosting,0.9869,0.9964,0.9869,0.9866,0.9864,0.8868,0.8887,24.439
rf,Random Forest Classifier,0.9858,0.9959,0.9858,0.9855,0.9852,0.8763,0.8788,2.765
dt,Decision Tree Classifier,0.9796,0.9264,0.9796,0.9796,0.9795,0.8326,0.8326,0.148
gbc,Gradient Boosting Classifier,0.975,0.9834,0.975,0.9739,0.9729,0.7664,0.7756,68.808
knn,K Neighbors Classifier,0.9719,0.9524,0.9719,0.97,0.9702,0.7477,0.7522,0.594
ridge,Ridge Classifier,0.9366,0.0,0.9366,0.878,0.9059,0.0003,0.0044,0.057
dummy,Dummy Classifier,0.9366,0.5,0.9366,0.8772,0.9059,0.0,0.0,0.039
lr,Logistic Regression,0.9345,0.824,0.9345,0.8854,0.9084,0.0827,0.131,14.64
lda,Linear Discriminant Analysis,0.9247,0.874,0.9247,0.8869,0.9042,0.1106,0.1265,0.089


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [9]:
# Open the interactive plot selector for the selected model
evaluate_model(best_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [18]:
# Unlabelled validation files to score. A relative path (like tsv_directory) keeps the
# notebook runnable on machines other than the one it was written on.
test_dir = "./dataset/val/boxes_transcripts"
# os.listdir returns entries in arbitrary order; sort for a stable file order
dirlist = sorted(os.listdir(test_dir))

# Total no of files
print(len(dirlist))

207


In [23]:
# Same column layout as the training files; the recorded preview below shows 'field'
# empty for these files.
test_column_names = ['start_index','end_index','x_top_left','y_top_left',
                     'x_bottom_right','y_bottom_right', 'transcript','field']

# Read every file first and concatenate once: calling pd.concat inside the loop
# re-copies all rows accumulated so far on each iteration (quadratic cost).
test_frames = [
    pd.read_csv(os.path.join(test_dir, file), header=None, names=test_column_names)
    for file in dirlist
]

# pd.concat raises on an empty list, so fall back to an empty frame with the expected columns
if test_frames:
    testdf = pd.concat(test_frames, ignore_index=True)
else:
    testdf = pd.DataFrame(columns=test_column_names)

# Preview the combined DataFrame
testdf.head()

Unnamed: 0,start_index,end_index,x_top_left,y_top_left,x_bottom_right,y_bottom_right,transcript,field
0,33,33,216,22,225,36,a,
1,35,44,235,21,309,36,Employee's,
2,46,51,311,20,350,35,social,
3,53,60,353,19,402,34,security,
4,62,67,405,19,456,33,number,


In [24]:
# Keep only the six numeric feature columns.
# Reassigning with errors='ignore' (instead of an in-place drop) keeps this cell
# re-runnable: executing it a second time no longer fails with a KeyError.
testdf = testdf.drop(columns=['transcript','field'], errors='ignore')
testdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80453 entries, 0 to 80452
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   start_index     80453 non-null  int64
 1   end_index       80453 non-null  int64
 2   x_top_left      80453 non-null  int64
 3   y_top_left      80453 non-null  int64
 4   x_bottom_right  80453 non-null  int64
 5   y_bottom_right  80453 non-null  int64
dtypes: int64(6)
memory usage: 3.7 MB


In [25]:
# Score the unlabelled frame; the result shown below adds prediction_label and prediction_score columns
predict_model(best_model,testdf)

Unnamed: 0,start_index,end_index,x_top_left,y_top_left,x_bottom_right,y_bottom_right,prediction_label,prediction_score
0,33.0,33.0,216.0,22.0,225.0,36.0,0,1.00
1,35.0,44.0,235.0,21.0,309.0,36.0,0,1.00
2,46.0,51.0,311.0,20.0,350.0,35.0,0,1.00
3,53.0,60.0,353.0,19.0,402.0,34.0,0,1.00
4,62.0,67.0,405.0,19.0,456.0,33.0,0,1.00
...,...,...,...,...,...,...,...,...
80448,2335.0,2337.0,364.0,1064.0,386.0,1078.0,0,1.00
80449,2339.0,2344.0,389.0,1063.0,430.0,1078.0,0,1.00
80450,2346.0,2348.0,451.0,1063.0,482.0,1077.0,0,1.00
80451,2350.0,2352.0,484.0,1064.0,508.0,1077.0,0,0.99


In [26]:
# Inspect the hyperparameters of the selected estimator
best_model.get_params()

{'bootstrap': False,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 3913,
 'verbose': 0,
 'warm_start': False}

In [30]:
# Saving the best model
# NOTE(review): the artefact is named 'xgboost', but the pipeline printed below ends in an
# ExtraTreesClassifier. Consider a neutral name; the cell that loads 'xgboost.pkl' would
# have to be renamed together with this one.
save_model(best_model,'xgboost')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=C:\Users\REHANS~1\AppData\Local\Temp\joblib),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['start_index', 'end_index',
                                              'x_top_left', 'y_top_left',
                                              'x_bottom_right',
                                              'y_bottom_right'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               missing_values=nan,
                                                               strategy='mean',
                                                               verbose='d...
                  ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
     

In [None]:
import joblib

# Load the saved model from the .pkl file
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code — only
# load artefacts produced by this notebook.
# NOTE(review): this cell shows no execution count; confirm `model` is really defined
# before the prediction loop calls model.predict.
model = joblib.load('xgboost.pkl')

#### Next step: generate predictions for the files in the val directory

In [122]:
# Directory holding the validation files to run predictions on
val_path = './dataset/val/boxes_transcripts'
# Names of the entries inside val_path (names only, not full paths)
val_dir = os.listdir(val_path)


In [99]:
# Integer class code -> field name. Each name's position in the list is its code,
# so the order below must not change.
# Presumably mirrors the codes produced by label_encoder — verify against label_encoder.classes_.
mapping = dict(enumerate([
    'OTHER',
    'box16StateWagesTips',
    'box17StateIncomeTax',
    'box1WagesTipsAndOtherCompensations',
    'box2FederalIncomeTaxWithheld',
    'box3SocialSecurityWages',
    'box4SocialSecurityTaxWithheld',
    'einEmployerIdentificationNumber',
    'employeeName',
    'employerAddressCity',
    'employerAddressState',
    'employerAddressStreet_name',
    'employerAddressZip',
    'employerName',
    'ssnOfEmployee',
    'taxYear',
]))


In [102]:
# Write one prediction file per validation document
output_dir = './dataset/pred_val'
# to_csv does not create missing directories, so make sure the target exists
os.makedirs(output_dir, exist_ok=True)

new_column_names = ['start_index', 'end_index','x_top_left','y_top_left','x_bottom_right','y_bottom_right','transcript']
# The first six columns are the numeric inputs; 'transcript' is excluded
feature_columns = new_column_names[:6]

# Iterate over the files listed from val_path. The previous loop walked `dirlist`,
# which is filled from a different directory variable and only happened to hold the
# same file names.
for file in val_dir:
    valfile_path = os.path.join(val_path, file)
    valdf = pd.read_csv(valfile_path, sep=',', header=None)
    valdf.columns = new_column_names

    # Make predictions on the input features
    pred = model.predict(valdf[feature_columns])

    # Add the predicted class codes, then translate them to field names
    valdf['fields'] = pred
    valdf['fields'] = valdf['fields'].map(mapping)

    # Save under the same file name, excluding the 'transcript' column, so the
    # evaluation step can pair prediction and ground-truth files by name
    output_path = os.path.join(output_dir, file)
    valdf.drop(columns=['transcript']).to_csv(output_path, sep=',', index=False, header=False)

# Preview the predictions of the last processed file
valdf.head()

Unnamed: 0,start_index,end_index,x_top_left,y_top_left,x_bottom_right,y_bottom_right,transcript,fields
0,6,11,99,20,146,34,"Wages,",OTHER
1,13,17,149,21,177,34,"tips,",OTHER
2,19,23,180,21,214,34,other,OTHER
3,25,29,216,21,254,34,comp.,OTHER
4,43,43,324,19,334,34,2,OTHER


# Next, we run the eval.py logic to compare the predictions against the labelled val_w_ann files

In [121]:
import os
import csv
import pandas as pd

'''
Entities:
1. employerName
2. employerAddressStreet_name
3. employerAddressCity
4. employerAddressState
5. employerAddressZip
6. einEmployerIdentificationNumber
7. employeeName
8. ssnOfEmployee
9. box1WagesTipsAndOtherCompensations
10. box2FederalIncomeTaxWithheld
11. box3SocialSecurityWages
12. box4SocialSecurityTaxWithheld
13. box16StateWagesTips
14. box17StateIncomeTax
15. taxYear
'''



def performance(TP, FP, FN):
    '''
    Description: The function yields the standard precision, recall and f1 score metrics

    arguments:
        TP -> int (true positives)
        FP -> int (false positives)
        FN -> int (false negatives)

    returns: precision, recall, f1_score
        Each value is a float, or the string "NaN" when it is undefined:
        precision when TP+FP == 0, recall when TP+FN == 0, and f1_score when
        either of the other two is "NaN".
    '''
    precision = TP/float(TP+FP) if (TP+FP) != 0 else "NaN"
    recall = TP/float(TP+FN) if (TP+FN) != 0 else "NaN"

    if precision == "NaN" or recall == "NaN":
        f1_score = "NaN"
    elif (precision + recall) == 0:
        # TP == 0 with both FP and FN present: precision and recall are both 0.0, and
        # the harmonic mean would divide by zero. Report 0.0 instead of crashing.
        f1_score = 0.0
    else:
        f1_score = (2.0*precision*recall)/(precision+recall)

    return precision, recall, f1_score
    
    
    
    
def get_dataset_metrics(true_labels, pred_labels):
    '''
    Description: The function yields a dataframe containing entity-wise performance metrics

    arguments:
        true_labels -> list (ground-truth label per token)
        pred_labels -> list (predicted label per token, aligned with true_labels)

    returns: pandas dataframe
        One row per entity other than "OTHER", indexed by entity name, with the
        columns Precision, Recall, F1-Score and Support.
    '''
    def new_counts():
        return {"TP": 0, "FP": 0, "FN": 0, "Support": 0}

    metrics_dict = dict()

    for true_label, pred_label in zip(true_labels, pred_labels):
        counts = metrics_dict.setdefault(true_label, new_counts())

        if true_label != "OTHER":
            counts["Support"] += 1

            if true_label == pred_label:
                counts["TP"] += 1
            elif pred_label == "OTHER":
                counts["FN"] += 1
            # NOTE(review): a true entity predicted as a *different* entity is counted
            # neither as FN nor as FP (behaviour kept as-is) — confirm this is intended.

        elif pred_label != "OTHER":
            # The predicted label may not have appeared as a true label yet; create its
            # counters on demand instead of raising KeyError.
            metrics_dict.setdefault(pred_label, new_counts())["FP"] += 1

    rows, index = [], []
    for field, counts in metrics_dict.items():
        if field == "OTHER":
            continue
        precision, recall, f1_score = performance(counts["TP"], counts["FP"], counts["FN"])
        rows.append([precision, recall, f1_score, counts["Support"]])
        index.append(field)

    # Build the frame once; DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(rows, columns=["Precision", "Recall", "F1-Score", "Support"], index=index)




def get_doc_labels(doc_true, doc_pred):
    '''
    Description: The function reads the label (last column) of every row of a
    ground-truth file and of the matching prediction file
    (make sure the doc id is the same)

    arguments:
        doc_true -> path of the file with the true labels in the last column
        doc_pred -> path of the file with the predicted labels in the last column

    returns: list, list (true labels, predicted labels)
    '''
    def read_last_column(path):
        # The context manager closes the handle; newline="" is what the csv module
        # expects. Blank lines parse to an empty row and are skipped, since row[-1]
        # would raise IndexError on them.
        with open(path, "r", newline="") as handle:
            return [row[-1] for row in csv.reader(handle) if row]

    return read_last_column(doc_true), read_last_column(doc_pred)



def get_dataset_labels(true_path, pred_path, save=False):
    '''
    Description: The function computes, prints and returns a dataframe containing
    entity-wise performance metrics for all documents
    (make sure the doc ids are the same in both the paths)

    arguments:
        true_path -> string (directory containing the ground truth tsv files)
        pred_path -> string (directory containing the predicted tsv files)
        save -> bool (saves the metrics file in your working directory)

    returns: pandas dataframe

    raises: ValueError if a document has a different number of true and predicted labels
    '''
    y_true, y_pred = [], []

    # List the prediction directory once; a set gives constant-time name lookups
    # instead of re-listing the directory for every ground-truth file.
    pred_files = set(os.listdir(pred_path))

    for file_name in os.listdir(true_path):
        # Only .tsv documents that exist under the same name in both directories are scored
        if not file_name.endswith(".tsv") or file_name not in pred_files:
            continue

        true_labels, pred_labels = get_doc_labels(
            os.path.join(true_path, file_name),
            os.path.join(pred_path, file_name),
        )

        # The labels are paired positionally later on; a length mismatch would shift
        # every following document and silently corrupt the metrics.
        if len(true_labels) != len(pred_labels):
            raise ValueError(
                f"{file_name}: {len(true_labels)} true labels but {len(pred_labels)} predicted labels"
            )

        y_true.extend(true_labels)
        y_pred.extend(pred_labels)

    df = get_dataset_metrics(y_true, y_pred)
    print(df)
    if save:
        df.to_csv("eval_metrics.tsv")

    return df
        



if __name__ == "__main__":

    # template to run your own evaluation
    # Relative paths, like the other ./dataset paths in this notebook, so the evaluation
    # reads the directory the prediction step writes to and runs on any machine.
    doc_true = './dataset/val_w_ann/boxes_transcripts_labels'
    doc_pred = './dataset/pred_val'

    get_dataset_labels(doc_true, doc_pred, save=False)

        
        
        
    
    
    


                                    Precision    Recall  F1-Score  Support
ssnOfEmployee                        0.987654  0.930233  0.958084      173
einEmployerIdentificationNumber      1.000000  0.942708  0.970509      195
box1WagesTipsAndOtherCompensations   0.982659  0.952381  0.967283      359
box2FederalIncomeTaxWithheld         0.978202  0.975543  0.976871      368
box3SocialSecurityWages              0.984076  0.880342  0.929323      353
box4SocialSecurityTaxWithheld        0.965418  0.933148  0.949008      359
employerName                         0.982512  0.919643  0.950038      696
employerAddressStreet_name           0.994667  0.971354  0.982872      786
employerAddressCity                  0.993007  0.965986  0.979310      305
employerAddressState                 0.994350  0.936170  0.964384      196
employerAddressZip                   0.994764  0.964467  0.979381      199
employeeName                         0.969945  0.883085  0.924479      404
box17StateIncomeTax      

=> From the above metrics we can conclude that the precision of all the fields is up to the mark,
while the recall of some of the fields is below 90%.
So if we want to focus on minimising false negatives, we want recall to be as close to 100% as possible; we can achieve this by adjusting the model's decision threshold or by using more relevant features.


==> Support Analysis: The support value is the number of instances of each entity in the dataset. Entities with low support may be prone to overfitting; collecting more data for these entities raises their support, which can improve model performance.