In [14]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

In [33]:
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Directory containing the dataset
directory = r"C:\Users\HP\Downloads\proj\dataset\train\boxes_transcripts_labels"

# List of all files in the directory
files = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.tsv')]

# Initialize an empty list to store the data
dataframes = []

# Loop through each file and load the required columns
for file in files:
    # Load the file, assuming no header in the file
    df = pd.read_csv(file, header=None, delimiter=",")  # Specify delimiter if it's a tab-separated file (.tsv)

    # Select columns 2, 3, 4, 5 for features and column 7 for the textual labels
    selected_columns = df.iloc[:, [2, 3, 4, 5, 7]]  # Adjusting indices to match the correct columns
    dataframes.append(selected_columns)

# Combine all files into a single dataframe
combined_data = pd.concat(dataframes, ignore_index=True)

# Rename columns for clarity
combined_data.columns = ['top_left_x', 'top_left_y', 'bottom_right_x', 'bottom_right_y', 'label_text']

combined_data


Unnamed: 0,top_left_x,top_left_y,bottom_right_x,bottom_right_y,label_text
0,215,4,227,21,OTHER
1,235,3,308,21,OTHER
2,311,3,349,20,OTHER
3,352,3,401,20,OTHER
4,404,3,457,21,OTHER
...,...,...,...,...,...
237611,365,1064,385,1079,OTHER
237612,388,1064,429,1079,OTHER
237613,451,1063,482,1078,OTHER
237614,485,1063,509,1078,OTHER


In [34]:
combined_data.dtypes

top_left_x         int64
top_left_y         int64
bottom_right_x     int64
bottom_right_y     int64
label_text        object
dtype: object

In [37]:
# Check for missing values
print(combined_data.isnull().sum())

# Drop rows with missing values, if any
combined_data = combined_data.dropna()

# Convert class labels (label_text) to numeric using LabelEncoder
label_encoder = LabelEncoder()
combined_data['class_label'] = label_encoder.fit_transform(combined_data['label_text'])

# Print the label mapping
print("Label Mapping:", dict(enumerate(label_encoder.classes_)))

# Optionally, print the first few rows of the data to confirm
print(combined_data.head())

top_left_x        0
top_left_y        0
bottom_right_x    0
bottom_right_y    0
label_text        0
dtype: int64
Label Mapping: {0: 'OTHER', 1: 'box16StateWagesTips', 2: 'box17StateIncomeTax', 3: 'box1WagesTipsAndOtherCompensations', 4: 'box2FederalIncomeTaxWithheld', 5: 'box3SocialSecurityWages', 6: 'box4SocialSecurityTaxWithheld', 7: 'einEmployerIdentificationNumber', 8: 'employeeName', 9: 'employerAddressCity', 10: 'employerAddressState', 11: 'employerAddressStreet_name', 12: 'employerAddressZip', 13: 'employerName', 14: 'ssnOfEmployee', 15: 'taxYear'}
   top_left_x  top_left_y  bottom_right_x  bottom_right_y label_text  \
0         215           4             227              21      OTHER   
1         235           3             308              21      OTHER   
2         311           3             349              20      OTHER   
3         352           3             401              20      OTHER   
4         404           3             457              21      OTHER   

   cl

In [38]:
# Features: Columns 3, 4, 5, 6
X = combined_data[['top_left_x', 'top_left_y', 'bottom_right_x', 'bottom_right_y']]

# Labels: Column 'class_label'
y = combined_data['class_label']

# Split into training and testing datasets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the training and testing sets
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)


Training set shape: (190092, 4)
Testing set shape: (47524, 4)


In [39]:
print(combined_data.dtypes)


top_left_x         int64
top_left_y         int64
bottom_right_x     int64
bottom_right_y     int64
label_text        object
class_label        int32
dtype: object


In [40]:
print(combined_data['bottom_right_y'].unique())


[  21   20   23 ... 2514 2518 2554]


In [42]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

# Assuming y_train is already encoded using LabelEncoder, let's one-hot encode the labels
y_train_encoded = to_categorical(y_train)
y_test_encoded = to_categorical(y_test)

# Build the model
model = Sequential([
    Dense(64, activation='relu', input_dim=X_train.shape[1]),  # First hidden layer
    Dense(32, activation='relu'),  # Second hidden layer
    Dense(y_train_encoded.shape[1], activation='softmax')  # Output layer with softmax
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train_encoded, epochs=20, batch_size=32, validation_split=0.1)

# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test_encoded)
print(f"Test accuracy: {accuracy * 100:.2f}%")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m5347/5347[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 6ms/step - accuracy: 0.9078 - loss: 2.6430 - val_accuracy: 0.9372 - val_loss: 0.3523
Epoch 2/20
[1m5347/5347[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 5ms/step - accuracy: 0.9375 - loss: 0.3551 - val_accuracy: 0.9361 - val_loss: 0.3790
Epoch 3/20
[1m5347/5347[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 5ms/step - accuracy: 0.9361 - loss: 0.3372 - val_accuracy: 0.9375 - val_loss: 0.3286
Epoch 4/20
[1m5347/5347[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 5ms/step - accuracy: 0.9366 - loss: 0.3301 - val_accuracy: 0.9371 - val_loss: 0.3244
Epoch 5/20
[1m5347/5347[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 5ms/step - accuracy: 0.9365 - loss: 0.3286 - val_accuracy: 0.9367 - val_loss: 0.3321
Epoch 6/20
[1m5347/5347[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 5ms/step - accuracy: 0.9368 - loss: 0.3249 - val_accuracy: 0.9381 - val_loss: 0.3173
Epoch 7/20

## testing the model using test_val_ans datasets

In [66]:
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Directory containing the dataset with answers of the test
directory = r"C:\Users\HP\Downloads\proj\dataset\val_w_ann\boxes_transcripts_labels"

# List of all files in the directory
files = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.tsv')]

# Initialize an empty list to store the data
dataframes = []

# Loop through each file and load the required columns (features and labels)
for file in files:
    df = pd.read_csv(file, header=None, delimiter=",")  # Adjust delimiter for .tsv files
    selected_columns = df.iloc[:, [2, 3, 4, 5, 7]]  # Columns with features and label
    dataframes.append(selected_columns)

# Combine all files into a single DataFrame
test_combined_data_Answers = pd.concat(dataframes, ignore_index=True)

# Rename columns for clarity
test_combined_data_Answers.columns = ['top_left_x', 'top_left_y', 'bottom_right_x', 'bottom_right_y', 'label_text']

# Check for missing values
print(test_combined_data_Answers.isnull().sum())

# Drop rows with missing values, if any
test_combined_data_Answers = test_combined_data_Answers.dropna()

# Convert class labels (label_text) to numeric using LabelEncoder
label_encoder = LabelEncoder()
test_combined_data_Answers['class_label'] = label_encoder.fit_transform(test_combined_data_Answers['label_text'])

# Now, split the data into X_test (features) and y_test (labels)
X_test = test_combined_data_Answers[['top_left_x', 'top_left_y', 'bottom_right_x', 'bottom_right_y']]
y_test = test_combined_data_Answers['class_label']


# Use the model to predict on the test set
y_pred = model.predict(X_test)

# Since softmax returns probabilities, we need to get the index of the max value (the predicted class)
y_pred_classes = y_pred.argmax(axis=-1)

# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred_classes)
print(f"Accuracy on the test set: {accuracy * 100:.2f}%")

# Optionally, print the first few predictions
print("Predictions for the test set (first few):")
print(y_pred_classes[:50])

# If you want to decode the predicted classes back to their original labels
y_pred_labels = label_encoder.inverse_transform(y_pred_classes)
print("Predicted Labels for the test set (first few):")
print(y_pred_labels[:50])


top_left_x        0
top_left_y        0
bottom_right_x    0
bottom_right_y    0
label_text        0
dtype: int64
[1m2515/2515[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step
Accuracy on the test set: 93.60%
Predictions for the test set (first few):
[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0 14  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  7  0  0  0  0  0  0  0  0  0  0
  0  0]
Predicted Labels for the test set (first few):
['OTHER' 'OTHER' 'OTHER' 'OTHER' 'OTHER' 'OTHER' 'OTHER' 'OTHER' 'OTHER'
 'OTHER' 'OTHER' 'OTHER' 'OTHER' 'OTHER' 'ssnOfEmployee' 'OTHER' 'OTHER'
 'OTHER' 'OTHER' 'OTHER' 'OTHER' 'OTHER' 'OTHER' 'OTHER' 'OTHER' 'OTHER'
 'OTHER' 'OTHER' 'OTHER' 'OTHER' 'OTHER' 'OTHER' 'OTHER' 'OTHER' 'OTHER'
 'OTHER' 'OTHER' 'einEmployerIdentificationNumber' 'OTHER' 'OTHER' 'OTHER'
 'OTHER' 'OTHER' 'OTHER' 'OTHER' 'OTHER' 'OTHER' 'OTHER' 'OTHER' 'OTHER']
