In [4]:
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

# Directory containing the training dataset
train_directory = r"C:\Users\HP\Downloads\proj\dataset\train\boxes_transcripts_labels"

# List of all files in the training directory
train_files = [os.path.join(train_directory, f) for f in os.listdir(train_directory) if f.endswith('.tsv')]

# Initialize an empty list to store the data
train_dataframes = []

# Loop through each file in the training set
for file in train_files:
    df = pd.read_csv(file, header=None, delimiter=",")  # Adjust delimiter for .tsv files
    selected_columns = df.iloc[:, [0, 1, 2, 3, 4, 5, 7]]  # Columns with features and label
    train_dataframes.append(selected_columns)

# Combine all training files into a single DataFrame
combined_train_data = pd.concat(train_dataframes, ignore_index=True)

# Rename columns for clarity
combined_train_data.columns = ['start_index', 'end_index', 'top_left_x', 'top_left_y', 'bottom_right_x', 'bottom_right_y', 'label_text']

# Drop rows with missing values
combined_train_data = combined_train_data.dropna()

# Convert class labels (label_text) to numeric using LabelEncoder
label_encoder = LabelEncoder()
combined_train_data['class_label'] = label_encoder.fit_transform(combined_train_data['label_text'])

# Features and Labels for training
X_train = combined_train_data[['top_left_x', 'top_left_y', 'bottom_right_x', 'bottom_right_y']]
y_train = combined_train_data['class_label']

# One-hot encode the labels for neural network
y_train_encoded = to_categorical(y_train)

# Build the neural network model
model = Sequential(
    [
        Dense(64, activation='relu', input_dim=X_train.shape[1]),
        Dense(32, activation='relu'),
        Dense(y_train_encoded.shape[1], activation='softmax')  # Output layer size matches number of classes
    ]
)

# Compile the model
model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(0.001),
)

# Train the model
model.fit(X_train, y_train_encoded, epochs=10)

# Directory containing the validation dataset
val_directory = r"C:\Users\HP\Downloads\proj\dataset\val_w_ann\boxes_transcripts_labels"

# List of all files in the validation directory
val_files = [os.path.join(val_directory, f) for f in os.listdir(val_directory) if f.endswith('.tsv')]

# Initialize an empty list to store the validation data
val_dataframes = []

# Loop through each file in the validation set
for file in val_files:
    df = pd.read_csv(file, header=None, delimiter=",")  # Adjust delimiter for .tsv files
    selected_columns = df.iloc[:, [0, 1, 2, 3, 4, 5, 7]]  # Columns with features and label
    val_dataframes.append(selected_columns)

# Combine all validation files into a single DataFrame
combined_val_data = pd.concat(val_dataframes, ignore_index=True)

# Rename columns for clarity
combined_val_data.columns = ['start_index', 'end_index', 'top_left_x', 'top_left_y', 'bottom_right_x', 'bottom_right_y', 'label_text']

# Drop rows with missing values
combined_val_data = combined_val_data.dropna()

# Convert class labels (label_text) to numeric using LabelEncoder
combined_val_data['class_label'] = label_encoder.fit_transform(combined_val_data['label_text'])

# Features and Labels for validation
X_val = combined_val_data[['top_left_x', 'top_left_y', 'bottom_right_x', 'bottom_right_y']]
y_val = combined_val_data['class_label']

# Use the model to predict on the validation set
y_pred = model.predict(X_val)

# Convert probabilities to class labels (argmax)
y_pred_classes = y_pred.argmax(axis=-1)

# Calculate precision, recall, and F1 score for each class
precision, recall, f1, _ = precision_recall_fscore_support(
    y_val, y_pred_classes, average=None  # Class-wise metrics
)

# Decode the class labels back to their original entity names
class_names = label_encoder.inverse_transform(range(len(label_encoder.classes_)))

# Prepare the metrics data
metrics = []
for i, class_name in enumerate(class_names):
    metrics.append([class_name, precision[i], recall[i], f1[i]])

# Convert the metrics into a DataFrame
metrics_df = pd.DataFrame(metrics, columns=['Field', 'Precision', 'Recall', 'F1-Score'])

# Now ensure that the 'start_index' and 'end_index' are aligned correctly to the rows
metrics_df['start_index'] = combined_val_data['start_index'].values[:len(metrics_df)]
metrics_df['end_index'] = combined_val_data['end_index'].values[:len(metrics_df)]

# Save the metrics to a .tsv file
metrics_df.to_csv('metrics_with_indices.tsv', sep='\t', index=False)

# Print the metrics to verify
print("Metrics with indices saved to metrics_with_indices.tsv:")
print(metrics_df)

# Evaluate model accuracy on validation set
accuracy = accuracy_score(y_val, y_pred_classes)
print(f"Accuracy on the validation set: {accuracy * 100:.2f}%")


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m7426/7426[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 942us/step - loss: 3.7904
Epoch 2/10
[1m7426/7426[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 933us/step - loss: 0.3625
Epoch 3/10
[1m7426/7426[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 923us/step - loss: 0.2890
Epoch 4/10
[1m7426/7426[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 936us/step - loss: 0.2789
Epoch 5/10
[1m7426/7426[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 938us/step - loss: 0.2691
Epoch 6/10
[1m7426/7426[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 937us/step - loss: 0.2695
Epoch 7/10
[1m7426/7426[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 927us/step - loss: 0.2673
Epoch 8/10
[1m7426/7426[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 936us/step - loss: 0.2647
Epoch 9/10
[1m7426/7426[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 935us/step - loss: 0.2652
Epoch 10/10
[1m7426/7426[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[

  _warn_prf(average, modifier, msg_start, len(result))
