In [20]:
import numpy as np
import pandas as pd
import os
import spacy
import torch
from torch import nn, tensor, save, load
from torch.utils.data import Dataset, DataLoader
from collections import Counter
from torch import optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler # Optional for numerical features

In [12]:
File_Path = 'eestec_hackathon_2025_train.tsv'
Statement_Column = 'Statement'
Numerical_Columns = [
    'Credit History: barely-true', 'Credit History: false',
    'Credit History: half-true', 'Credit History: mostly-true',
    'Credit History: pants-fire'
]
Label_Column = 'Label'

# batch size 
Batch_Size = 64

# read file  
Data_File = pd.read_csv('eestec_hackathon_2025_train.tsv' ,sep = '\t',names=['ID', 'Label', 'Statement', 'Subjects', 'Speaker Name', 'Speaker Title', 'State', 'Party Affiliation', 'Credit History: barely-true', 'Credit History: false', 'Credit History: half-true', 'Credit History: mostly-true', 'Credit History: pants-fire', 'Context/Location'])

# preprocess the text 
spacy.cli.download("en_core_web_sm")
natural_language_processor = spacy.load("en_core_web_sm")

def tokenize_text(text):
    #Lemmatize every token in set, after tokenizaziotn 
    return [token.lemma_.lower() for token in natural_language_processor(str(text)) if not token.is_punct and not token.is_space and not token.is_stop and not token.is_alpha]

# build the vocabulary
word_count = Counter()
for statement in Data_File[Statement_Column]:
    word_count.update(tokenize_text(statement))

# add PAD token to make same length and UNK (unkown for words not in the vocab)
MAX_VOCAB_SIZE = 10000
MAX_LENGTH = 50
PAD_TOKEN = "<pad>"
UNK_TOKEN = "<unk>"
vocab = {PAD_TOKEN: 0, UNK_TOKEN: 1}
for i, (word, count) in enumerate(word_count.most_common(MAX_VOCAB_SIZE- 2)): # -2 for pad and unk
    vocab[word] = i + 2

actual_vocab_size = len(vocab)

# numericalize and pad/truncuate text
def numericalize_pad_text(text, vocab_map, max_length):
    tokens = tokenize_text(text)
    numericalized = [vocab_map.get(token, vocab_map[UNK_TOKEN]) for token in tokens]
    if len(numericalized) < max_length:
        # Pad with PAD_TOKEN's index
        numericalized.extend([vocab_map[PAD_TOKEN]] * (max_length - len(numericalized)))
    else:
        # Truncate
        numericalized = numericalized[:max_length]
    return numericalized

# numericalize the statements into a new 'statement_numerical' column
Data_File['statement_numerical'] = Data_File[Statement_Column].apply(lambda x: numericalize_pad_text(x, vocab, MAX_LENGTH))

# map the labels to numerical values
unique_labels = Data_File[Label_Column].astype(str).unique()
label_to_idx = {label: i for i, label in enumerate(unique_labels)}
idx_to_label = {i: label for label, i in label_to_idx.items()}
num_classes = len(unique_labels)

# use mapping for data in new 'label_idx' column
Data_File['label_idx'] = Data_File[Label_Column].astype(str).map(label_to_idx)

for col in Numerical_Columns:
    Data_File[col] = pd.to_numeric(Data_File[col], errors='coerce')
    
    if Data_File[col].isnull().any():
        Data_File[col] = Data_File[col].fillna(0)

# scale the credit history values
# scaler = StandardScaler()
# Data_File[Numerical_Columns] = scaler.fit_transform(Data_File[Numerical_Columns])

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
#Names: Shuming Zhao, Arthur Hennig, Ben Kracht


class LieDetectorModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim_lstm, num_numerical_features, hidden_dim_fc, output_dim, pad_idx):
        super(LieDetectorModel, self).__init__()
        # Embedding layer for text
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        # LSTM layer to process sequential text data
        self.lstm = nn.LSTM(embedding_dim, hidden_dim_lstm, batch_first=True, num_layers=1, bidirectional=True) # Using bidirectional LSTM
      
        lstm_output_features = hidden_dim_lstm * 2 

        self.fc1 = nn.Linear(lstm_output_features + num_numerical_features, hidden_dim_fc)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5) 
        self.fc2 = nn.Linear(hidden_dim_fc, output_dim)

    def forward(self, text_data, numerical_data):
        embedded = self.embedding(text_data)     
        lstm_out, (hidden, cell) = self.lstm(embedded)
        hidden_combined = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        combined_features = torch.cat((hidden_combined, numerical_data), dim=1)
        x = self.fc1(combined_features)
        x = self.relu(x)
        x = self.dropout(x)
        logits = self.fc2(x)
        # logits shape: (batch_size, output_dim)
        return logits


def train_model(model, train_loader, val_loader, optimizer, criterion, num_epochs, device, model_save_path='model_weights.pt'):
    """
    Args:
        model (nn.Module): The neural network model to train.
        train_loader (DataLoader): DataLoader for the training set.
        val_loader (DataLoader): DataLoader for the validation set.
        optimizer (optim.Optimizer): The optimizer to use (e.g., Adam).
        criterion (nn.Module): The loss function (e.g., CrossEntropyLoss).
        num_epochs (int): The number of epochs to train for.
        device (torch.device): The device to train on ('cuda' or 'cpu').
    """
    model.to(device)
    best_val_loss = float('inf')
    best_val_acc = 0.0
    previous_best_model_path = None 

    for epoch in range(num_epochs):
        #Go to training
        model.train()
        running_loss = 0.0
        correct_predictions_train = 0
        total_samples_train = 0

        for i, batch in enumerate(train_loader):
            texts = batch['text'].to(device)
            numerical_feats = batch['numerical'].to(device)
            labels = batch['label'].to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward pass
            outputs = model(texts, numerical_feats)
            loss = criterion(outputs, labels)

            # backward pass and optimize
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            running_loss += loss.item() * texts.size(0)

            # calculate training accuracy
            _, predicted = torch.max(outputs.data, 1)
            total_samples_train += labels.size(0)
            correct_predictions_train += (predicted == labels).sum().item()

        epoch_loss = running_loss / len(train_loader.dataset)
        epoch_acc_train = correct_predictions_train / total_samples_train

        #Go to validation
        model.eval() 
        running_val_loss = 0.0
        correct_predictions_val = 0
        total_samples_val = 0
        #No grad for validation
        with torch.no_grad(): 
            for batch in val_loader:
                texts = batch['text'].to(device)
                numerical_feats = batch['numerical'].to(device)
                labels = batch['label'].to(device)

                outputs = model(texts, numerical_feats)
                loss = criterion(outputs, labels)
                running_val_loss += loss.item() * texts.size(0)

                _, predicted = torch.max(outputs.data, 1)
                total_samples_val += labels.size(0)
                correct_predictions_val += (predicted == labels).sum().item()

        epoch_val_loss = running_val_loss / len(val_loader.dataset)
        epoch_acc_val = correct_predictions_val / total_samples_val

        print(f"Epoch [{epoch+1}/{num_epochs}] | "
              f"Train Loss: {epoch_loss:.4f} | Train Acc: {epoch_acc_train:.4f} | "
              f"Val Loss: {epoch_val_loss:.4f} | Val Acc: {epoch_acc_val:.4f}")
        if epoch_val_loss < best_val_loss:
            print(f"Validation loss improved from {best_val_loss:.4f} to {epoch_val_loss:.4f}.")

            # Delete the previous best model file if it exists
            if previous_best_model_path and os.path.exists(previous_best_model_path):
                os.remove(previous_best_model_path)
                print(f"Deleted old checkpoint: {previous_best_model_path}")

            best_val_loss = epoch_val_loss
            best_val_acc = epoch_acc_val

            # Construct a dynamic filename that includes the epoch and validation loss for clarity
            # This ensures unique filenames if you decide to keep top-k, but we're keeping only the best here
            current_model_save_path = f"best_model_epoch_{epoch+1}_val_loss_{best_val_loss:.4f}.pt"

            checkpoint = {
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'best_val_loss': best_val_loss,
                'best_val_acc': best_val_acc,
                'vocab': vocab,
                'label_to_idx': label_to_idx,
                'idx_to_label': idx_to_label,
                'num_classes': num_classes,
                'embedding_dim': model.embedding.embedding_dim,
                'hidden_dim_lstm': model.lstm.hidden_size,
                'num_numerical_features': model.fc1.in_features - (model.lstm.hidden_size * 2),
                'hidden_dim_fc': model.fc1.out_features,
                'pad_idx': model.embedding.padding_idx
            }
            torch.save(checkpoint, current_model_save_path)
            print(f"New best checkpoint saved to {current_model_save_path}")
            
            # Update the path to the currently best model
            previous_best_model_path = current_model_save_path




    print("Finished Training")

In [24]:
NUM_EPOCHS = 42

# split up the data in the validation and training set
Train_df, Val_df = train_test_split(Data_File, test_size=0.2, random_state=42)

# dataset for statements (processing textual data into numerical data)
class TextNumericalDataset(Dataset):
    def __init__(self, dataframe, text_col_numerical, numerical_cols_list, label_col_idx):
        self.texts = torch.tensor(list(dataframe[text_col_numerical].values), dtype=torch.long)
        self.numerical_features = torch.tensor(dataframe[numerical_cols_list].values, dtype=torch.float32)
        self.labels = torch.tensor(dataframe[label_col_idx].values, dtype=torch.long) # Assuming classification

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'text': self.texts[idx],
            'numerical': self.numerical_features[idx],
            'label': self.labels[idx]
        }

train_dataset = TextNumericalDataset(Train_df, 'statement_numerical', Numerical_Columns, 'label_idx')
val_dataset = TextNumericalDataset(Val_df, 'statement_numerical', Numerical_Columns, 'label_idx')

# data loader for both datasets
train_loader = DataLoader(train_dataset, batch_size=Batch_Size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=Batch_Size, shuffle=False)

if __name__ == '__main__':
    # determine device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    num_numerical_features = train_dataset.numerical_features.shape[1]
    print(f"Number of numerical features being used: {num_numerical_features}")
    pad_idx = vocab[PAD_TOKEN]
    model = LieDetectorModel(
        vocab_size=actual_vocab_size,
        embedding_dim=75,
        hidden_dim_lstm=64,
        num_numerical_features=num_numerical_features,
        hidden_dim_fc=128,
        output_dim=num_classes,
        pad_idx=pad_idx
    )

    # Define optimizer and loss function
    optimizer = optim.AdamW(model.parameters(), lr=0.0003)  # Using AdamW optimizer
    # CrossEntropyLoss is suitable for multi-class classification
    criterion = nn.CrossEntropyLoss()

    # start training
    print("Starting training...")
    train_model(model, train_loader, val_loader, optimizer, criterion, NUM_EPOCHS, device)

Using device: cpu
Number of numerical features being used: 5
Starting training...
Epoch [1/42] | Train Loss: 2.1434 | Train Acc: 0.2230 | Val Loss: 1.7463 | Val Acc: 0.3116
Validation loss improved from inf to 1.7463.
New best checkpoint saved to best_model_epoch_1_val_loss_1.7463.pt
Epoch [2/42] | Train Loss: 1.7995 | Train Acc: 0.2621 | Val Loss: 1.6962 | Val Acc: 0.3535
Validation loss improved from 1.7463 to 1.6962.
Deleted old checkpoint: best_model_epoch_1_val_loss_1.7463.pt
New best checkpoint saved to best_model_epoch_2_val_loss_1.6962.pt
Epoch [3/42] | Train Loss: 1.7202 | Train Acc: 0.2839 | Val Loss: 1.6779 | Val Acc: 0.3501
Validation loss improved from 1.6962 to 1.6779.
Deleted old checkpoint: best_model_epoch_2_val_loss_1.6962.pt
New best checkpoint saved to best_model_epoch_3_val_loss_1.6779.pt
Epoch [4/42] | Train Loss: 1.6785 | Train Acc: 0.3056 | Val Loss: 1.6624 | Val Acc: 0.4016
Validation loss improved from 1.6779 to 1.6624.
Deleted old checkpoint: best_model_epoch

In [43]:
# Lie Detector Function Cell
def lie_detector(statement, 
                 subjects, 
                 speaker_name, 
                 speaker_title, 
                 state, 
                 party_affiliation, 
                 history_barely_true, 
                 history_false, 
                 history_half_true, 
                 history_mostly_true, 
                 history_pants_fire, 
                 context_location):
    
    model = LieDetectorModel(
        vocab_size=actual_vocab_size,
        embedding_dim=75,
        hidden_dim_lstm=64,
        num_numerical_features=num_numerical_features,
        hidden_dim_fc=128,
        output_dim=num_classes,
        pad_idx=pad_idx
    )
    
    with open('best_model_epoch_39_val_loss_1.4285.pt', 'rb') as save_file:
     checkpoint = load(save_file, weights_only=False) # Load the full checkpoint dictionary
     model.load_state_dict(checkpoint['model_state_dict']) # Extract model's state_dict
    return ['pants-fire', 'false', 'barely-true', 'half-true', 'mostly-true', 'true'][model(numericalize_pad_text(statement, vocab, MAX_VOCAB_SIZE), torch.tensor([history_barely_true, history_false, history_half_true, history_mostly_true, history_pants_fire])).item()[1]]

In [44]:
# read file  
Data_File = pd.read_csv('eestec_hackathon_2025_train.tsv' ,sep = '\t',names=['ID', 'Label', 'Statement', 'Subjects', 'Speaker Name', 'Speaker Title', 'State', 'Party Affiliation', 'Credit History: barely-true', 'Credit History: false', 'Credit History: half-true', 'Credit History: mostly-true', 'Credit History: pants-fire', 'Context/Location'])
a, b, c, d, e, f, g, h, i, j, k, l = Data_File.iloc[0].values[2:]
lie_detector(a, b, c, d, e, f, g, h, i, j, k, l)

ValueError: too many dimensions 'str'