In [3]:
import numpy as np
import pandas as pd
import spacy
import torch
from torch import nn, tensor, save, load
from torch.utils.data import Dataset, DataLoader
from collections import Counter
from torch import optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler # Optional for numerical features

In [45]:
FILE_PATH = 'eestec_hackathon_2025_train.tsv'
HISTORY_COLUMNS = [
    'Credit History: barely-true', 'Credit History: false',
    'Credit History: half-true', 'Credit History: mostly-true',
    'Credit History: pants-fire'
]
STATEMENT_COLUMN = 'Statement'
LABEL_COLUMN = 'Label'
STATE_COLUMN = 'State'
SPEAKER_COLUMN = 'Speaker Name'
CONTEXT_COLUMN = 'Context/Location'

STATEMENT_NUMERICAL = 'statement_numerical'
CONTEXT_NUMERICAL = 'context_numerical'
SPEAKER_NUMERICAL = 'speaker_numerical'
STATE_NUMERICAL = 'state_numerical'
LABEL_NUMERICAL = 'label_numerical'

# batch size 
Batch_Size = 64

# read file  
Data_File = pd.read_csv(FILE_PATH ,sep = '\t',names=['ID', 'Label', 'Statement', 'Subjects', 'Speaker Name', 'Speaker Title', 'State', 'Party Affiliation', 'Credit History: barely-true', 'Credit History: false', 'Credit History: half-true', 'Credit History: mostly-true', 'Credit History: pants-fire', 'Context/Location'])

# preprocess the text
spacy.cli.download("en_core_web_sm")
natural_language_processor = spacy.load("en_core_web_sm")

def tokenize_text(text):
    #Lemmatize every token in set, after tokenizaziotn 
    return [token.lemma_.lower() for token in natural_language_processor(str(text)) if not token.is_punct and not token.is_space and not token.is_stop and not token.is_alpha]

# build the vocabulary for the statement
word_count = Counter()
for statement in Data_File[STATEMENT_COLUMN]:
    word_count.update(tokenize_text(statement))

# build the vocabulary for the context/location
context_count = Counter()
for context in Data_File[CONTEXT_COLUMN]:
    context_count.update(tokenize_text(context))

# add PAD token to make same length and UNK (unkown for words not in the vocab)
MAX_STATEMENT_SIZE = 10000
MAX_STATEMENT_LENGTH = 50
PAD_TOKEN = "<pad>"
UNK_TOKEN = "<unk>"
vocab_map = {PAD_TOKEN: 0, UNK_TOKEN: 1}
for i, (word, count) in enumerate(word_count.most_common(MAX_STATEMENT_SIZE- 2)): # -2 for pad and unk
    vocab_map[word] = i + 2
vocab_size = len(vocab_map)

MAX_CONTEXT_SIZE = 500
MAX_CONTEXT_LENGTH = 10
context_map = {PAD_TOKEN: 0, UNK_TOKEN: 1}
for i, (context, count) in enumerate(context_count.most_common(MAX_CONTEXT_SIZE-2)):
    context_map[context] = i+2
context_size = len(context_map)

# numericalize and pad/truncuate text
def numericalize_pad_text(text, vocab, max_length):
    tokens = tokenize_text(text)
    numericalized = [vocab.get(token, vocab[UNK_TOKEN]) for token in tokens]
    if len(numericalized) < max_length:
        # Pad with PAD_TOKEN's index
        numericalized.extend([vocab[PAD_TOKEN]] * (max_length - len(numericalized)))
    else:
        # Truncate
        numericalized = numericalized[:max_length]
    return numericalized

# numericalize the statements into a new 'statement_numerical' and 'context_numerical' columns
Data_File[STATEMENT_NUMERICAL] = Data_File[STATEMENT_COLUMN].apply(lambda statement: numericalize_pad_text(text=statement, vocab=vocab_map, max_length=MAX_STATEMENT_LENGTH))
Data_File[CONTEXT_NUMERICAL] = Data_File[CONTEXT_COLUMN].apply(lambda context: numericalize_pad_text(text=context, vocab=context_map, max_length=MAX_CONTEXT_LENGTH))

# map the labels to numerical values
unique_labels = Data_File[LABEL_COLUMN].astype(str).unique()
label_to_idx = {label: i for i, label in enumerate(unique_labels)}
idx_to_label = {i: label for label, i in label_to_idx.items()}
num_classes = len(unique_labels)

# map the states to numerical values
unique_states = Data_File[STATE_COLUMN].astype(str).unique()
state_to_idx = {state: i for i, state in enumerate(unique_states)}
idx_to_state = {i: state for state, i in state_to_idx.items()}
num_states = len(unique_states)

# map the speakers to numerical values
unique_speakers = Data_File[SPEAKER_COLUMN].astype(str).unique()
speaker_to_idx = {speaker: i for i, speaker in enumerate(unique_speakers)}
idx_to_speaker = {i: speaker for speaker, i in speaker_to_idx.items()}
num_speakers = len(unique_speakers)

# use mappings for data in new 'label_idx', 'state_idx', 'speaker_idx' column
Data_File[LABEL_NUMERICAL] = Data_File[LABEL_COLUMN].astype(str).map(label_to_idx)
Data_File[STATE_NUMERICAL] = Data_File[STATE_COLUMN].astype(str).map(state_to_idx)
Data_File[SPEAKER_NUMERICAL] = Data_File[SPEAKER_COLUMN].astype(str).map(speaker_to_idx)

for col in HISTORY_COLUMNS:
    Data_File[col] = pd.to_numeric(Data_File[col], errors='coerce')
    
    if Data_File[col].isnull().any():
        Data_File[col] = Data_File[col].fillna(0)

# scale the credit history values
# scaler = StandardScaler()
# Data_File[Numerical_Columns] = scaler.fit_transform(Data_File[Numerical_Columns])

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [46]:
class LieDetectorModel(nn.Module):
    def __init__(self, vocab_size, context_size, statement_embedding_dim, context_embedding_dim, speaker_embedding_dim, states_embedding_dim, hidden_dim_lstm1, hidden_dim_lstm2, num_history_features, num_speakers, num_states, hidden_dim_fc, output_dim, pad_idx):
        super(LieDetectorModel, self).__init__()
        # Embedding layer for statements
        self.statement_embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=statement_embedding_dim, padding_idx=pad_idx)
        # Embedding layer for contexts
        self.context_embedding = nn.Embedding(num_embeddings=context_size, embedding_dim=context_embedding_dim, padding_idx=pad_idx)
        # LSTM layer to process sequential statement data
        self.lstm1 = nn.LSTM(statement_embedding_dim, hidden_dim_lstm1, batch_first=True, num_layers=1, bidirectional=True) # Using bidirectional LSTM
        # LSTM layer to process sequential statement data
        self.lstm2 = nn.LSTM(context_embedding_dim, hidden_dim_lstm2, batch_first=True, num_layers=1, bidirectional=True) # Using bidirectional LSTM
        
        self.speaker_embedding = nn.Embedding(num_embeddings=num_speakers, embedding_dim=speaker_embedding_dim)
        self.states_embedding = nn.Embedding(num_embeddings=num_states, embedding_dim=states_embedding_dim)

        lstm_output_features = hidden_dim_lstm1 * 2 + hidden_dim_lstm2 * 2
        combined_dim = lstm_output_features + num_history_features + speaker_embedding_dim + states_embedding_dim

        self.fc1 = nn.Linear(combined_dim, hidden_dim_fc)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5) 
        self.fc2 = nn.Linear(hidden_dim_fc, output_dim)

    def forward(self, statement_data, context_data, history_data, speaker_data, state_data):
        embedded_statements = self.statement_embedding(statement_data)
        embedded_contexts = self.context_embedding(context_data)
        embedded_speakers = self.speaker_embedding(speaker_data)
        embedded_states = self.states_embedding(state_data)
        lstm1_out, (hidden1, cell) = self.lstm1(embedded_statements)
        lstm2_out, (hidden2, cell) = self.lstm2(embedded_contexts)
        hidden1_combined = torch.cat((hidden1[-2,:,:], hidden1[-1,:,:]), dim=1)
        hidden2_combined = torch.cat((hidden2[-2,:,:], hidden2[-1,:,:]), dim=1)
        combined_features = torch.cat((hidden1_combined, hidden2_combined, history_data, embedded_speakers, embedded_states), dim=1)
        x = self.fc1(combined_features)
        x = self.relu(x)
        x = self.dropout(x)
        logits = self.fc2(x)
        # logits shape: (batch_size, output_dim)
        return logits


def train_model(model, train_loader, val_loader, optimizer, criterion, num_epochs, device):
    """
    Args:
        model (nn.Module): The neural network model to train.
        train_loader (DataLoader): DataLoader for the training set.
        val_loader (DataLoader): DataLoader for the validation set.
        optimizer (optim.Optimizer): The optimizer to use (e.g., Adam).
        criterion (nn.Module): The loss function (e.g., CrossEntropyLoss).
        num_epochs (int): The number of epochs to train for.
        device (torch.device): The device to train on ('cuda' or 'cpu').
    """
    model.to(device)
    for epoch in range(num_epochs):
        #Go to training
        model.train()
        running_loss = 0.0
        correct_predictions_train = 0
        total_samples_train = 0

        for i, batch in enumerate(train_loader):
            statements = batch['statement'].to(device)
            history_feats = batch['history'].to(device)
            labels = batch['label'].to(device)
            speakers = batch['speaker'].to(device)
            states = batch['state'].to(device)
            contexts = batch['context'].to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward pass
            outputs = model(statements, contexts, history_feats, speakers, states)
            loss = criterion(outputs, labels)

            # backward pass and optimize
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            running_loss += loss.item() * statements.size(0)

            # calculate training accuracy
            _, predicted = torch.max(outputs.data, 1)
            total_samples_train += labels.size(0)
            correct_predictions_train += (predicted == labels).sum().item()

        epoch_loss = running_loss / len(train_loader.dataset)
        epoch_acc_train = correct_predictions_train / total_samples_train

        # go to validation
        model.eval()
        running_val_loss = 0.0
        correct_predictions_val = 0
        total_samples_val = 0
        # no grad for validation
        with torch.no_grad(): 
            for batch in val_loader:
                statements = batch['statement'].to(device)
                history_feats = batch['history'].to(device)
                labels = batch['label'].to(device)
                speakers = batch['speaker'].to(device)
                states = batch['state'].to(device)
                contexts = batch['context'].to(device)

                outputs = model(statements, contexts, history_feats, speakers, states)
                loss = criterion(outputs, labels)
                running_val_loss += loss.item() * statements.size(0)

                _, predicted = torch.max(outputs.data, 1)
                total_samples_val += labels.size(0)
                correct_predictions_val += (predicted == labels).sum().item()

        epoch_val_loss = running_val_loss / len(val_loader.dataset)
        epoch_acc_val = correct_predictions_val / total_samples_val

        print(f"Epoch [{epoch+1}/{num_epochs}] | "
              f"Train Loss: {epoch_loss:.4f} | Train Acc: {epoch_acc_train:.4f} | "
              f"Val Loss: {epoch_val_loss:.4f} | Val Acc: {epoch_acc_val:.4f}")

    print("Finished Training")

In [48]:
NUM_EPOCHS = 120

# split up the data in the validation and training set
Train_df, Val_df = train_test_split(Data_File, test_size=0.2, random_state=42)

# dataset for statements (processing textual data into numerical data)
class TextNumericalDataset(Dataset):
    def __init__(self, dataframe):
        self.statements = torch.tensor(list(dataframe[STATEMENT_NUMERICAL].values), dtype=torch.long)
        self.history_features = torch.tensor(dataframe[HISTORY_COLUMNS].values, dtype=torch.float32)
        self.labels = torch.tensor(dataframe[LABEL_NUMERICAL].values, dtype=torch.long)
        self.speakers = torch.tensor(dataframe[SPEAKER_NUMERICAL].values, dtype=torch.long)
        self.states = torch.tensor(dataframe[STATE_NUMERICAL].values, dtype=torch.long)
        self.contexts = torch.tensor(list(dataframe[CONTEXT_NUMERICAL].values), dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'statement': self.statements[idx],
            'history': self.history_features[idx],
            'label': self.labels[idx],
            'speaker': self.speakers[idx],
            'state': self.states[idx],
            'context': self.contexts[idx]
        }

train_dataset = TextNumericalDataset(Train_df)
val_dataset = TextNumericalDataset(Val_df)

# data loader for both datasets
train_loader = DataLoader(train_dataset, batch_size=Batch_Size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=Batch_Size, shuffle=False)

if __name__ == '__main__':
    # determine device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    num_history_features = train_dataset.history_features.shape[1]
    print(f"Number of numerical features being used: {num_history_features}")
    pad_idx = vocab_map[PAD_TOKEN]
    model = LieDetectorModel(
        vocab_size=vocab_size,
        context_size=context_size,
        statement_embedding_dim=100,
        context_embedding_dim=16,
        speaker_embedding_dim=15,
        states_embedding_dim=12,
        hidden_dim_lstm1=64,
        hidden_dim_lstm2=16,
        num_history_features=num_history_features,
        num_speakers=num_speakers,
        num_states=num_states,
        hidden_dim_fc=128,
        output_dim=num_classes,
        pad_idx=pad_idx
    )

    # Define optimizer and loss function
    optimizer = optim.AdamW(model.parameters(), lr=0.0001)  # Using AdamW optimizer
    # CrossEntropyLoss is suitable for multi-class classification
    criterion = nn.CrossEntropyLoss()

    # start training
    print("Starting training...")
    train_model(model, train_loader, val_loader, optimizer, criterion, NUM_EPOCHS, device)

Using device: cpu
Number of numerical features being used: 5
Starting training...
Epoch [1/120] | Train Loss: 2.2410 | Train Acc: 0.1657 | Val Loss: 1.8448 | Val Acc: 0.2116
Epoch [2/120] | Train Loss: 1.9441 | Train Acc: 0.2253 | Val Loss: 1.7509 | Val Acc: 0.2536
Epoch [3/120] | Train Loss: 1.8634 | Train Acc: 0.2388 | Val Loss: 1.7353 | Val Acc: 0.2579
Epoch [4/120] | Train Loss: 1.7936 | Train Acc: 0.2532 | Val Loss: 1.7149 | Val Acc: 0.2942
Epoch [5/120] | Train Loss: 1.7698 | Train Acc: 0.2511 | Val Loss: 1.7131 | Val Acc: 0.3107
Epoch [6/120] | Train Loss: 1.7370 | Train Acc: 0.2666 | Val Loss: 1.7022 | Val Acc: 0.2955
Epoch [7/120] | Train Loss: 1.7194 | Train Acc: 0.2750 | Val Loss: 1.7006 | Val Acc: 0.2756
Epoch [8/120] | Train Loss: 1.7057 | Train Acc: 0.2782 | Val Loss: 1.6870 | Val Acc: 0.3211
Epoch [9/120] | Train Loss: 1.6875 | Train Acc: 0.2993 | Val Loss: 1.6815 | Val Acc: 0.3107
Epoch [10/120] | Train Loss: 1.6831 | Train Acc: 0.2897 | Val Loss: 1.6769 | Val Acc: 0.32

KeyboardInterrupt: 