In [None]:

class LieDetectorModel(nn.Module):
    """Classifier combining a bidirectional-LSTM text encoding with numerical features.

    The token ids are embedded, run through a single-layer bidirectional LSTM,
    and summarised by the final hidden state of each direction. That summary
    is concatenated with the numerical feature vector and fed through a
    two-layer MLP (Linear -> ReLU -> Dropout(0.5) -> Linear) producing logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim_lstm: Hidden size of the LSTM, per direction.
        num_numerical_features: Width of the numerical feature vector.
        hidden_dim_fc: Width of the hidden fully connected layer.
        output_dim: Number of output logits.
        pad_idx: Token id passed to the embedding as ``padding_idx``.
    """

    def __init__(
        self,
        vocab_size,
        embedding_dim,
        hidden_dim_lstm,
        num_numerical_features,
        hidden_dim_fc,
        output_dim,
        pad_idx,
    ):
        super().__init__()

        # Text branch: token ids -> embeddings -> bidirectional LSTM.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=pad_idx,
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim_lstm,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
        )

        # One final hidden state per direction is kept, hence the factor 2.
        text_feature_dim = 2 * hidden_dim_lstm

        # Classification head over [text summary | numerical features].
        self.fc1 = nn.Linear(text_feature_dim + num_numerical_features, hidden_dim_fc)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(hidden_dim_fc, output_dim)

    def forward(self, text_data, numerical_data):
        """Compute class logits for a batch.

        Args:
            text_data: Token ids, shape (batch_size, seq_len).
            numerical_data: Numerical features,
                shape (batch_size, num_numerical_features).

        Returns:
            Logits of shape (batch_size, output_dim).
        """
        # (batch_size, seq_len) -> (batch_size, seq_len, embedding_dim)
        token_vectors = self.embedding(text_data)

        # Only the final hidden state is used; the per-step outputs and the
        # cell state are discarded. final_hidden has shape
        # (num_layers * 2, batch_size, hidden_dim_lstm).
        # NOTE: the sequence is not packed, so the LSTM also steps over any
        # padding positions before producing these final states.
        _, (final_hidden, _) = self.lstm(token_vectors)

        # The last two entries are the forward and backward directions of the
        # top layer; joined they give (batch_size, 2 * hidden_dim_lstm).
        forward_state = final_hidden[-2]
        backward_state = final_hidden[-1]
        text_features = torch.cat([forward_state, backward_state], dim=1)

        # Append the numerical features along the feature axis.
        fused = torch.cat([text_features, numerical_data], dim=1)

        activated = self.dropout(self.relu(self.fc1(fused)))
        return self.fc2(activated)

# --- Training Function ---
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one full pass over ``loader`` and return ``(mean_loss, accuracy)``.

    When ``optimizer`` is given the model is put in training mode and its
    parameters are updated after every batch; otherwise the model is put in
    evaluation mode and gradients are disabled.

    Each batch must be a mapping with 'text', 'numerical' and 'label' tensors.
    Both metrics are averaged over the samples actually seen, so the result
    stays correct when the loader drops or resamples items. If the loader
    yields no batches, ``(nan, nan)`` is returned instead of dividing by zero.
    """
    is_training = optimizer is not None
    model.train(is_training)  # train(False) is equivalent to eval()

    loss_sum = 0.0
    num_correct = 0
    num_samples = 0

    with torch.set_grad_enabled(is_training):
        for batch in loader:
            texts = batch['text'].to(device)
            numerical_feats = batch['numerical'].to(device)
            labels = batch['label'].to(device)

            if is_training:
                optimizer.zero_grad()

            outputs = model(texts, numerical_feats)
            loss = criterion(outputs, labels)

            if is_training:
                loss.backward()
                optimizer.step()

            # The batch loss is treated as a per-sample mean, so it is
            # re-weighted by the batch size before being accumulated.
            batch_size = labels.size(0)
            loss_sum += loss.item() * batch_size
            num_correct += (outputs.argmax(dim=1) == labels).sum().item()
            num_samples += batch_size

    if num_samples == 0:
        return float('nan'), float('nan')
    return loss_sum / num_samples, num_correct / num_samples


def train_model(model, train_loader, val_loader, optimizer, criterion, num_epochs, device):
    """
    Trains a PyTorch model and reports loss/accuracy after every epoch.

    Args:
        model (nn.Module): The neural network model to train. It is called as
            ``model(batch['text'], batch['numerical'])``.
        train_loader (DataLoader): Iterable of training batches.
        val_loader (DataLoader): Iterable of validation batches.
        optimizer (optim.Optimizer): The optimizer to use (e.g., Adam).
        criterion (nn.Module): The loss function (e.g., CrossEntropyLoss).
        num_epochs (int): The number of epochs to train for.
        device (torch.device): The device to train on ('cuda' or 'cpu').

    Returns:
        dict: Per-epoch metrics under the keys 'train_loss', 'train_acc',
        'val_loss' and 'val_acc'; each value is a list with one float per
        epoch.
    """
    model.to(device)  # Move model to the specified device

    history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

    for epoch in range(num_epochs):
        epoch_loss, epoch_acc_train = _run_epoch(
            model, train_loader, criterion, device, optimizer=optimizer
        )
        epoch_val_loss, epoch_acc_val = _run_epoch(
            model, val_loader, criterion, device
        )

        history['train_loss'].append(epoch_loss)
        history['train_acc'].append(epoch_acc_train)
        history['val_loss'].append(epoch_val_loss)
        history['val_acc'].append(epoch_acc_val)

        print(f"Epoch [{epoch+1}/{num_epochs}] | "
              f"Train Loss: {epoch_loss:.4f} | Train Acc: {epoch_acc_train:.4f} | "
              f"Val Loss: {epoch_val_loss:.4f} | Val Acc: {epoch_acc_val:.4f}")

    print("Finished Training")
    return history

if __name__ == '__main__':
    # Entry point: build the model, optimizer and loss, then train.
    # NOTE: this relies on names defined by the data-preparation code
    # (train_dataset, train_loader, val_loader, vocab, PAD_TOKEN,
    # actual_Vocab_Size, num_classes) and on hyperparameters (Embedding_Dim,
    # Hidden_Dim_LSTM, Hidden_Dim_FC, Learning_Rate, Num_Epochs) that are not
    # set in this block; they must already exist when it runs.

    # Determine device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Width of the numerical feature matrix held by the training dataset.
    num_numerical_features = train_dataset.numerical_features.shape[1]

    print(f"Number of numerical features being used: {num_numerical_features}")

    pad_idx = vocab[PAD_TOKEN]
    # Fixed: the model class defined in this file is LieDetectorModel; the
    # previously referenced SimpleTextNN does not exist and raised NameError.
    model = LieDetectorModel(
        vocab_size=actual_Vocab_Size,
        embedding_dim=Embedding_Dim,
        hidden_dim_lstm=Hidden_Dim_LSTM,
        num_numerical_features=num_numerical_features,
        hidden_dim_fc=Hidden_Dim_FC,
        output_dim=num_classes,
        pad_idx=pad_idx
    )

    # Define optimizer and loss function
    optimizer = optim.AdamW(model.parameters(), lr=Learning_Rate)
    # CrossEntropyLoss is suitable for multi-class classification
    criterion = nn.CrossEntropyLoss()

    # Start training
    print("Starting training...")
    train_model(model, train_loader, val_loader, optimizer, criterion, Num_Epochs, device)


In [None]:
from collections import Counter

import numpy as np
import pandas as pd
import spacy
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler # Optional for numerical features
from torch.utils.data import Dataset, DataLoader

# --- Dataset configuration ---
File_Path = 'eestec_hackathon_2025_train.tsv'
Text_Column = 'Statement'
Numerical_Columns = [
    'Credit History: barely-true', 'Credit History: false',
    'Credit History: half-true', 'Credit History: mostly-true',
    'Credit History: pants-fire'
]
Label_Column = 'Label'

# Column names assigned to the TSV, in file order. Built from the constants
# above so the names used for loading and for feature selection cannot drift.
Column_Names = [
    'ID', Label_Column, Text_Column, 'Subjects', 'Speaker Name',
    'Speaker Title', 'State', 'Party Affiliation',
    *Numerical_Columns,
    'Context/Location',
]

#Batch size
Batch_Size = 64

#Read file
# Because `names` is supplied without `header`, pandas treats the first line
# of the file as data, i.e. the file is assumed to have no header row.
Data_File = pd.read_csv(File_Path, sep='\t', names=Column_Names)

#Split up the data in the validation and training set
# Each split is copied so that columns added to it later are written to an
# independent frame rather than to a slice of Data_File.
Train_df, Val_df = (
    split.copy()
    for split in train_test_split(Data_File, test_size=0.2, random_state=42)
)


# Preprocess the text
# spaCy pipeline used for tokenization and lemmatization.
natural_language_processor = spacy.load("en_core_web_sm")

def tokenize_text(text):
    """Return the lower-cased lemmas of ``text``.

    ``text`` is converted with ``str()`` before being processed by the spaCy
    pipeline. Punctuation and whitespace tokens are left out of the result.
    """
    lemmas = []
    for token in natural_language_processor(str(text)):
        if token.is_punct or token.is_space:
            continue
        lemmas.append(token.lemma_.lower())
    return lemmas

#Now we build the vocabulary

# Token frequencies are counted over the training statements only.
word_count = Counter()
for statement in Train_df[Text_Column]:
    word_count.update(tokenize_text(statement))

# Two special tokens are reserved: PAD (id 0) fills sequences up to a common
# length, UNK (id 1) stands in for words that are not in the vocabulary.
Max_Vocab_Size = 10000
Max_Length = 50
PAD_TOKEN = "<pad>"
UNK_TOKEN = "<unk>"
vocab = {PAD_TOKEN: 0, UNK_TOKEN: 1}

# The most frequent words get ids right after the special tokens; the "- 2"
# keeps the total vocabulary size within Max_Vocab_Size.
for index, (word, _) in enumerate(word_count.most_common(Max_Vocab_Size - 2), start=len(vocab)):
    vocab[word] = index

# Fixed: `len[vocab]` subscripted the builtin and raised TypeError.
actual_Vocab_Size = len(vocab)

#Numericalize and pad/truncate text
def numericalize_pad_text(text, vocab_map, max_len):
    """Convert ``text`` into a list of token ids of length ``max_len``.

    The text is tokenized with ``tokenize_text``; each token is looked up in
    ``vocab_map`` and tokens that are missing get the id of ``UNK_TOKEN``.
    Sequences shorter than ``max_len`` are right-padded with the id of
    ``PAD_TOKEN``; all other sequences are cut to their first ``max_len`` ids.
    """
    token_ids = []
    for token in tokenize_text(text):
        token_ids.append(vocab_map.get(token, vocab_map[UNK_TOKEN]))

    shortfall = max_len - len(token_ids)
    if shortfall > 0:
        # Too short: fill the remainder with the padding id.
        return token_ids + [vocab_map[PAD_TOKEN]] * shortfall

    # Long enough already: keep only the leading max_len ids.
    return token_ids[:max_len]

# Apply to Statement column: every statement becomes a fixed-length list of
# token ids (length Max_Length).
Train_df['statement_numerical'] = Train_df[Text_Column].apply(lambda x: numericalize_pad_text(x, vocab, Max_Length))
Val_df['statement_numerical'] = Val_df[Text_Column].apply(lambda x: numericalize_pad_text(x, vocab, Max_Length))

# TODO: consider standardizing the numerical features. Fit on the training
# split only and reuse the fitted scaler for validation:
#scaler = StandardScaler()
#Train_df[Numerical_Columns] = scaler.fit_transform(Train_df[Numerical_Columns])
#Val_df[Numerical_Columns] = scaler.transform(Val_df[Numerical_Columns]) # Use transform for validation

# Mapping the labels to numbers
# Ids are assigned in order of first appearance in the training split.
unique_labels = Train_df[Label_Column].astype(str).unique()
label_to_idx = {label: i for i, label in enumerate(unique_labels)}
idx_to_label = {i: label for label, i in label_to_idx.items()}
num_classes = len(unique_labels)

#Use mapping for data
Train_df['label_idx'] = Train_df[Label_Column].astype(str).map(label_to_idx)
Val_df['label_idx'] = Val_df[Label_Column].astype(str).map(label_to_idx)

# A validation label that never occurs in the training split has no id and
# maps to NaN. Fail loudly here instead of letting the NaN be converted into
# a meaningless class index further down the pipeline.
_unseen_label_mask = Val_df['label_idx'].isna()
if _unseen_label_mask.any():
    _unseen_labels = sorted(set(Val_df.loc[_unseen_label_mask, Label_Column].astype(str)))
    raise ValueError(
        f"Validation labels missing from the training split: {_unseen_labels}"
    )

#Verify please 

class TextNumericalDataset(Dataset):
    """Dataset serving token-id sequences, numerical features and labels.

    All three parts are converted to tensors once, at construction time.

    Args:
        dataframe: Source table.
        text_col_numerical: Name of the column holding one equal-length list
            of token ids per row.
        numerical_cols_list: Names of the numerical feature columns.
        label_col_idx: Name of the column holding integer class indices.
    """

    def __init__(self, dataframe, text_col_numerical, numerical_cols_list, label_col_idx):
        token_id_rows = dataframe[text_col_numerical].tolist()
        feature_matrix = dataframe[numerical_cols_list].to_numpy()
        label_array = dataframe[label_col_idx].to_numpy()

        self.texts = torch.tensor(token_id_rows, dtype=torch.long)
        self.numerical_features = torch.tensor(feature_matrix, dtype=torch.float32)
        # Labels are stored as long integers, i.e. class indices.
        self.labels = torch.tensor(label_array, dtype=torch.long)

    def __len__(self):
        """Return the number of samples (one per label)."""
        return self.labels.shape[0]

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict with 'text', 'numerical' and 'label'."""
        sample = {}
        sample['text'] = self.texts[idx]
        sample['numerical'] = self.numerical_features[idx]
        sample['label'] = self.labels[idx]
        return sample

# Wrap each split in a dataset; both use the same text, feature and label columns.
train_dataset, val_dataset = (
    TextNumericalDataset(frame, 'statement_numerical', Numerical_Columns, 'label_idx')
    for frame in (Train_df, Val_df)
)

#Data loader
# Only the training loader shuffles; validation batches keep the dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=Batch_Size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=Batch_Size, shuffle=False)








