In [3]:
#Lexical RNN
import json
from pathlib import Path

import pandas as pd
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split

# Prefer the GPU when one is available; otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 64
NUM_EPOCHS = 10
LEARNING_RATE = 1e-3

# Seed torch's global RNG so that weight initialisation and DataLoader
# shuffling are repeatable across "Restart Kernel -> Run All".
SEED = 42
torch.manual_seed(SEED)

In [4]:
# Locations of the pre-processed data splits and of the token vocabulary.
TRAIN_FILE = Path("train_trainready.tsv")
VAL_FILE = Path("val_trainready.tsv")
TEST_FILE = Path("test_trainready.tsv")
VOCAB_FILE = Path("vocab.json")

# The vocabulary is a JSON object; the "<PAD>" entry gives the padding index
# and the number of entries gives the embedding-table size.
vocab = json.loads(VOCAB_FILE.read_text(encoding="utf-8"))
VOCAB_SIZE = len(vocab)
PAD_IDX = vocab["<PAD>"]

In [6]:
class LexicalDataset(Dataset):
    """Token-id sequences and region labels read from a tab-separated file.

    The file must provide an ``input_ids`` column holding JSON-encoded lists
    of token ids and a ``region`` column holding the label of each row.

    Args:
        path: Location of the TSV file (anything ``pd.read_csv`` accepts).
        label_map: Mapping from region label to integer class index.

    Raises:
        ValueError: If a row's region has no entry in ``label_map``.
    """

    def __init__(self, path, label_map):
        df = pd.read_csv(path, sep="\t")
        self.inputs = df["input_ids"].apply(json.loads).tolist()
        labels = df["region"]
        mapped = labels.map(label_map)
        # Series.map yields NaN for labels absent from the mapping. Fail here
        # with a clear message instead of later, inside a DataLoader worker.
        missing = mapped.isna()
        if missing.any():
            unknown = labels[missing].unique().tolist()
            raise ValueError(f"Regions in {path} missing from label_map: {unknown}")
        self.labels = mapped.astype("int64").tolist()

    def __len__(self):
        """Return the number of rows read from the file."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(input_ids, label)`` for row ``idx`` as ``torch.long`` tensors."""
        # NOTE(review): default DataLoader collation needs equal-length
        # sequences; presumably the "trainready" files are pre-padded - confirm.
        return (
            torch.tensor(self.inputs[idx], dtype=torch.long),
            torch.tensor(self.labels[idx], dtype=torch.long),
        )

# Derive the region -> class-index mapping from the training split only.
train_df = pd.read_csv(TRAIN_FILE, sep="\t")
# NOTE(review): second read of the same training file; kept because cells
# outside this view may reference `train_full`.
train_full = pd.read_csv("train_trainready.tsv", sep="\t")

regions = sorted(train_df["region"].unique())
region2idx = dict(zip(regions, range(len(regions))))

# One dataset per split, all sharing the mapping built from the train split.
train_ds, val_ds, test_ds = (
    LexicalDataset(split_path, region2idx)
    for split_path in (TRAIN_FILE, VAL_FILE, TEST_FILE)
)

# Only the training loader shuffles; evaluation keeps the file's row order.
train_loader = DataLoader(train_ds, shuffle=True, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_ds, shuffle=False, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_ds, shuffle=False, batch_size=BATCH_SIZE)





In [4]:
class LexicalRNN(nn.Module):
    """Bidirectional single-layer ReLU RNN classifier over token-id sequences.

    Token ids are embedded, run through the RNN, and the final hidden states
    of the two directions are concatenated and projected to class logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        emb_dim: Embedding dimension.
        hid_dim: Hidden size of each RNN direction.
        out_dim: Number of output classes.
        pad_idx: Token id used for padding, or ``None`` for no padding.
    """

    def __init__(self, vocab_size, emb_dim, hid_dim, out_dim, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_idx)
        self.rnn = nn.RNN(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=1,
            nonlinearity="relu",
            bidirectional=True,
            batch_first=True
        )
        self.fc = nn.Linear(hid_dim * 2, out_dim)

    def forward(self, x):
        """Return class logits of shape ``(batch, out_dim)`` for ``x`` of shape ``(batch, seq_len)``.

        Padding positions are excluded from the recurrence, so the final
        hidden states are taken at each sequence's last real token.
        Assumes padding is trailing (right-padded batches) - confirm against
        the preprocessing that produced the input files.
        """
        emb = self.embedding(x)
        pad_idx = self.embedding.padding_idx  # already normalised by nn.Embedding
        if pad_idx is None:
            _, h = self.rnn(emb)
        else:
            # Length = 1-based position of the last non-pad token in each row.
            positions = torch.arange(1, x.size(1) + 1, device=x.device)
            lengths = ((x != pad_idx).long() * positions).max(dim=1).values
            # pack_padded_sequence rejects zero lengths and needs CPU lengths.
            lengths = lengths.clamp(min=1).cpu()
            packed = nn.utils.rnn.pack_padded_sequence(
                emb, lengths, batch_first=True, enforce_sorted=False
            )
            _, h = self.rnn(packed)
        # h holds the last layer's forward state at [-2] and backward at [-1].
        h_fwd = h[-2]
        h_bwd = h[-1]
        h_cat = torch.cat([h_fwd, h_bwd], dim=1)
        return self.fc(h_cat)

# Hyper-parameters of the network; the class count follows the label map.
EMB_DIM = 300
HID_DIM = 128
NUM_CLASSES = len(region2idx)

# Build the classifier and move its parameters to the selected device.
model = LexicalRNN(
    vocab_size=VOCAB_SIZE,
    emb_dim=EMB_DIM,
    hid_dim=HID_DIM,
    out_dim=NUM_CLASSES,
    pad_idx=PAD_IDX,
)
model = model.to(DEVICE)

# Adam over all model parameters, trained with cross-entropy on the logits.
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()


In [5]:
from tqdm import tqdm

# One pass over the training split per epoch, followed by a validation pass
# that reports weighted F1 and accuracy.
for epoch in range(1, NUM_EPOCHS + 1):
    model.train()
    train_losses = []
    train_sizes = []
    loop = tqdm(train_loader, desc=f"Epoch {epoch}/{NUM_EPOCHS} [Train]", leave=False)
    for X, y in loop:
        X, y = X.to(DEVICE), y.to(DEVICE)
        optimizer.zero_grad()
        logits = model(X)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()
        batch_loss = loss.item()
        train_losses.append(batch_loss)
        train_sizes.append(y.size(0))
        loop.set_postfix(train_loss=batch_loss)

    # Weight each batch mean by its size so a smaller final batch does not
    # skew the epoch average; max(..., 1) guards against an empty loader.
    epoch_loss = sum(l * n for l, n in zip(train_losses, train_sizes)) / max(sum(train_sizes), 1)

    model.eval()
    val_preds, val_trues = [], []
    loop = tqdm(val_loader, desc=f"Epoch {epoch}/{NUM_EPOCHS} [Val]  ", leave=False)
    with torch.no_grad():
        for X, y in loop:
            X = X.to(DEVICE)
            preds = model(X).argmax(dim=1).cpu().tolist()
            val_preds.extend(preds)
            val_trues.extend(y.tolist())

    acc = accuracy_score(val_trues, val_preds)
    # zero_division=0 scores never-predicted classes as 0 without emitting an
    # undefined-metric warning on every epoch.
    prec, rec, f1, _ = precision_recall_fscore_support(
        val_trues, val_preds, average="weighted", zero_division=0
    )
    print(f"Epoch {epoch} ▶ Train Loss: {epoch_loss:.4f} | Val F1: {f1:.4f} (Acc: {acc:.4f})")


                                                                                       

KeyboardInterrupt: 

In [1]:
# Evaluate the trained model on the held-out test split.
# Requires `model`, `test_loader`, `region2idx` and `TEST_FILE` from the
# earlier cells to exist in the current kernel session.
model.eval()
test_preds, test_trues = [], []
with torch.no_grad():
    for X, y in test_loader:
        X = X.to(DEVICE)
        preds = model(X).argmax(dim=1).cpu().tolist()
        test_preds.extend(preds)
        test_trues.extend(y.tolist())

acc = accuracy_score(test_trues, test_preds)
prec, rec, f1, _ = precision_recall_fscore_support(
    test_trues, test_preds, average="weighted", zero_division=0
)
print(f"Test Performance → Acc: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1:.4f}")

# Save
# `test_ds` is a Dataset indexed by integer position, so the id and label
# columns are read from the test file itself. `test_loader` does not shuffle,
# so predictions line up with the file's row order.
# NOTE(review): assumes the test TSV has a "textid" column - confirm.
test_df = pd.read_csv(TEST_FILE, sep="\t")
idx2region = {idx: region for region, idx in region2idx.items()}
output = pd.DataFrame({
    "textid": test_df["textid"],
    "true_region": test_df["region"],
    "pred_region": [idx2region[i] for i in test_preds]
})
output.to_csv("test_predictions.tsv", sep="\t", index=False)


NameError: name 'model' is not defined