In [1]:
import pandas as pd
import json
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from pathlib import Path
from tqdm import tqdm

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed PyTorch's global RNG so weight initialisation and DataLoader shuffling
# are repeatable after "Restart Kernel -> Run All".
SEED = 42
torch.manual_seed(SEED)

# Hyperparameters
BATCH_SIZE = 64        # samples per mini-batch for every DataLoader
NUM_EPOCHS = 10        # full passes over the training set
LEARNING_RATE = 1e-3   # Adam step size
EMB_DIM = 64           # width of the POS-tag embedding vectors
HID_DIM = 128          # RNN hidden size per direction
MAX_LEN = 200          # NOTE(review): not referenced in the cells shown here; presumably the padded sequence length used when the TSVs were built - confirm

In [2]:
# Input artefacts: one tab-separated file per split plus the POS-tag vocabulary.
TRAIN_FILE = Path('train_syntactic.tsv')
VAL_FILE = Path('val_syntactic.tsv')
TEST_FILE = Path('test_syntactic.tsv')
VOCAB_FILE = Path('pos_vocab.json')

# The vocabulary is a JSON object; its '<PAD>' entry gives the padding index
# and its number of entries is the embedding table size.
pos_vocab = json.loads(VOCAB_FILE.read_text(encoding='utf-8'))
PAD_IDX = pos_vocab['<PAD>']
VOCAB_SIZE = len(pos_vocab)

# Train and test frames are read here; the validation frame is read further down.
train_df, test_df = (
    pd.read_csv(tsv_path, sep='\t') for tsv_path in (TRAIN_FILE, TEST_FILE)
)

In [3]:
class SyntacticDataset(Dataset):
    """Dataset of (POS-id sequence, region label) pairs built from a DataFrame.

    Args:
        df: frame with a 'pos_ids' column holding JSON-encoded lists of
            integer ids and a 'region' column holding the class name.
        label_map: mapping from region name to integer class index.

    Raises:
        ValueError: if any 'region' value in ``df`` has no entry in
            ``label_map``. Without this check ``Series.map`` would silently
            turn such rows into NaN labels that only break later, inside
            ``__getitem__``.
    """

    def __init__(self, df, label_map):
        # Each cell is a JSON string such as "[3, 7, 1]"; decode once up front.
        self.ids = df['pos_ids'].apply(json.loads).tolist()

        mapped = df['region'].map(label_map)
        unmapped = mapped.isna().to_numpy()
        if unmapped.any():
            unknown = sorted({str(r) for r in df['region'].loc[unmapped]})
            raise ValueError(
                f"region value(s) missing from label_map: {unknown}"
            )
        self.labels = mapped.astype('int64').tolist()

    def __len__(self):
        """Number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(ids, label)`` for example ``idx`` as int64 tensors."""
        ids = torch.tensor(self.ids[idx], dtype=torch.long)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return (ids, label)

# Class inventory comes from the training split only; a region's position in
# the sorted list is its integer label.
regions = list(train_df['region'].unique())
regions.sort()
label_map = dict(zip(regions, range(len(regions))))

df_val = pd.read_csv(VAL_FILE, sep='\t')

# One dataset per split, all sharing the same label mapping.
train_ds, val_ds, test_ds = (
    SyntacticDataset(frame, label_map) for frame in (train_df, df_val, test_df)
)

# Only the training loader shuffles; validation and test keep file order.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader, test_loader = (
    DataLoader(split_ds, batch_size=BATCH_SIZE) for split_ds in (val_ds, test_ds)
)

In [4]:
class SyntacticRNN(nn.Module):
    """Single-layer bidirectional ReLU RNN classifier over POS-id sequences.

    Token ids are embedded, run through the RNN, and the final forward and
    backward hidden states are concatenated and projected to ``out_dim``
    logits.

    Padding is excluded from the recurrence: each row is packed up to its
    last non-pad token, so the logits for a sequence do not depend on how
    much trailing padding the batch carries.

    Args:
        vocab_size: number of rows in the embedding table.
        emb_dim: embedding width.
        hid_dim: RNN hidden size per direction.
        out_dim: number of output classes.
        pad_idx: id of the padding token; its embedding is kept at zero and
            it is used to work out each row's true length.
    """

    def __init__(self, vocab_size, emb_dim, hid_dim, out_dim, pad_idx):
        super().__init__()
        # Plain attribute (not a buffer) so existing state_dicts still load.
        self.pad_idx = pad_idx
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_idx)
        self.rnn = nn.RNN(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=1,
            nonlinearity='relu',
            bidirectional=True,
            batch_first=True
        )
        # Two directions are concatenated, hence hid_dim * 2 input features.
        self.fc = nn.Linear(hid_dim*2, out_dim)

    def forward(self, x):
        """Return class logits of shape (batch, out_dim) for ids ``x`` of shape (batch, seq)."""
        emb = self.embedding(x)

        # Length of each row = position of its last non-pad token (1-based).
        # Using the last non-pad position rather than a plain count means no
        # real token is ever cut off, whichever side the padding is on.
        # Rows that are entirely padding are clamped to length 1 because
        # packing rejects zero-length sequences.
        positions = torch.arange(1, x.size(1) + 1, device=x.device)
        lengths = ((x != self.pad_idx) * positions).max(dim=1).values.clamp(min=1)

        # pack_padded_sequence requires the lengths on the CPU;
        # enforce_sorted=False lets the batch stay in its original order.
        packed = nn.utils.rnn.pack_padded_sequence(
            emb, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, h = self.rnn(packed)

        # h is (num_directions, batch, hid_dim): [-2] forward, [-1] backward.
        h_fwd = h[-2]
        h_bwd = h[-1]
        h_cat = torch.cat([h_fwd, h_bwd], dim=1)
        return self.fc(h_cat)

# Build the classifier with one output per training region and move it to DEVICE.
model = SyntacticRNN(
    vocab_size=VOCAB_SIZE,
    emb_dim=EMB_DIM,
    hid_dim=HID_DIM,
    out_dim=len(regions),
    pad_idx=PAD_IDX,
)
model = model.to(DEVICE)

# Adam over all model parameters; cross-entropy on the raw logits.
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()

In [5]:
# Train for NUM_EPOCHS epochs, reporting the training loss and the validation
# accuracy / weighted F1 after each one.
for epoch in range(1, NUM_EPOCHS+1):
    # ---- training pass ----
    model.train()
    train_losses = []          # per-batch mean losses
    loss_sum, n_seen = 0.0, 0  # running totals for a per-sample average
    for X, y in tqdm(train_loader, desc=f"Epoch {epoch}/{NUM_EPOCHS} [Train]"):
        X, y = X.to(DEVICE), y.to(DEVICE)
        optimizer.zero_grad()
        logits = model(X)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        train_losses.append(batch_loss)
        # criterion returns the mean over the batch; weight it by the batch
        # size so a short final batch is not over-represented in the average.
        loss_sum += batch_loss * y.size(0)
        n_seen += y.size(0)
    # An empty loader yields NaN instead of a ZeroDivisionError.
    avg_loss = loss_sum / n_seen if n_seen else float('nan')

    # ---- validation pass ----
    model.eval()
    val_preds, val_trues = [], []
    with torch.no_grad():
        for X, y in tqdm(val_loader, desc="Validating"):
            X = X.to(DEVICE)
            preds = model(X).argmax(dim=1).cpu().tolist()
            val_preds.extend(preds)
            val_trues.extend(y.tolist())
    acc = accuracy_score(val_trues, val_preds)
    # zero_division=0 keeps the same numeric result as the default but
    # silences the per-epoch warning for classes that were never predicted.
    prec, rec, f1, _ = precision_recall_fscore_support(
        val_trues, val_preds, average='weighted', zero_division=0
    )
    print(f"Epoch {epoch} | Loss: {avg_loss:.4f} | Val Acc: {acc:.4f} | Val F1: {f1:.4f}")

Epoch 1/10 [Train]:   0%|          | 3/1466 [00:16<2:10:26,  5.35s/it]


KeyboardInterrupt: 

In [None]:
# Evaluate the current model on the held-out test split.
model.eval()
test_preds, test_trues = [], []
with torch.no_grad():
    for X, y in tqdm(test_loader, desc="Testing"):
        X = X.to(DEVICE)
        preds = model(X).argmax(dim=1).cpu().tolist()
        test_preds.extend(preds)
        test_trues.extend(y.tolist())

# Compute metrics
acc = accuracy_score(test_trues, test_preds)
# zero_division=0 keeps the same numeric result as the default but silences
# the warning raised for classes the model never predicts.
prec, rec, f1, _ = precision_recall_fscore_support(
    test_trues, test_preds, average='weighted', zero_division=0
)
print(f"Test → Acc: {acc:.4f}, Prec: {prec:.4f}, Rec: {rec:.4f}, F1: {f1:.4f}")

# Export per-example predictions. test_loader does not shuffle, so
# test_preds is in the same row order as test_df.
# NOTE(review): the 'stest_' prefix in the file name may be a typo for 'test_' - confirm before relying on it.
output = pd.DataFrame({
    'textid': test_df['textid'],
    'true_region': test_df['region'],
    'pred_region': [regions[i] for i in test_preds]
})
output.to_csv('stest_predictions.tsv', sep='\t', index=False)