In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [11]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cpu')

In [3]:
def preprocess_text(text_series):
    """Normalize a Series of raw text into lowercase, single-spaced tokens.

    Processing steps, in order:
      1. missing values (NaN/None) become empty strings,
      2. every value is converted to ``str`` and lowercased,
      3. each character that is not a-z, 0-9 or whitespace is replaced by a space,
      4. whitespace runs are collapsed to one space and leading/trailing
         whitespace is removed.

    Args:
        text_series: pandas Series of text; may contain NaN/None.

    Returns:
        pandas Series of cleaned strings with the same index as the input.
        Missing inputs yield ``''``.
    """
    return (text_series
            # Without this, NaN would flow through the .str methods untouched
            # and the final join would raise TypeError.
            .fillna('')
            .astype(str)
            .str.lower()
            .str.replace(r'[^a-z0-9\s]', ' ', regex=True)
            # split() with no separator also discards leading/trailing whitespace,
            # so no separate strip() is needed.
            .str.split()
            .str.join(' '))

def preprocess_df(df, test_size=0.2, val_size=0.2, random_state=42,
                  label_col='Composition strategy'):
    """Clean the text columns and split the frame into train/val/test parts.

    The 'EM' column is dropped, 'Description' and 'EN' are normalized with
    ``preprocess_text`` and concatenated (space-separated) into a new
    'combined_text' column. The result is then split twice, each time
    stratified on ``label_col``: first a test set is held out, then a
    validation set is carved out of the remaining rows.

    Args:
        df: DataFrame containing at least the columns 'EM', 'Description',
            'EN' and ``label_col``. ``drop`` returns a new frame, so the
            caller's DataFrame is not modified.
        test_size: share of all rows held out for testing.
        val_size: share of the *remaining* (non-test) rows used for
            validation; with the defaults that is 0.2 * 0.8 = 16% of all rows.
        random_state: seed passed to both ``train_test_split`` calls.
        label_col: name of the column used for stratification.

    Returns:
        Tuple ``(train_df, val_df, test_df)``.
    """
    df = df.drop(columns=['EM'])
    df['Description'] = preprocess_text(df['Description'])
    df['EN'] = preprocess_text(df['EN'])
    df['combined_text'] = df['Description'] + ' ' + df['EN']

    # Hold out the test set first so that it is never seen when choosing val rows.
    train_df, test_df = train_test_split(
        df, test_size=test_size, random_state=random_state, stratify=df[label_col]
    )
    train_df, val_df = train_test_split(
        train_df, test_size=val_size, random_state=random_state,
        stratify=train_df[label_col]
    )
    return train_df, val_df, test_df


In [4]:
def load_glove_embeddings(filepath):
    """Load word vectors from a GloVe-style text file.

    Every usable line holds a token followed by its vector components, all
    separated by whitespace. Blank lines and lines that carry a token but no
    components are skipped, so a stray empty line (for example at the end of
    the file) does not raise ``IndexError`` and no empty vectors are stored.

    Args:
        filepath: path to the UTF-8 encoded embeddings file.

    Returns:
        dict mapping each token to a 1-D ``float32`` NumPy array. If a token
        occurs more than once, the last occurrence wins.

    Raises:
        ValueError: if a component cannot be parsed as a float.
    """
    glove_dict = {}
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if len(values) < 2:
                # Blank line, or a token with no vector: nothing usable here.
                continue
            glove_dict[values[0]] = np.asarray(values[1:], dtype='float32')
    return glove_dict

def sentence_to_embedding_sequence(sentence, glove_dict, embedding_dim=100, max_len=50):
    """Turn a whitespace-tokenized sentence into a fixed-size embedding matrix.

    The sentence is split on whitespace and only the first ``max_len`` tokens
    are used. Row ``i`` of the result is the vector of token ``i``; tokens
    missing from ``glove_dict`` and the padding rows after the last token are
    all-zero.

    Args:
        sentence: text whose tokens are separated by whitespace.
        glove_dict: mapping from token to a 1-D vector of length
            ``embedding_dim``.
        embedding_dim: length of every embedding vector.
        max_len: number of rows in the output; longer sentences are truncated,
            shorter ones are zero-padded. ``0`` yields an empty ``(0, dim)``
            array.

    Returns:
        ``float32`` NumPy array of shape ``(max_len, embedding_dim)``.

    Raises:
        ValueError: if a looked-up vector does not have shape
            ``(embedding_dim,)``.
    """
    # Starting from zeros covers both out-of-vocabulary tokens and padding,
    # so only known tokens need to be written.
    sequence = np.zeros((max_len, embedding_dim), dtype=np.float32)
    # Slice before the loop: tokens past max_len would be thrown away anyway.
    for position, token in enumerate(sentence.split()[:max_len]):
        if token not in glove_dict:
            continue
        vector = np.asarray(glove_dict[token])
        if vector.shape != (embedding_dim,):
            # Guard explicitly: a length-1 vector would otherwise broadcast silently.
            raise ValueError(
                f"Embedding for token {token!r} has shape {vector.shape}, "
                f"expected ({embedding_dim},)"
            )
        sequence[position] = vector
    return sequence  # Shape: (max_len, embedding_dim)

In [5]:
class ELCoGloveRNNDataset(Dataset):
    """Dataset yielding (embedding sequence, label) pairs for a DataFrame.

    Sentences come from the 'combined_text' column and labels from the
    'Composition strategy' column. Embedding happens lazily, one sentence per
    ``__getitem__`` call, via ``sentence_to_embedding_sequence``.
    """

    def __init__(self, df, glove_dict, embedding_dim=100, max_len=50):
        # Embedding configuration, forwarded unchanged on every lookup.
        self.glove_dict = glove_dict
        self.embedding_dim = embedding_dim
        self.max_len = max_len
        # Plain Python lists so that items are addressed by position,
        # independent of the DataFrame's index.
        self.sentences = df['combined_text'].to_list()
        self.labels = df['Composition strategy'].to_list()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Embed the sentence at this position into a (max_len, embedding_dim) array.
        embedded = sentence_to_embedding_sequence(
            self.sentences[idx], self.glove_dict, self.embedding_dim, self.max_len
        )
        features = torch.tensor(embedded, dtype=torch.float32)
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return features, target

In [7]:
class RNNClassifier(nn.Module):
    """LSTM encoder followed by a linear layer that produces class logits.

    Args:
        embedding_dim: size of each input vector.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of logits produced per example.
        num_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM also reads the sequence in reverse.
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim, num_layers=1, bidirectional=False):
        super().__init__()
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
        )
        # A bidirectional LSTM contributes one final hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

    def forward(self, x):
        """Map x of shape (batch_size, seq_len, embedding_dim) to logits."""
        _, (hn, _) = self.lstm(x)
        if not self.lstm.bidirectional:
            # Final hidden state of the last layer.
            return self.fc(hn[-1])
        # The last two entries of hn are the final layer's forward and
        # backward states; join them along the feature axis.
        return self.fc(torch.cat((hn[-2], hn[-1]), dim=1))

In [8]:
# Vector width used by the datasets and the model below. The file name
# suggests the 100-dimensional GloVe variant; keep the two in sync.
embedding_dim = 100
glove_path = '../../embeddings/glove.6B.100d.txt'
# Parse the embeddings file into an in-memory {token: vector} dict.
glove_embeddings = load_glove_embeddings(glove_path)

In [9]:
ELCo_df = pd.read_csv('../../data/ELCo.csv')
# Fit the encoder on the raw labels, then overwrite the column with the integer
# codes; label_encoder.classes_ keeps the original label values.
label_encoder = LabelEncoder().fit(ELCo_df['Composition strategy'])
ELCo_df['Composition strategy'] = label_encoder.transform(ELCo_df['Composition strategy'])

In [10]:
batch_size = 32
max_len = 50
train_df, val_df, test_df = preprocess_df(ELCo_df)

# One dataset per split, all sharing the same GloVe table and sequence length.
train_dataset, val_dataset, test_dataset = (
    ELCoGloveRNNDataset(split_df, glove_embeddings, embedding_dim, max_len)
    for split_df in (train_df, val_df, test_df)
)

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(split_dataset, batch_size=batch_size, shuffle=False)
    for split_dataset in (val_dataset, test_dataset)
)

In [15]:
# Seed PyTorch's global RNG before any weights are created so that parameter
# initialisation is repeatable after a kernel restart. Shuffling in
# train_loader should become repeatable too, since it was created without its
# own generator (see the PyTorch reproducibility notes).
torch.manual_seed(42)

input_dim = embedding_dim
hidden_dim = 128
output_dim = len(label_encoder.classes_)  # one logit per encoded class
num_layers = 5
bidirectional = True

# Move the model to the target device *before* building the optimizer, as the
# torch.optim documentation recommends.
model = RNNClassifier(input_dim, hidden_dim, output_dim, num_layers, bidirectional).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
model

RNNClassifier(
  (lstm): LSTM(100, 128, num_layers=5, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=256, out_features=5, bias=True)
)

In [16]:
num_epochs = 20
for epoch in range(num_epochs):
    # ---- Training pass: one optimizer step per mini-batch ----
    model.train()
    total_loss = 0.0
    for sequences, labels in train_loader:
        # sequences has shape (batch_size, max_len, embedding_dim)
        sequences, labels = sequences.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(sequences)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    # Mean of the per-batch losses: every batch counts equally.
    avg_loss = total_loss / len(train_loader)

    # ---- Validation pass: accuracy only, gradients disabled ----
    model.eval()
    total_correct, total_samples = 0, 0
    with torch.no_grad():
        for sequences, labels in val_loader:
            sequences, labels = sequences.to(device), labels.to(device)
            outputs = model(sequences)
            # Predicted class = index of the largest logit in each row.
            preds = outputs.argmax(dim=1)
            total_correct += int((preds == labels).sum())
            total_samples += labels.shape[0]
    val_accuracy = total_correct / total_samples
    print(f"Epoch [{epoch+1}/{num_epochs}] | Train Loss: {avg_loss:.4f} | Val Accuracy: {val_accuracy:.4f}")


Epoch [1/20] | Train Loss: 1.2415 | Val Accuracy: 0.4528
Epoch [2/20] | Train Loss: 1.1990 | Val Accuracy: 0.4981
Epoch [3/20] | Train Loss: 1.1104 | Val Accuracy: 0.5245
Epoch [4/20] | Train Loss: 1.0711 | Val Accuracy: 0.5434
Epoch [5/20] | Train Loss: 0.9426 | Val Accuracy: 0.6038
Epoch [6/20] | Train Loss: 0.8647 | Val Accuracy: 0.5623
Epoch [7/20] | Train Loss: 0.8477 | Val Accuracy: 0.5321
Epoch [8/20] | Train Loss: 0.7598 | Val Accuracy: 0.5962
Epoch [9/20] | Train Loss: 0.7309 | Val Accuracy: 0.6075
Epoch [10/20] | Train Loss: 0.6473 | Val Accuracy: 0.6113
Epoch [11/20] | Train Loss: 0.6077 | Val Accuracy: 0.6377
Epoch [12/20] | Train Loss: 0.6157 | Val Accuracy: 0.6000
Epoch [13/20] | Train Loss: 0.6597 | Val Accuracy: 0.6340
Epoch [14/20] | Train Loss: 0.6332 | Val Accuracy: 0.6075
Epoch [15/20] | Train Loss: 0.5554 | Val Accuracy: 0.5736
Epoch [16/20] | Train Loss: 0.6355 | Val Accuracy: 0.6226
Epoch [17/20] | Train Loss: 0.5434 | Val Accuracy: 0.6189
Epoch [18/20] | Train L

In [17]:
# Final evaluation on the held-out test split, with gradients disabled.
model.eval()
total_correct, total_samples = 0, 0
with torch.no_grad():
    for embeddings, labels in test_loader:
        embeddings, labels = embeddings.to(device), labels.to(device)
        outputs = model(embeddings)
        # Predicted class = index of the largest logit in each row.
        preds = outputs.argmax(dim=1)
        total_correct += int((preds == labels).sum())
        total_samples += labels.shape[0]
test_accuracy = total_correct / total_samples
print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.6012
