In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np


In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [2]:
def preprocess_text(text_series):
    return (text_series
            .str.lower()
            .replace(r'[^a-z0-9\s]', ' ', regex=True)
            .str.strip()
            .str.split()
            .apply(lambda tokens: ' '.join(tokens)))

def preprocess_df(df):
    df = df.drop(columns=['EM'])
    df['Description'] = preprocess_text(df['Description'])
    df['EN'] = preprocess_text(df['EN'])
    df['combined_text'] = df['Description'] + ' ' + df['EN']
    train_df, test_df = train_test_split(
        df, test_size=0.2, random_state=42, stratify=df['Composition strategy']
    )
    train_df, val_df = train_test_split(
        train_df, test_size=0.2, random_state=42, stratify=train_df['Composition strategy']
    )
    return train_df, val_df, test_df


In [3]:
class ELCoTFIDFDataset(Dataset):
    def __init__(self, X, y):
        # Convert the sparse matrix to a dense tensor
        self.X = torch.tensor(X.toarray(), dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [4]:
class MLPClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(MLPClassifier, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )
        
    def forward(self, x):
        return self.net(x)

In [9]:
ELCo_df = pd.read_csv('../../data/ELCo.csv')
label_encoder = LabelEncoder()
vectorizer = TfidfVectorizer(max_features=10000)
ELCo_df['Composition strategy'] = label_encoder.fit_transform(ELCo_df['Composition strategy'])
train_df, val_df, test_df = preprocess_df(ELCo_df)
X_train = vectorizer.fit_transform(train_df['combined_text'])
X_val   = vectorizer.transform(val_df['combined_text'])
X_test  = vectorizer.transform(test_df['combined_text'])
y_train = train_df['Composition strategy'].values
y_val   = val_df['Composition strategy'].values
y_test  = test_df['Composition strategy'].values

In [10]:
train_dataset = ELCoTFIDFDataset(X_train, y_train)
val_dataset   = ELCoTFIDFDataset(X_val, y_val)
test_dataset  = ELCoTFIDFDataset(X_test, y_test)

In [11]:
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [16]:
input_dim = X_train.shape[1]               
hidden_dim = 128                         
output_dim = len(label_encoder.classes_)

model = MLPClassifier(input_dim, hidden_dim, output_dim)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


In [19]:
num_epochs = 100 

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for X_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_train_loss = total_loss / len(train_loader)
    
    model.eval()
    total_correct = 0
    total_samples = 0
    with torch.no_grad():
        for X_val_batch, y_val_batch in val_loader:
            X_val_batch = X_val_batch.to(device)
            y_val_batch = y_val_batch.to(device)
            
            outputs = model(X_val_batch)
            _, preds = torch.max(outputs, dim=1)
            total_correct += (preds == y_val_batch).sum().item()
            total_samples += y_val_batch.size(0)
    
    val_accuracy = total_correct / total_samples
    print(f"Epoch [{epoch+1}/{num_epochs}] | Train Loss: {avg_train_loss:.4f} | Val Acc: {val_accuracy:.4f}")


Epoch [1/100] | Train Loss: 0.3104 | Val Acc: 0.5698
Epoch [2/100] | Train Loss: 0.3068 | Val Acc: 0.5811
Epoch [3/100] | Train Loss: 0.2723 | Val Acc: 0.5811
Epoch [4/100] | Train Loss: 0.2460 | Val Acc: 0.5849
Epoch [5/100] | Train Loss: 0.2332 | Val Acc: 0.5736
Epoch [6/100] | Train Loss: 0.2215 | Val Acc: 0.5849
Epoch [7/100] | Train Loss: 0.2104 | Val Acc: 0.5811
Epoch [8/100] | Train Loss: 0.1908 | Val Acc: 0.5849
Epoch [9/100] | Train Loss: 0.1760 | Val Acc: 0.5811
Epoch [10/100] | Train Loss: 0.1684 | Val Acc: 0.5849
Epoch [11/100] | Train Loss: 0.1687 | Val Acc: 0.5811
Epoch [12/100] | Train Loss: 0.1486 | Val Acc: 0.5774
Epoch [13/100] | Train Loss: 0.1423 | Val Acc: 0.5925
Epoch [14/100] | Train Loss: 0.1347 | Val Acc: 0.5849
Epoch [15/100] | Train Loss: 0.1238 | Val Acc: 0.5811
Epoch [16/100] | Train Loss: 0.1183 | Val Acc: 0.5774
Epoch [17/100] | Train Loss: 0.1103 | Val Acc: 0.5849
Epoch [18/100] | Train Loss: 0.1051 | Val Acc: 0.5774
Epoch [19/100] | Train Loss: 0.1013 |

In [20]:
model.eval()
test_correct = 0
test_samples = 0

with torch.no_grad():
    for X_test_batch, y_test_batch in test_loader:
        X_test_batch = X_test_batch.to(device)
        y_test_batch = y_test_batch.to(device)
        
        outputs = model(X_test_batch)
        _, preds = torch.max(outputs, dim=1)
        test_correct += (preds == y_test_batch).sum().item()
        test_samples += y_test_batch.size(0)

test_accuracy = test_correct / test_samples
print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.6042
