In [41]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from torch.nn.utils import clip_grad_norm_
from transformers import AutoModel, AutoTokenizer, AdamW, BertTokenizer, BertModel
from transformers.optimization import get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sentence_transformers import SentenceTransformer
import os, pickle

In [109]:
DATA_DIR = "langs"
LANG = "ary"

In [13]:
print(os.listdir(os.path.join(DATA_DIR, LANG)))

['eng_train.csv', 'eng_dev.csv']


In [104]:
from io import StringIO
d = pd.read_csv(os.path.join(DATA_DIR, "{}_train.csv".format(LANG)))
split_text = d["Text"].str.split("\n", expand=True)
print(split_text[0][0])

بعدما تخلاو على الحياة الملكية.. الأمير هاري ومراتو ميكَان غادين يصورو فلوس صحيحة\nتقارير أمريكية: تعطات فلوس صحيحة باش يتصور لقاء مع الأمير البريطاني هاري و مراتو ميكَان


In [112]:
def preprocess(l, t_path, v_path):
    data = pd.read_csv(t_path)
    test_data = pd.read_csv(v_path)

    if l in ["mar", "tel"]:
        split_text1 = data["Text"].str.strip('""').str.split('\n', expand=True)
        split_text2 = test_data["Text"].str.strip('""').str.split('\n', expand=True)
    elif l == "eng":
        split_text1 = data["Text"].str.split("\r\n", expand=True)
        split_text2 = test_data["Text"].str.strip('""').str.lower().str.split("\r\n", expand=True)
    elif l in "ary":
        split_text1 = data["Text"].str.split("\n", expand=True)
        split_text2 = test_data["Text"].str.split("\n", expand=True)
    elif l == "amh":
        split_text1 = data["Text"].str.lower().str.split("\t", expand=True)
        split_text2 = test_data["Text"].str.lower().str.split("\t", expand=True)

    data["sentence1"] = split_text1[0]
    data["sentence2"] = split_text1[1]
    test_data["sentence1"] = split_text2[0]
    test_data["sentence2"] = split_text2[1]

    data.drop(["PairID", "Text"], axis=1, inplace=True)
    test_data.drop(["PairID", "Text"], axis=1, inplace=True)
    train_data = data.sample(frac=0.8, random_state=42)
    val_data = data.drop(train_data.index)

    return train_data, val_data, test_data

all_train_data, all_val_data = [], []
all_test_data = []

langs = os.listdir(DATA_DIR)
langs.remove("hau")
langs.remove("ary")
for l in langs:
    print(l)
    lang_path = os.path.join(DATA_DIR, l)
    train_path = os.path.join(lang_path, "{}_train.csv".format(l))
    test_path = os.path.join(lang_path, "{}_dev.csv".format(l))
    t_data, v_data, tst_data = preprocess(l, train_path, test_path)
    all_train_data.append(t_data)
    all_val_data.append(v_data)
    all_test_data.append(tst_data)

train_data = pd.concat([l for l in all_train_data], axis = 0)
val_data = pd.concat([l for l in all_val_data], axis = 0)
test_data = pd.concat([l for l in all_test_data], axis = 0)
print(train_data.shape)
print(val_data.shape)
print(test_data.shape)

tel
amh
eng
mar
(7090, 3)
(1772, 3)
(775, 2)


In [113]:
langs

['tel', 'amh', 'eng', 'mar']

In [36]:
class Rdataset(Dataset):
    def __init__(self, sentence1, sentence2, scores, tokenizer, max_len):
        super(Rdataset, self).__init__()
        self.sentences1 = sentences1
        self.sentences2 = sentences2
        self.scores = scores
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(sentence1)

    def __getitem__(self, idx):
        sentence1 = self.sentences1[idx]
        sentence2 = self.sentences2[idx]
        encoded = self.tokenizer.encode_plus(sentence1, sentence2, 
                                             max_length = self.max_len, 
                                             padding = "max length", 
                                             truncation = True, 
                                             return_tensors = "pt")
        input_ids = encoded["input_ids"].squeeze()
        attention_masks = encoded["attention_masks"].squeeze()
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'score': torch.tensor(self.scores[idx], dtype=torch.float)
        }        

In [126]:
class RModel(nn.Module):
    def __init__(self, model = "distilbert-base-nli-mean-tokens"):
        super(RModel, self).__init__()
        self.encoder = SentenceTransformer(model)
        self.fc_layers = nn.Sequential(
            nn.Linear(1536, 512),
            nn.GELU(),
            nn.Linear(512, 1),
            nn.Sigmoid()
        )

    def forward(self, sentence1, sentence2):
        s1_encoded = self.encoder.encode(sentence1, convert_to_tensor = True)
        s2_encoded = self.encoder.encode(sentence2, convert_to_tensor = True)
        input = torch.cat([s1_encoded, s2_encoded], dim = 1)
        output = self.fc_layers(input)
        return output

In [127]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RModel().to(device)
criterion = nn.MSELoss() 
optimizer = Adam(model.parameters(), lr=0.0001)
train_losses, val_losses = [],[]

best_val_loss = float('inf') 
patience_counter = 0  
patience_limit = 5 

print("Starting Training..")

num_epochs = 20
# training loop
for epoch in range(0, num_epochs):
    model.train()
    train_loss = 0
    for idx, row in train_data.iterrows():
        optimizer.zero_grad()
        output = model([row["sentence1"]], [row["sentence2"]])
        target = torch.tensor([[row["Score"]]]).to(device)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    train_losses.append(train_loss)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {train_loss/len(train_data)}")

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for idx, row in val_data.iterrows():
            output = model([row["sentence1"]], [row["sentence2"]])
            target = torch.tensor([[row["Score"]]]).to(device)
            loss = criterion(output, target)
            val_loss += loss.item()
    val_losses.append(val_loss)
    print(f"Validation Loss: {val_loss/len(val_data)}")

    # Early stopping logic
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0  # Reset counter
    else:
        patience_counter += 1  # Increment counter
        if patience_counter >= patience_limit:
            print("Early stopping triggered!")
            break  # Stop training

Starting Training..
Epoch 1/20, Loss: 0.042817444795366824
Validation Loss: 0.039899368550665754
Epoch 2/20, Loss: 0.033515623014383766
Validation Loss: 0.03826488039525433
Epoch 3/20, Loss: 0.0272529486111818
Validation Loss: 0.03828900569683373
Epoch 4/20, Loss: 0.022864613030572828
Validation Loss: 0.03951452195016667
Epoch 5/20, Loss: 0.019834929460541058
Validation Loss: 0.04081752873718841
Epoch 6/20, Loss: 0.018453840826506647
Validation Loss: 0.041099511974569514
Epoch 7/20, Loss: 0.018417416553874497
Validation Loss: 0.0396822307916783
Early stopping triggered!


In [118]:
# inference
rel_scores = []
model.eval()

with torch.no_grad():
    val_loss = 0
    for idx, row in val_data.iterrows():
        output = model([row["sentence1"]], [row["sentence2"]])
        target = torch.tensor([[row["Score"]]])
        output = output.cpu().numpy()[0][0]
        rel_scores.append(output)
        print(output, target.cpu().numpy()[0][0])

scores = val_data["Score"].to_list()

0.45822537 0.87
0.46210384 0.81
0.46577957 0.81
0.47312477 0.81
0.46700665 0.81
0.48226088 0.78
0.4555646 0.78
0.43009293 0.76
0.44779414 0.75
0.46583652 0.73
0.4525258 0.72
0.45224556 0.72
0.42934808 0.72
0.44035935 0.71
0.47539172 0.71
0.49120578 0.71
0.46968752 0.71
0.48192382 0.71
0.46417552 0.71
0.4784503 0.69
0.44089592 0.69
0.45303988 0.69
0.45483014 0.69
0.46001384 0.69
0.47047412 0.68
0.44643474 0.68
0.48183498 0.66
0.48368752 0.66
0.44500563 0.66
0.46340457 0.66
0.45752063 0.66
0.44893217 0.66
0.45818818 0.66
0.45939502 0.66
0.45281252 0.66
0.46161652 0.66
0.39549556 0.65
0.43729767 0.65
0.47015965 0.65
0.45982432 0.65
0.4649315 0.64
0.44968897 0.63
0.43870774 0.63
0.46087575 0.63
0.43187693 0.63
0.46178922 0.63
0.46529877 0.62
0.40437117 0.62
0.4826929 0.62
0.47719914 0.62
0.43884388 0.62
0.45321143 0.61
0.44782084 0.61
0.45695558 0.6
0.4652485 0.6
0.437495 0.6
0.469532 0.6
0.46586964 0.59
0.43881157 0.59
0.43335003 0.59
0.4456203 0.59
0.43348333 0.57
0.46218124 0.57
0.47933

In [63]:
val_data["Score"]

3       1.00
4       1.00
5       1.00
9       1.00
16      1.00
        ... 
5474    0.06
5479    0.03
5480    0.03
5488    0.03
5490    0.03
Name: Score, Length: 1100, dtype: float64

In [125]:
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr

print("MSE Score: {}".format(mean_squared_error(np.array(scores), np.array(rel_scores))))
print("Rank correlation: {}".format(spearmanr(scores, rel_scores)))

MSE Score: 0.04328112315870795
Rank correlation: SignificanceResult(statistic=0.3191117066008746, pvalue=3.112791155009685e-43)
