In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
import logging
logging.disable(logging.WARNING)


In [3]:
final_train_df = pd.read_csv('../data/final_train_df.csv')
final_test_df = pd.read_csv('../data/final_test_df.csv')


In [4]:
train_df, val_df = train_test_split(final_train_df, test_size=0.1, random_state=42)

print(f"–†–∞–∑–º–µ—Ä –æ–±—É—á–∞—é—â–µ–π –≤—ã–±–æ—Ä–∫–∏: {train_df.shape}")
print(f"–†–∞–∑–º–µ—Ä –≤–∞–ª–∏–¥–∞—Ü–∏–æ–Ω–Ω–æ–π –≤—ã–±–æ—Ä–∫–∏: {val_df.shape}")
print(f"–î–æ—Å—Ç—É–ø–Ω–æ GPU: {torch.cuda.device_count()}")


–†–∞–∑–º–µ—Ä –æ–±—É—á–∞—é—â–µ–π –≤—ã–±–æ—Ä–∫–∏: (89964, 3)
–†–∞–∑–º–µ—Ä –≤–∞–ª–∏–¥–∞—Ü–∏–æ–Ω–Ω–æ–π –≤—ã–±–æ—Ä–∫–∏: (9996, 3)
–î–æ—Å—Ç—É–ø–Ω–æ GPU: 2


In [5]:
class TextPairDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=128):
        """
        –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏—è –¥–∞—Ç–∞—Å–µ—Ç–∞.
        
        –ü–∞—Ä–∞–º–µ—Ç—Ä—ã:
        - df: pandas DataFrame —Å –∫–æ–ª–æ–Ω–∫–∞–º–∏ 'text1', 'text2', 'score'.
        - tokenizer: —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä –∏–∑ –±–∏–±–ª–∏–æ—Ç–µ–∫–∏ Hugging Face.
        - max_length: –º–∞–∫—Å–∏–º–∞–ª—å–Ω–∞—è –¥–ª–∏–Ω–∞ —Ç–æ–∫–µ–Ω–∏–∑–∏—Ä–æ–≤–∞–Ω–Ω–æ–≥–æ –≤–≤–æ–¥–∞.
        """
        self.texts1 = df['text1'].tolist()
        self.texts2 = df['text2'].tolist()
        self.scores = df['score'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.scores)
    
    def __getitem__(self, idx):
        text1 = self.texts1[idx]
        text2 = self.texts2[idx]
        score = self.scores[idx]
        
        # –¢–æ–∫–µ–Ω–∏–∑–∞—Ü–∏—è –ø–∞—Ä—ã —Ç–µ–∫—Å—Ç–æ–≤
        encoding = self.tokenizer(
            text1,
            text2,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        # –í–æ–∑–≤—Ä–∞—â–∞–µ–º —Å–ª–æ–≤–∞—Ä—å —Å —Ç–æ–∫–µ–Ω–∞–º–∏ –∏ –º–µ—Ç–∫–æ–π
        return {
            'input_ids': encoding['input_ids'].squeeze(),  # –£–±–∏—Ä–∞–µ–º –ª–∏—à–Ω—é—é —Ä–∞–∑–º–µ—Ä–Ω–æ—Å—Ç—å
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(score, dtype=torch.float)
        }

In [6]:
# –í—ã–±–æ—Ä –ø—Ä–µ–¥–æ–±—É—á–µ–Ω–Ω–æ–π –º–æ–¥–µ–ª–∏ –∏ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä–∞
model_name = 'bert-base-multilingual-cased'  # –ü–æ–¥–¥–µ—Ä–∂–∏–≤–∞–µ—Ç —Ä—É—Å—Å–∫–∏–π —è–∑—ã–∫
tokenizer = BertTokenizer.from_pretrained(model_name)

# –ó–∞–≥—Ä—É–∑–∫–∞ –º–æ–¥–µ–ª–∏ –¥–ª—è –∫–ª–∞—Å—Å–∏—Ñ–∏–∫–∞—Ü–∏–∏ —Å –æ–¥–Ω–∏–º –≤—ã—Ö–æ–¥–æ–º (—Ä–µ–≥—Ä–µ—Å—Å–∏—è)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,  # –û–¥–∏–Ω –≤—ã—Ö–æ–¥ –¥–ª—è —Ä–µ–≥—Ä–µ—Å—Å–∏–∏
    problem_type="regression"
)

In [7]:
# –§—É–Ω–∫—Ü–∏—è –¥–ª—è –≤—ã—á–∏—Å–ª–µ–Ω–∏—è –º–µ—Ç—Ä–∏–∫
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.squeeze()
    rmse = np.sqrt(mean_squared_error(labels, predictions))
    mae = mean_absolute_error(labels, predictions)
    return {
        'rmse': rmse,
        'mae': mae
    }

# –ü–∞—Ä–∞–º–µ—Ç—Ä—ã –æ–±—É—á–µ–Ω–∏—è
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model='rmse',
    greater_is_better=False
)


# –°–æ–∑–¥–∞–Ω–∏–µ –¥–∞—Ç–∞—Å–µ—Ç–æ–≤
train_dataset = TextPairDataset(train_df, tokenizer)
val_dataset = TextPairDataset(val_df, tokenizer)



In [8]:
# –°–æ–∑–¥–∞–Ω–∏–µ Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # –†–∞–Ω–Ω—è—è –æ—Å—Ç–∞–Ω–æ–≤–∫–∞
)

# –û–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏
trainer.train()

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch,Training Loss,Validation Loss,Rmse,Mae
1,0.0761,0.089826,0.299709,0.185356
2,0.0644,0.070381,0.265294,0.200273
3,0.0411,0.059197,0.243304,0.169772
4,0.0407,0.052689,0.229542,0.161219
5,0.0247,0.070806,0.266094,0.203038
6,0.02,0.068782,0.262264,0.210278




TrainOutput(global_step=8436, training_loss=0.14808191039858695, metrics={'train_runtime': 20280.3434, 'train_samples_per_second': 44.36, 'train_steps_per_second': 0.693, 'total_flos': 3.550546568449843e+16, 'train_loss': 0.14808191039858695, 'epoch': 6.0})

In [9]:
# –ü–æ–ª—É—á–µ–Ω–∏–µ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ–≤ –æ—Ü–µ–Ω–∫–∏
metrics = trainer.evaluate()

print("–ú–µ—Ç—Ä–∏–∫–∏ –Ω–∞ –≤–∞–ª–∏–¥–∞—Ü–∏–æ–Ω–Ω–æ–π –≤—ã–±–æ—Ä–∫–µ:")
for key, value in metrics.items():
    if key.startswith("eval_"):
        print(f"{key}: {value:.4f}")



–ú–µ—Ç—Ä–∏–∫–∏ –Ω–∞ –≤–∞–ª–∏–¥–∞—Ü–∏–æ–Ω–Ω–æ–π –≤—ã–±–æ—Ä–∫–µ:
eval_loss: 0.0527
eval_rmse: 0.2295
eval_mae: 0.1612
eval_runtime: 171.7535
eval_samples_per_second: 58.2000
eval_steps_per_second: 0.9140


In [10]:
# –°–æ–∑–¥–∞–Ω–∏–µ –¥–∞—Ç–∞—Å–µ—Ç–∞ –¥–ª—è —Ç–µ—Å—Ç–æ–≤—ã—Ö –¥–∞–Ω–Ω—ã—Ö
test_dataset = TextPairDataset(final_test_df, tokenizer)

# –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ
predictions = trainer.predict(test_dataset)

# –ò–∑–≤–ª–µ—á–µ–Ω–∏–µ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã—Ö –æ—Ü–µ–Ω–æ–∫
pred_scores = predictions.predictions.squeeze()

# –î–æ–±–∞–≤–ª–µ–Ω–∏–µ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã—Ö –æ—Ü–µ–Ω–æ–∫ –≤ DataFrame
final_test_df['pred_score'] = pred_scores

# –ü—Ä–æ—Å–º–æ—Ç—Ä –ø–µ—Ä–≤—ã—Ö —Å—Ç—Ä–æ–∫
print(final_test_df.head())

# –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ–≤
final_test_df.to_csv('../data/final_test_with_predictions.csv', index=False)



                                               text1  \
0  –æ—Ç–≤–µ—á–∞—Ç—å —Ä–∞–±–æ—Ç–∞ –ø–∞—Ä—Ç–∏—è –≤–Ω–µ—Å—Ç–∏ —Å–≤–æ–π –≤–∫–ª–∞–¥ 1936 ...   
1  —Å–ª–æ–∂–Ω—ã–π –º–æ–¥–µ–ª—å –ø–æ–¥–¥–µ—Ä–∂–∫–∞ —à–∏—Ñ—Ä–æ–≤–∞–Ω–∏–µ –¥–∞–Ω–Ω—ã–µ –∞—É—Ç...   
2  —ç—Ç–æ –≤—Ä–µ–º—è –æ—Ç–ø–ª—ã—Ç—å –¥–≤–∞ –ª–æ–¥–∫–∞ –ª–æ–¥–∫–∞ —Å–∞–º –≥–µ–Ω–µ—Ä–∞–ª ...   
3  –ø–µ—Ä–≤—ã–π —é—Ä–∏–¥–∏—á–µ—Å–∫–∏ –æ–±—è–∑—ã–≤–∞—Ç—å –¥–æ–±—Ä–æ–≤–æ–ª—å–Ω—ã–π –¥–æ–∫—É–º...   
4  –¥–ª–∏–Ω–∞ –ø–µ—Ä–µ–¥–Ω–∏–π –∫—Ä—ã–ª–æ 4552 –º–º —Ä–∞–∑–º–∞—Ö –∫—Ä—ã–ª–æ 1001...   

                                               text2     score  pred_score  
0  –≤–Ω–µ—Å—Ç–∏ —Å–≤–æ–π –≤–∫–ª–∞–¥ —É—Ä–µ–≥—É–ª–∏—Ä–æ–≤–∞–Ω–∏–µ –∏–Ω—Ü–∏–¥–µ–Ω—Ç —Å–∏–∞–Ω...  4.889081    4.997534  
1  —Å–ª–æ–∂–Ω—ã–π –º–æ–¥–µ–ª—å –æ–±–ª–∞–¥–∞—Ç—å —Ñ—É–Ω–∫—Ü–∏—è —à–∏—Ñ—Ä–æ–≤–∞–Ω–∏–µ –∞—É—Ç...  4.865897    4.644709  
2  –¥–≤–∞ –ª–æ–¥–∫–∞ –æ—Ç–ø—Ä–∞–≤–∏—Ç—å—Å—è –ø–ª–∞–≤–∞–Ω–∏–µ –ª–æ–¥–∫–∞ –≥–µ–Ω–µ—Ä–∞–ª —É...  4.316391    4.484791  
3  —Ä–µ—à–µ–Ω–∏–µ 18551863 –¥–æ–±—Ä–æ–≤–æ–ª—å–

In [11]:
# –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏ –∏ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä–∞
model.save_pretrained('../model/trained_model')
tokenizer.save_pretrained('../model/trained_model')


('../model/trained_model\\tokenizer_config.json',
 '../model/trained_model\\special_tokens_map.json',
 '../model/trained_model\\vocab.txt',
 '../model/trained_model\\added_tokens.json')

In [12]:
final_test_df

Unnamed: 0,text1,text2,score,pred_score
0,–æ—Ç–≤–µ—á–∞—Ç—å —Ä–∞–±–æ—Ç–∞ –ø–∞—Ä—Ç–∏—è –≤–Ω–µ—Å—Ç–∏ —Å–≤–æ–π –≤–∫–ª–∞–¥ 1936 ...,–≤–Ω–µ—Å—Ç–∏ —Å–≤–æ–π –≤–∫–ª–∞–¥ —É—Ä–µ–≥—É–ª–∏—Ä–æ–≤–∞–Ω–∏–µ –∏–Ω—Ü–∏–¥–µ–Ω—Ç —Å–∏–∞–Ω...,4.889081,4.997534
1,—Å–ª–æ–∂–Ω—ã–π –º–æ–¥–µ–ª—å –ø–æ–¥–¥–µ—Ä–∂–∫–∞ —à–∏—Ñ—Ä–æ–≤–∞–Ω–∏–µ –¥–∞–Ω–Ω—ã–µ –∞—É—Ç...,—Å–ª–æ–∂–Ω—ã–π –º–æ–¥–µ–ª—å –æ–±–ª–∞–¥–∞—Ç—å —Ñ—É–Ω–∫—Ü–∏—è —à–∏—Ñ—Ä–æ–≤–∞–Ω–∏–µ –∞—É—Ç...,4.865897,4.644709
2,—ç—Ç–æ –≤—Ä–µ–º—è –æ—Ç–ø–ª—ã—Ç—å –¥–≤–∞ –ª–æ–¥–∫–∞ –ª–æ–¥–∫–∞ —Å–∞–º –≥–µ–Ω–µ—Ä–∞–ª ...,–¥–≤–∞ –ª–æ–¥–∫–∞ –æ—Ç–ø—Ä–∞–≤–∏—Ç—å—Å—è –ø–ª–∞–≤–∞–Ω–∏–µ –ª–æ–¥–∫–∞ –≥–µ–Ω–µ—Ä–∞–ª —É...,4.316391,4.484791
3,–ø–µ—Ä–≤—ã–π —é—Ä–∏–¥–∏—á–µ—Å–∫–∏ –æ–±—è–∑—ã–≤–∞—Ç—å –¥–æ–±—Ä–æ–≤–æ–ª—å–Ω—ã–π –¥–æ–∫—É–º...,—Ä–µ—à–µ–Ω–∏–µ 18551863 –¥–æ–±—Ä–æ–≤–æ–ª—å–Ω—ã–π –≤—Ö–æ–∂–¥–µ–Ω–∏–µ –∫–∏—Ä–≥–∏–∑...,4.767547,5.026204
4,–¥–ª–∏–Ω–∞ –ø–µ—Ä–µ–¥–Ω–∏–π –∫—Ä—ã–ª–æ 4552 –º–º —Ä–∞–∑–º–∞—Ö –∫—Ä—ã–ª–æ 1001...,–ø–µ—Ä–µ–¥–Ω–∏–π –∫—Ä—ã–ª–æ –∏–º–µ—Ç—å –¥–ª–∏–Ω–∞ 4552 –º–º —Ä–∞–∑–º–∞—Ö –∫—Ä—ã–ª...,4.839230,4.906640
...,...,...,...,...
19994,–±–æ–ª—Ç–æ–Ω –ø—Ä–µ–¥–ø–æ–ª–∞–≥–∞—Ç—å –∫–æ–Ω–≥—Ä–µ—Å—Å–º–µ–Ω –¥–µ–º–æ–∫—Ä–∞—Ç –≥–µ–Ω—Ä–∏...,–º–Ω–µ–Ω–∏–µ –≥–µ–Ω—Ä–∏ —É–æ–∫—Å–º—ç–Ω –±–æ–ª—Ç–æ–Ω –ø–æ–≤–ª–∏—è—Ç—å —Ä–µ—à–µ–Ω–∏–µ –±...,4.524049,4.499201
19995,–¥–æ–º 11—Å1 —Ç–∏–ø –∑–¥–∞–Ω–∏–µ —Å–ø–æ—Ä—Ç–∏–≤–Ω—ã–π —Å–æ–æ—Ä—É–∂–µ–Ω–∏–µ —ç—Ç–∞–∂...,—Å–ø–æ—Ä—Ç–∏–≤–Ω—ã–π —Å–æ–æ—Ä—É–∂–µ–Ω–∏–µ –¥–æ–º 11—Å1 –Ω–∞—Ö–æ–¥–∏—Ç—å—Å—è –∂–∏–ª–æ...,4.702792,4.649830
19996,–ø–µ—Ä–º—å 1909 –≥–æ–¥ –∑–∞–∏–º–∫–∞ 1911 –≥–æ–¥ –ø–µ—Ä–º—å–∑–∞–∏–º–∫–∞ –∂–µ–ª...,–ø–µ—Ä–º—å —ç—Ç–æ –æ–±—ã—á–Ω—ã–π –Ω–∞–∑–≤–∞–Ω–∏–µ –∂–∏–ª–æ–π —Ä–∞–π–æ–Ω –ø–µ—Ä–º—å –∂...,4.401115,3.718822
19997,–æ–∫–æ–Ω—Ü–µ –∂–∏–∑–Ω—å –æ—Å—Ç–∞–≤–∞—Ç—å—Å—è –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç –ø—É–±–ª–∏–∫–æ–≤–∞—Ç—å ...,–Ω–µ–¥–æ–ª–≥–æ –ø—Ä–æ–±—ã—Ç—å –ø–æ—Å—Ç –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç –∫–æ–º–ø–∞–Ω–∏—è –≤—ã–ø—É—Å–∫...,3.778666,3.885373


In [14]:
min_score = final_test_df['score'].min()
max_score = final_test_df['score'].max()

min_pred_score = final_test_df['pred_score'].min()
max_pred_score = final_test_df['pred_score'].max()

print(f"–î–∏–∞–ø–∞–∑–æ–Ω –∑–Ω–∞—á–µ–Ω–∏–π –≤ —Å—Ç–æ–ª–±—Ü–µ score: {min_score} - {max_score}")
print(f"–î–∏–∞–ø–∞–∑–æ–Ω –∑–Ω–∞—á–µ–Ω–∏–π –≤ —Å—Ç–æ–ª–±—Ü–µ pred_score: {min_pred_score} - {max_pred_score}")


–î–∏–∞–ø–∞–∑–æ–Ω –∑–Ω–∞—á–µ–Ω–∏–π –≤ —Å—Ç–æ–ª–±—Ü–µ score: 0.7129944860935211 - 5.000000596046448
–î–∏–∞–ø–∞–∑–æ–Ω –∑–Ω–∞—á–µ–Ω–∏–π –≤ —Å—Ç–æ–ª–±—Ü–µ pred_score: 1.615647554397583 - 5.049973964691162
