<a href="https://colab.research.google.com/github/Shinju-M/Description-Request-Matching/blob/main/BERT_Siamese_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import torch.nn as nn
import transformers
import warnings
from torch.utils.data import Dataset, TensorDataset, DataLoader, SequentialSampler

In [2]:
# model download
bert = transformers.AutoModel.from_pretrained('ai-forever/ruBert-large')
tokenizer = transformers.BertTokenizer.from_pretrained('ai-forever/ruBert-large')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/591 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

In [3]:
!wget https://raw.githubusercontent.com/Shinju-M/Description-Request-Matching/main/datasets/train.csv
!wget https://raw.githubusercontent.com/Shinju-M/Description-Request-Matching/main/datasets/valid.csv

--2024-04-27 08:42:28--  https://raw.githubusercontent.com/Shinju-M/Description-Request-Matching/main/datasets/train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 129605 (127K) [text/plain]
Saving to: ‘train.csv’


2024-04-27 08:42:29 (8.96 MB/s) - ‘train.csv’ saved [129605/129605]

--2024-04-27 08:42:29--  https://raw.githubusercontent.com/Shinju-M/Description-Request-Matching/main/datasets/valid.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 31200 (30K) [text/plain]
Saving to: ‘valid.csv’


2024-04-27 08:42:29 (16.0 M

In [4]:
class TrainMatchingDataset(Dataset):
    def __init__(self):
        super().__init__()
        self.df = pd.read_csv('train.csv', sep = ';')

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, item):
        row = self.df.iloc[item]
        req = tokenizer(
            row['request'],
            add_special_tokens=True,
            padding = 'max_length',
            max_length = 512,
            return_tensors='pt'
        )

        prof = tokenizer(
            row['description'],
            add_special_tokens=True,
            max_length=512,
            truncation=True,
            padding = 'max_length',
            return_tensors='pt'
        )
        # перевод оценки по пятибалльной шкале в диапазон от 0 до 1
        y = row['match'] / 5
        return req, prof, torch.tensor(y, dtype=torch.float32)

In [5]:
class ValidMatchingDataset(Dataset):
    def __init__(self):
        super().__init__()
        self.df = pd.read_csv('valid.csv', sep = ';')

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, item):
        row = self.df.iloc[item]
        req = tokenizer(
            row['request'],
            add_special_tokens=True,
            padding = 'max_length',
            max_length = 512,
            return_tensors='pt'
        )

        prof = tokenizer(
            row['description'],
            add_special_tokens=True,
            max_length=512,
            truncation=True,
            padding = 'max_length',
            return_tensors='pt'
        )
        # перевод оценки по пятибалльной шкале в диапазон от 0 до 1
        y = row['match'] / 5
        return req, prof, torch.tensor(y, dtype=torch.float32)

In [6]:
for param in bert.parameters():
    param.requires_grad = False

In [9]:
class SiameseNetwork(nn.Module):

  def __init__(self, bert):
    super(SiameseNetwork, self).__init__()
    self.bert = bert
    self.head = nn.Sequential(
        nn.Linear(2048, 1024),
        nn.BatchNorm1d(1024),
        nn.ReLU(),
        nn.Dropout(p=0.3),
        nn.Linear(1024, 768),
        #nn.BatchNorm1d(768),
        nn.ReLU(),
        nn.Linear(768, 512),

        nn.Dropout(p=0.3),
        nn.Linear(512, 64),
        #nn.BatchNorm1d(64),
        nn.ReLU(),
        #nn.Dropout(p=0.5),
        nn.Linear(64, 1),
        nn.Sigmoid(),
    )

  def forward(self, req, prof):

      req_output = self.bert(req['input_ids'].squeeze(1), req['attention_mask'].squeeze(1))
      req_mean = torch.mean(req_output[0][:, 0:, :], 1)
      #req_pooled = req_output.pooler_output

      prof_output = self.bert(prof['input_ids'].squeeze(1), prof['attention_mask'].squeeze(1))
      prof_mean = torch.mean(prof_output[0][:, 0:, :], 1)
      #prof_pooled = req_output.pooler_output

      combined = torch.cat((prof_mean.clone().detach(), req_mean.clone().detach()), dim=-1)

      x = self.head(combined)
      return x

In [10]:
train_dataset = TrainMatchingDataset()
valid_dataset = ValidMatchingDataset()

train_dataloader = DataLoader(train_dataset, batch_size=16)
valid_dataloader = DataLoader(valid_dataset, batch_size=16)

In [16]:
model = SiameseNetwork(bert)

from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr = 1e-4)

cross_entropy = nn.MSELoss()
epochs = 20

In [17]:
def train():
  model.train()
  total_loss, total_accuracy = 0, 0
  total_preds = []
  for step, batch in enumerate(train_dataloader):
      reqs, profs, labels = batch
      model.zero_grad()
      preds = model(reqs, profs)
      loss = cross_entropy(preds, labels.unsqueeze(1))
      total_loss += loss.item()
      loss.backward()
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
      optimizer.step()

      preds = preds.detach().cpu().numpy()
      total_preds.append(preds)

  avg_loss = total_loss / len(train_dataloader)
  total_preds = np.concatenate(total_preds, axis = 0)
  return avg_loss, total_preds

In [18]:
def evaluate():
    model.eval()
    total_loss, total_accuracy = 0,0
    total_preds = []

    for step, batch in enumerate(valid_dataloader):
        req, prof, labels = batch

        with torch.no_grad():
            preds = model(req, prof)
            loss = cross_entropy(preds, labels.unsqueeze(1))
            total_loss = total_loss + loss.item()
            preds = preds.detach().cpu().numpy()
            total_preds.append(preds)

    avg_loss = total_loss / len(valid_dataloader)
    total_preds = np.concatenate(total_preds, axis = 0)

    return avg_loss, total_preds

In [14]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
best_valid_loss = float('inf')

train_losses = []
valid_losses = []

for epoch in range(epochs):
    print('\n Epoch{:} / {:}'.format(epoch+1, epochs))

    train_loss, _ = train()
    valid_loss, _ = evaluate()

    if valid_loss < best_valid_loss:
      best_valid_loss = valid_loss
      torch.save(model.state_dict(), f'/content/drive/MyDrive/bert_siamese_regression.pt')

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    print(f'\nTraining loss: {train_loss:.3f}')
    print(f'Validation loss: {valid_loss:.3f}')

In [20]:
path = '/content/drive/MyDrive/bert_siamese_regression.pt'
model.load_state_dict(torch.load(path))

<All keys matched successfully>

In [21]:
def match(request, profile):

    model.eval()
    req = tokenizer(
        request,
        add_special_tokens=True,
        padding = 'max_length',
        max_length = 512,
        return_tensors = 'pt'
    )

    prof = tokenizer(
        profile,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        padding = 'max_length',
        return_tensors = 'pt'
    )

    output = model(req, prof).item()
    return output

In [None]:
!wget https://raw.githubusercontent.com/Shinju-M/Description-Request-Matching/main/datasets/test.csv

In [23]:
test_df = pd.read_csv('test.csv', sep=';')
test_pairs = test_df[['request', 'description']].values.astype('str').tolist()

In [24]:
y_preds = []
for request, profile in test_pairs:
    y_preds.append(match(request, profile))

In [25]:
y_true = test_df['match'] / 5
print(y_true.tolist())

[0.2, 0.4, 0.8, 1.0, 0.4, 0.2, 0.6, 0.4, 1.0, 0.6, 1.0, 0.8]


In [26]:
y_preds

[0.6143016815185547,
 0.4447462856769562,
 0.5319055318832397,
 0.5441358089447021,
 0.4647270739078522,
 0.4861886203289032,
 0.4541949927806854,
 0.4423064887523651,
 0.4491340219974518,
 0.46288636326789856,
 0.5597274899482727,
 0.49792829155921936]

In [37]:
scores = []
request = 'Добрый вечер! Прошу помощи,к кому обращаться, ребёнка словно подменили, теперь я не умею и знаю как вести коммуникацию с собственным ребенком'
for profile in test_df['description']:
    score = match(request, profile)
    scores.append(score)

In [38]:
d = {'description': test_df['description'].tolist(), 'score': scores}
sim_scores = pd.DataFrame(d).sort_values(by='score', axis=0, ascending=False, ignore_index=True)

In [39]:
sim_scores

Unnamed: 0,description,score
0,Наиболее эффективно работаю с запросами:\n- Хр...,0.536453
1,Помогаю клиентам разобраться в сложных жизненн...,0.531906
2,Помогаю клиентам разобраться в сложных жизненн...,0.531906
3,Помогаю клиентам разобраться в сложных жизненн...,0.509827
4,Я бережный психотерапевт по отношениям в паре ...,0.504207
5,Образование: Высшее психологическое образовани...,0.50142
6,Образование: Высшее психологическое образовани...,0.498149
7,Образование: Высшее психологическое образовани...,0.498149
8,\nОпыт работы: Более 10 лет работы с клиентами...,0.474726
9,Образование: Высшее психологическое образовани...,0.47111
