<a href="https://colab.research.google.com/github/Shinju-M/Description-Request-Matching/blob/main/BERT_Siamese.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import torch.nn as nn
import transformers
import warnings
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [3]:
from torch.utils.data import Dataset, DataLoader

In [4]:
!wget https://raw.githubusercontent.com/Shinju-M/Description-Request-Matching/main/datasets/valid.csv
!wget https://raw.githubusercontent.com/Shinju-M/Description-Request-Matching/main/datasets/train.csv

--2024-04-24 11:23:36--  https://raw.githubusercontent.com/Shinju-M/Description-Request-Matching/main/datasets/valid.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 31200 (30K) [text/plain]
Saving to: ‘valid.csv’


2024-04-24 11:23:37 (2.87 MB/s) - ‘valid.csv’ saved [31200/31200]

--2024-04-24 11:23:37--  https://raw.githubusercontent.com/Shinju-M/Description-Request-Matching/main/datasets/train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 129605 (127K) [text/plain]
Saving to: ‘train.csv’


2024-04-24 11:23:37 (1.77 MB/

In [5]:
# model download
bert = transformers.AutoModel.from_pretrained('ai-forever/ruBert-large')
tokenizer = transformers.BertTokenizer.from_pretrained('ai-forever/ruBert-large')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/591 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

In [6]:
class TrainMatchingDataset(Dataset):
    def __init__(self):
        super().__init__()
        self.df = pd.read_csv('/content/train.csv', sep = ';')

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, item):
        row = self.df.iloc[item]
        req = tokenizer(
            row['request'],
            add_special_tokens=True,
            padding = 'max_length',
            max_length=512,
            return_tensors='pt'
        )

        prof = tokenizer(
            row['description'],
            add_special_tokens=True,
            max_length=512,
            truncation=True,
            padding = 'max_length',
            return_tensors='pt'
        )
        y = row['match_bin']
        return req, prof, torch.tensor(y)

In [7]:
class ValidMatchingDataset(Dataset):
    def __init__(self):
        super().__init__()
        self.df = pd.read_csv('/content/valid.csv', sep = ';')

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, item):
        row = self.df.iloc[item]
        req = tokenizer(
            row['request'],
            add_special_tokens=True,
            padding = 'max_length',
            max_length=512,
            return_tensors='pt'
        )

        prof = tokenizer(
            row['description'],
            add_special_tokens=True,
            max_length=512,
            truncation=True,
            padding = 'max_length',
            return_tensors='pt'
        )
        y = row['match_bin']
        return req, prof, torch.tensor(y)

In [10]:
for param in bert.parameters():
    param.requires_grad = False

In [11]:
class SiameseNetwork(nn.Module):
    def __init__(self, bert):
        super(SiameseNetwork, self).__init__()
        self.bert = bert
        self.head = nn.Sequential(
            # nn.Dropout(p=0.5),
            nn.Linear(2048, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),

            nn.Dropout(p=0.2),
            nn.Linear(512, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            # nn.Dropout(p=0.5),

            nn.Linear(64, 2),
            nn.Softmax(dim=1)
            # nn.Sigmoid(),
        )

    def forward(self, req, prof):

        req_output = self.bert(req['input_ids'].squeeze(1), req['attention_mask'].squeeze(1))
        req_mean = torch.mean(req_output[0][:, 0:, :], 1)

        prof_output = self.bert(prof['input_ids'].squeeze(1), prof['attention_mask'].squeeze(1))
        prof_mean = torch.mean(prof_output[0][:, 0:, :], 1)

        #combined = req_mean * prof_mean
        combined = torch.cat((req_mean.clone().detach(), prof_mean.clone().detach()), dim=-1)

        x = self.head(combined)
        return x

In [21]:
train_dataset = TrainMatchingDataset()
valid_dataset = ValidMatchingDataset()

train_dataloader = DataLoader(train_dataset, batch_size=16)
valid_dataloader = DataLoader(valid_dataset, batch_size=16)

In [12]:
model = SiameseNetwork(bert)

from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr = 5e-4)

cross_entropy = nn.CrossEntropyLoss()
epochs = 20

In [23]:
def train():
    model.train()
    total_loss, total_accuracy = 0, 0
    total_preds = []
    for step, batch in enumerate(train_dataloader):
        reqs, profs, labels = batch
        model.zero_grad()
        preds = model(reqs, profs)
        loss = cross_entropy(preds, labels)
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        preds = preds.detach().cpu().numpy()
        total_preds.append(preds)

    avg_loss = total_loss / len(train_dataloader)
    total_preds = np.concatenate(total_preds, axis = 0)
    return avg_loss, total_preds

In [24]:
def evaluate():
    model.eval()
    total_loss, total_accuracy = 0,0
    total_preds = []

    for step, batch in enumerate(valid_dataloader):
        req, prof, labels = batch

        with torch.no_grad():
            preds = model(req, prof)
            loss = cross_entropy(preds, labels)
            total_loss = total_loss + loss.item()
            preds = preds.detach().cpu().numpy()
            total_preds.append(preds)

    avg_loss = total_loss / len(valid_dataloader)
    total_preds = np.concatenate(total_preds, axis = 0)

    return avg_loss, total_preds

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
best_valid_loss = float('inf')

train_losses = []
valid_losses = []

for epoch in range(epochs):
    print('\n Epoch{:} / {:}'.format(epoch+1, epochs))

    train_loss, _ = train()
    valid_loss, _ = evaluate()

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), f'/content/drive/MyDrive/siamese_{epoch}_bin.pt')

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    print(f'\nTraining loss: {train_loss:.3f}')
    print(f'Validation loss: {valid_loss:.3f}')


 Epoch1 / 20

Training loss: 0.667
Validation loss: 0.682

 Epoch2 / 20

Training loss: 0.505
Validation loss: 0.659

 Epoch3 / 20

Training loss: 0.461
Validation loss: 0.637

 Epoch4 / 20

Training loss: 0.426
Validation loss: 0.620

 Epoch5 / 20

Training loss: 0.406
Validation loss: 0.604

 Epoch6 / 20

Training loss: 0.392
Validation loss: 0.589

 Epoch7 / 20

Training loss: 0.379
Validation loss: 0.575

 Epoch8 / 20

Training loss: 0.372
Validation loss: 0.567

 Epoch9 / 20

Training loss: 0.366
Validation loss: 0.565

 Epoch10 / 20

Training loss: 0.359
Validation loss: 0.575

 Epoch11 / 20

Training loss: 0.352
Validation loss: 0.586

 Epoch12 / 20

Training loss: 0.352
Validation loss: 0.581

 Epoch13 / 20


In [13]:
path = '/content/drive/MyDrive/siamese_8_bin.pt'
model.load_state_dict(torch.load(path))

<All keys matched successfully>

In [25]:
def match(request, profile):

    model.eval()
    req = tokenizer(
        request,
        add_special_tokens=True,
        padding = 'max_length',
        max_length = 512,
        return_tensors = 'pt'
    )

    prof = tokenizer(
        profile,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        padding = 'max_length',
        return_tensors = 'pt'
    )

    output = model(req, prof)
    _, prediction = torch.max(output, dim=1)
    return prediction

In [None]:
!wget https://raw.githubusercontent.com/Shinju-M/Description-Request-Matching/main/datasets/test.csv

In [16]:
test_df = pd.read_csv('/content/test.csv', sep=';')

In [26]:
request = 'Здравствуйте. Нужен семейный психолог. Тривиально... семья распадается . Но этого бы никто не хотел'
profile = test_df['description'][3]
match(request, profile)

tensor([1])

In [27]:
test_pairs = test_df[['request', 'description']].values.astype('str').tolist()

In [30]:
y_true = test_df['match_bin']
y_preds = []
for request, profile in test_pairs:
    y_preds.append(int(match(request, profile)))

In [33]:
y_true.tolist()

[0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1]

In [34]:
y_preds

[1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1]

In [31]:
from sklearn.metrics import classification_report
print(classification_report(y_true, y_preds))

              precision    recall  f1-score   support

           0       0.50      0.60      0.55         5
           1       0.67      0.57      0.62         7

    accuracy                           0.58        12
   macro avg       0.58      0.59      0.58        12
weighted avg       0.60      0.58      0.59        12

