In [1]:
!pip install sentence_transformers
!git clone https://github.com/RusinDmitry/Identifying-implicit-relationships-in-abstract-text

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence_transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=aa7d722a3676f5a1ca53d8d24a13a5382510b116a77b3c96f96f176bb9655cc8
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence_tr

## Подготовка данных

In [2]:
import joblib
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier


# Encode on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Pretrained multilingual sentence encoder. It has its own name so that it is
# not confused with the classifier that is bound to `model` in the training cell.
encoder = SentenceTransformer('distiluse-base-multilingual-cased-v1', device=DEVICE)

# The repository is cloned into the working directory, so a relative path
# works wherever the notebook runs (not only under Colab's /content).
data = pd.read_csv("Identifying-implicit-relationships-in-abstract-text/dataset.csv")
text = data["text"]
y = data["label"]

# encode() is documented for a string or a list of strings; passing a plain
# list avoids depending on how it indexes a pandas Series.
# The result is one embedding row per input text, in the same order as `y`.
X = encoder.encode(text.tolist(), convert_to_tensor=False)

# Stratified split keeps the class proportions equal in both parts;
# the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)

.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.45k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/556 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/539M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/452 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

## Обучение модели

In [3]:
# Fit a multilayer perceptron on the sentence embeddings of the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = MLPClassifier(max_iter=300, random_state=1).fit(X_train, y_train)

# Score the held-out split: micro-averaged F1 followed by the per-class
# confusion matrix (rows are true labels, columns are predicted labels).
y_pred = model.predict(X_test)
print(f1_score(y_test, y_pred, average='micro'))
print(confusion_matrix(y_test, y_pred))

0.9414208558371416
[[ 98   0   0   0   0   0   0   0]
 [  0 280   0   0   0   1   0   0]
 [  0   0 672   2   0  39   0   0]
 [  0   0   2  50   1   1   4   0]
 [  0   0   0   1  56  13   1   0]
 [  0   0  48   5   9 745   1   4]
 [  0   0   0   0   3   4  32   0]
 [  0   0   0   0   0   2   0 333]]


## Сохранение модели и проверка

In [4]:
# File that holds the serialized classifier. It is named once so that the
# dump and the load below cannot drift apart.
MODEL_PATH = 'model.pkl'

# Persist the fitted classifier to disk.
joblib.dump(model, MODEL_PATH)

# Read it back to confirm the round trip. joblib files are pickles and can
# execute arbitrary code when loaded: only load files you created or trust.
loaded_model = joblib.load(MODEL_PATH)

In [5]:
# Check that the reloaded classifier is usable by predicting on the training
# split and displaying the resulting confusion matrix.
# NOTE(review): this is computed on the data the classifier was fitted on, so
# it does not measure generalization. It also rebinds `y_pred`, which held the
# test-split predictions from the training cell.
y_pred = loaded_model.predict(X_train)
confusion_matrix(y_true=y_train, y_pred=y_pred)

array([[ 292,    0,    0,    0,    0,    0,    0,    0],
       [   0,  843,    0,    0,    0,    0,    0,    0],
       [   0,    0, 2139,    0,    0,    0,    0,    0],
       [   0,    0,    0,  174,    0,    0,    0,    0],
       [   0,    0,    0,    0,  211,    0,    0,    0],
       [   0,    0,    0,    0,    0, 2436,    0,    0],
       [   0,    0,    0,    0,    0,    0,  119,    0],
       [   0,    0,    0,    0,    0,    0,    0, 1007]])