In [5]:
import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt
import torch
from transformers import AutoTokenizer, AutoModel
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split

SEED = 42

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [6]:
df = pd.read_csv('data/hackaton_result_dataset.csv', encoding='windows-1251')
df.head()

Unnamed: 0,model_annotation,human_markup,audio_path,label
0,давай по россии значит на коленях быстро блять...,давай проси значит на коленях быстро блять,s3://ap-training-set/xacaton_openstt/part_1/05...,1
1,ну разве можно так с телефоном поступает,ну что ну разве можно так с телефоном поступать,s3://ap-training-set/xacaton_openstt/part_1/05...,0
2,у меня нет с собой в полном адресе я щас дома ...,у меня нет с собой полного адреса я щас из дом...,s3://ap-training-set/xacaton_openstt/part_1/05...,0
3,а я здесь кто я санитар,а я знаешь кто я санитар,s3://ap-training-set/xacaton_openstt/part_1/05...,0
4,дежурный по кузьминскому военнокомату,дежурный по кузьминскому военкомату,s3://ap-training-set/xacaton_openstt/part_1/05...,0


### Сбер embeddings
validation AUC=0.767

In [4]:
BASE_MODEL = 'ai-forever/sbert_large_nlu_ru'

In [5]:
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
embed_model = AutoModel.from_pretrained(BASE_MODEL)
embed_model.to(device)

def embed_bert_cls(text, model, tokenizer):
    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**{k: v.to(model.device) for k, v in t.items()})
    embeddings = model_output.last_hidden_state[:, 0, :]
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings[0].cpu().numpy()

In [6]:
# 1 minute 10 secs with GPU
texts = df['model_annotation'].to_list()
embeddings = np.array([embed_bert_cls(text, embed_model, tokenizer) for text in texts])
embeddings.shape

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


(6508, 1024)

In [15]:
labels = df['label'].to_list()
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.2, random_state=SEED)
train_data = Pool(data=X_train, label=y_train)
test_data = Pool(data=X_test, label=y_test)
print(train_data.shape)
print(test_data.shape)

(5206, 1024)
(1302, 1024)


best params (validation AUC=0.767):
- iterations=1000 
- learning_rate=0.05 
- loss_function='Logloss'
- depth=4
- l2_leaf_reg=10

In [48]:
model = CatBoostClassifier(iterations=1000, learning_rate=0.05, loss_function='Logloss',
                           custom_metric=['AUC'], depth=4, l2_leaf_reg=10, 
                           random_seed=SEED, task_type="GPU", devices='0')

model.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    verbose=False,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because AUC is/are not implemented for GPU


<catboost.core.CatBoostClassifier at 0x7f54cd8aa3d0>

### multilingual e5
validation AUC 0.73

In [49]:
BASE_MODEL = 'intfloat/multilingual-e5-large'

In [50]:
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
embed_model = AutoModel.from_pretrained(BASE_MODEL)
embed_model.to(device)

def embed_bert_cls(text, model, tokenizer):
    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**{k: v.to(model.device) for k, v in t.items()})
    embeddings = model_output.last_hidden_state[:, 0, :]
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings[0].cpu().numpy()

In [51]:
# 1 minute 26 secs with GPU
texts = df['model_annotation'].to_list()
embeddings = np.array([embed_bert_cls(text, embed_model, tokenizer) for text in texts])
embeddings.shape

(6508, 1024)

In [52]:
labels = df['label'].to_list()
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.2, random_state=SEED)
train_data = Pool(data=X_train, label=y_train)
test_data = Pool(data=X_test, label=y_test)
print(train_data.shape)
print(test_data.shape)

(5206, 1024)
(1302, 1024)


same params (validation AUC 0.73)

In [53]:
model = CatBoostClassifier(iterations=1000, learning_rate=0.05, loss_function='Logloss',
                           custom_metric=['AUC'], depth=4, l2_leaf_reg=10, 
                           random_seed=SEED, task_type="GPU", devices='0')

model.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    verbose=False,
    plot=True
)



MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because AUC is/are not implemented for GPU


<catboost.core.CatBoostClassifier at 0x7f54f8ce9690>

### LaBSE
validation AUC 0.7

In [54]:
BASE_MODEL = 'sentence-transformers/LaBSE'

In [55]:
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
embed_model = AutoModel.from_pretrained(BASE_MODEL)
embed_model.to(device)

def embed_bert_cls(text, model, tokenizer):
    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**{k: v.to(model.device) for k, v in t.items()})
    embeddings = model_output.last_hidden_state[:, 0, :]
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings[0].cpu().numpy()

In [56]:
# 37 secs with GPU
texts = df['model_annotation'].to_list()
embeddings = np.array([embed_bert_cls(text, embed_model, tokenizer) for text in texts])
embeddings.shape

(6508, 768)

In [57]:
labels = df['label'].to_list()
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.2, random_state=SEED)
train_data = Pool(data=X_train, label=y_train)
test_data = Pool(data=X_test, label=y_test)
print(train_data.shape)
print(test_data.shape)

(5206, 768)
(1302, 768)


same params (validation AUC 0.73)

In [58]:
model = CatBoostClassifier(iterations=1000, learning_rate=0.05, loss_function='Logloss',
                           custom_metric=['AUC'], depth=4, l2_leaf_reg=10, 
                           random_seed=SEED, task_type="GPU", devices='0')

model.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    verbose=False,
    plot=True
)



MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because AUC is/are not implemented for GPU


<catboost.core.CatBoostClassifier at 0x7f548c5d9690>

### rubert tiny 2
validation AUC 0.7

In [59]:
BASE_MODEL = 'cointegrated/rubert-tiny2'

In [60]:
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
embed_model = AutoModel.from_pretrained(BASE_MODEL)
embed_model.to(device)

def embed_bert_cls(text, model, tokenizer):
    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**{k: v.to(model.device) for k, v in t.items()})
    embeddings = model_output.last_hidden_state[:, 0, :]
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings[0].cpu().numpy()



config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.74M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/118M [00:00<?, ?B/s]

In [61]:
# 15 secs with GPU
texts = df['model_annotation'].to_list()
embeddings = np.array([embed_bert_cls(text, embed_model, tokenizer) for text in texts])
embeddings.shape

(6508, 312)

In [62]:
labels = df['label'].to_list()
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.2, random_state=SEED)
train_data = Pool(data=X_train, label=y_train)
test_data = Pool(data=X_test, label=y_test)
print(train_data.shape)
print(test_data.shape)

(5206, 312)
(1302, 312)


same params (validation AUC 0.7)

In [63]:
model = CatBoostClassifier(iterations=1000, learning_rate=0.05, loss_function='Logloss',
                           custom_metric=['AUC'], depth=4, l2_leaf_reg=10, 
                           random_seed=SEED, task_type="GPU", devices='0')

model.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    verbose=False,
    plot=True
)



MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because AUC is/are not implemented for GPU


<catboost.core.CatBoostClassifier at 0x7f54c4c03a90>

### DeepPavlov
validation AUC 0.7

In [64]:
BASE_MODEL = 'DeepPavlov/rubert-base-cased-sentence'

In [65]:
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
embed_model = AutoModel.from_pretrained(BASE_MODEL)
embed_model.to(device)

def embed_bert_cls(text, model, tokenizer):
    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**{k: v.to(model.device) for k, v in t.items()})
    embeddings = model_output.last_hidden_state[:, 0, :]
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings[0].cpu().numpy()

In [66]:
# 38 secs with GPU
texts = df['model_annotation'].to_list()
embeddings = np.array([embed_bert_cls(text, embed_model, tokenizer) for text in texts])
embeddings.shape

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


(6508, 768)

In [67]:
labels = df['label'].to_list()
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.2, random_state=SEED)
train_data = Pool(data=X_train, label=y_train)
test_data = Pool(data=X_test, label=y_test)
print(train_data.shape)
print(test_data.shape)

(5206, 768)
(1302, 768)


same params (validation AUC 0.7)

In [73]:
model = CatBoostClassifier(iterations=1000, learning_rate=0.05, loss_function='Logloss',
                           custom_metric=['AUC'], depth=4, l2_leaf_reg=10, 
                           random_seed=SEED, task_type="GPU", devices='0')

model.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    verbose=False,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because AUC is/are not implemented for GPU


<catboost.core.CatBoostClassifier at 0x7f54c419d010>

### Facebook fasttext
validation AUC 0.7

https://fasttext.cc/docs/en/crawl-vectors.html

In [1]:
import fasttext

In [3]:
# аккуратно, съест минимум 8 ГБ оперативки
ft = fasttext.load_model('models/cc.ru.300.bin')



In [7]:
texts = df['model_annotation'].to_list()
embeddings = np.array([ft.get_sentence_vector(text) for text in texts])
embeddings.shape

(6508, 300)

In [8]:
labels = df['label'].to_list()
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.2, random_state=SEED)
train_data = Pool(data=X_train, label=y_train)
test_data = Pool(data=X_test, label=y_test)
print(train_data.shape)
print(test_data.shape)

(5206, 300)
(1302, 300)


In [10]:
model = CatBoostClassifier(iterations=1000, learning_rate=0.05, loss_function='Logloss',
                           custom_metric=['AUC'], depth=4, l2_leaf_reg=15, 
                           random_seed=SEED, task_type="GPU", devices='0')

model.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    verbose=False,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because AUC is/are not implemented for GPU


<catboost.core.CatBoostClassifier at 0x7f16834f1a90>

### TO DO
- tune hyperparameters for each model
- fine-tune embedding models with given text
- replace catboost with NNs