# Библиотеки и считывание датасета

**Библиотеки**

In [None]:
try:
    from torchinfo import summary
except:
    print("[INFO] Couldn't find torchinfo... installing it.")
    !pip install -q torchinfo
    from torchinfo import summary

In [None]:
try:
    import easyocr
except:
    print("[INFO] Couldn't find faiss... installing it.")
    !pip install faiss-gpu
    import faiss

In [None]:
try:
    import easyocr
except:
    print("[INFO] Couldn't find eacsyocr... installing it.")
    ! pip install easyocr
    import easyocr

In [None]:
# Загружаем нужные библиотеки
import os
from torch.utils.data import Dataset, DataLoader
import cv2
import matplotlib.pyplot as plt
from tqdm import tqdm
import random
import torchvision.transforms as transforms
import albumentations as A
from albumentations.pytorch import ToTensorV2
import copy
from torchvision.transforms import functional as F
import numpy as np
from sklearn.model_selection import train_test_split
import torchvision.models as models
import torch
import torch.nn as nn
from collections import defaultdict
from PIL import Image
import easyocr
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from nltk.tokenize import WordPunctTokenizer
import fasttext
from gensim.models import Word2Vec
import gensim.downloader as api
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import catboost
from sklearn.metrics.pairwise import cosine_similarity
from torchvision.models import mobilenet_v2, MobileNet_V2_Weights
from torchvision.models import vit_b_16, ViT_B_16_Weights
from sklearn.metrics import roc_auc_score
from torchvision import models
import pandas as pd
from sklearn.model_selection import GridSearchCV
from transformers import BertModel, BertTokenizer

In [None]:
# Root folder of the training images; the cells below expect the sub-folders "0" and "1" inside it
root_directory = r'/kaggle/input/wb-contest-trust-safety/train'

In [None]:
# Sorted full paths of every file in the two class folders ("0" and "1")
directory0 = os.path.join(root_directory, "0")
directory1 = os.path.join(root_directory, "1")
images0_filepaths = sorted(
    os.path.join(directory0, file_name) for file_name in os.listdir(directory0)
)
images1_filepaths = sorted(
    os.path.join(directory1, file_name) for file_name in os.listdir(directory1)
)

In [None]:
# images0_filepaths = [i for i in tqdm(images0_filepaths) if cv2.imread(i) is not None]
# images1_filepaths = [i for i in tqdm(images1_filepaths) if cv2.imread(i) is not None]

In [None]:
# len(images0_filepaths), len(images1_filepaths)

# Нахождение аугментаций

**Сменю тактику, теперь для определения аугментаций воспользуемся эмбеддингами MobileNet**

In [None]:
# Dataset that reads images from disk and applies the given transformations
class FraudDataset(Dataset):
    """Image dataset whose binary label is derived from the parent folder name.

    Args:
        images_filepaths: indexable collection of image paths.
        transform: optional callable invoked as ``transform(image=image)`` that
            returns a mapping with the key ``"image"``.

    Each item is a tuple ``(image, label)`` where ``label`` is ``1.0`` when the
    image lies in a folder named ``"1"`` and ``0.0`` otherwise.
    """

    def __init__(self, images_filepaths, transform=None):
        self.images_filepaths = images_filepaths
        self.transform = transform

    def __len__(self):
        return len(self.images_filepaths)

    @staticmethod
    def _label_from_path(image_filepath):
        """Return 1.0 if the image's parent folder is named '1', else 0.0."""
        parent_dir = os.path.normpath(image_filepath).split(os.sep)[-2]
        return 1.0 if parent_dir == '1' else 0.0

    def __getitem__(self, idx):
        """Read the image at position ``idx`` and return ``(image, label)``.

        Raises:
            OSError: if OpenCV cannot read the file.
        """
        image_filepath = self.images_filepaths[idx]
        image = cv2.imread(image_filepath)
        if image is None:
            # cv2.imread signals failure by returning None; fail with the path
            # instead of an opaque cvtColor assertion.
            raise OSError(f"Could not read image: {image_filepath}")
        # OpenCV loads images as BGR; convert to RGB
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        label = self._label_from_path(image_filepath)
        if self.transform is not None:
            image = self.transform(image=image)["image"]
        return image, label

In [None]:
# Build the preprocessing pipeline and the dataset of spam (class "1") images
transform = A.Compose(
    [
        A.SmallestMaxSize(max_size=300),
        A.CenterCrop(height=224, width=224),
        # mean/std values commonly used with ImageNet-pretrained models
        A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
        ToTensorV2(),
    ]
)


spam_dataset = FraudDataset(images1_filepaths, transform)

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [None]:
# MobileNetV2 with ImageNet weights, in eval mode, used as an embedding extractor
cnn_model = mobilenet_v2(weights=MobileNet_V2_Weights.IMAGENET1K_V1).eval().to(device)
# Replace the classifier with Identity so the model returns the features that precede it
cnn_model.classifier = nn.Identity()

In [None]:
# Data loader over the spam images: one image per batch, original order preserved
spam_loader = DataLoader(spam_dataset, num_workers=4, shuffle=False, batch_size=1)
len(spam_loader)

In [None]:
# Compute one MobileNet embedding per spam image
image_embeds = []

with torch.no_grad():
    for images, _ in tqdm(spam_loader):
        embeds = cnn_model(images.to(device))
        # Convert each embedding to NumPy explicitly instead of relying on
        # np.array() implicitly converting a list of torch tensors.
        image_embeds.append(embeds.squeeze(0).cpu().numpy())

# Stack the per-image vectors into one (n_images, embedding_dim) array
image_embeds = np.stack(image_embeds)

In [None]:
def find_augmentations_new(image_embeds, threshold=120, k_neighbours=5000):
    """Find near-duplicate images by nearest-neighbour search over embeddings.

    Args:
        image_embeds: 2-D array with one embedding per row.
        threshold: a neighbour counts as a near-duplicate when the distance
            returned by the faiss ``IndexFlatL2`` index is strictly below this
            value (per the faiss docs that index reports squared L2 distances).
        k_neighbours: maximum number of neighbours to retrieve per image.

    Returns:
        dict mapping an image index to the list of indices of its
        near-duplicates. Images without near-duplicates are absent.
    """
    image_neighbours_dct = {}

    # faiss expects a C-contiguous float32 matrix
    image_embeds = np.ascontiguousarray(image_embeds, dtype='float32')
    n_images = len(image_embeds)
    if n_images == 0:
        return image_neighbours_dct

    # Exact (brute-force) L2 index over all embeddings
    index = faiss.IndexFlatL2(image_embeds.shape[1])
    index.add(image_embeds)

    # Asking for more neighbours than there are vectors only pads the result
    k = min(k_neighbours, n_images)
    D, I = index.search(image_embeds, k=k)

    for i in tqdm(range(n_images)):
        # Neighbours come sorted by increasing distance, so stop at the first
        # one that is too far away.
        for distance, neighbour in zip(D[i], I[i]):
            if distance >= threshold:
                break
            # Skip the query image itself (with exact duplicates it is not
            # guaranteed to come first) and faiss's -1 padding entries.
            if neighbour == i or neighbour < 0:
                continue
            image_neighbours_dct.setdefault(i, []).append(int(neighbour))

    return image_neighbours_dct

In [None]:
# Visual check of a candidate pair
def images_matching(index1, index2):
    """Show two spam images (by index into ``images1_filepaths``) side by side.

    The images are read in grayscale and drawn with a gray colormap; without an
    explicit colormap matplotlib renders single-channel images in false colours.
    """
    image1 = cv2.imread(images1_filepaths[index1], 0)
    image2 = cv2.imread(images1_filepaths[index2], 0)

    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 8))

    ax[0].imshow(image1, cmap='gray')
    ax[1].imshow(image2, cmap='gray')
    plt.show()

In [None]:
# Map each spam image index to the indices of its near-duplicates
augmentations_dct = find_augmentations_new(image_embeds)

In [None]:
# Collect the index of every image that takes part in a near-duplicate pair:
# every key (an image that has neighbours) plus every listed neighbour.
# The previous zip-based expression kept only the first key and the first
# neighbour of each entry, and failed on an empty dictionary.
augmentation_indices = set(augmentations_dct)
for neighbours in augmentations_dct.values():
    augmentation_indices.update(int(idx) for idx in neighbours)

augmentations_np = np.array(sorted(augmentation_indices), dtype=int)
# Store as integers; the loading cell casts to int, so both formats are readable
np.savetxt('augmentations4.txt', augmentations_np, fmt='%d')
len(augmentations_np)

# Разделение на трейн и вал

**Скачаем аугментации**

In [None]:
# Load the previously saved indices of near-duplicate spam images and cast them to int
augmentations_np = np.loadtxt('/kaggle/input/pathes-and-reads/augmentations4.txt').astype(int)
len(augmentations_np)

**Разделим сначала спамовые, трейн с аугментациями, вал без**

In [None]:
images0_filepaths_np = np.array(images0_filepaths)
images1_filepaths_np = np.array(images1_filepaths)

# The spam subset is sized relative to the number of non-spam images
len_sample = len(images0_filepaths_np)

# Seeded generator so the split is identical between runs
split_rng = np.random.default_rng(42)

# Train: a share of the near-duplicate ("augmentation") spam images
augmentations_paths = images1_filepaths_np[augmentations_np]
train_images_paths = split_rng.choice(augmentations_paths, size=round(0.4 * len_sample), replace=False)

# Spam images that are not near-duplicates
unique_paths = np.setdiff1d(images1_filepaths_np, augmentations_paths)

# Train: top up with images that are not near-duplicates
sample1 = split_rng.choice(unique_paths, size=round(0.4 * len_sample), replace=False)
train_images_paths = np.concatenate((train_images_paths, sample1))

# Validation: only images that are neither near-duplicates nor already in train
sample2 = np.setdiff1d(unique_paths, sample1, assume_unique=True)
val_images_paths = split_rng.choice(sample2, size=round(0.2 * len_sample), replace=False)

In [None]:
len(train_images_paths), len(val_images_paths)

In [None]:
# Check that train and val do not share paths and that their sizes add up to len_sample
# NOTE(review): the sizes come from three separate round() calls, so the sum may
# differ from len_sample by rounding — confirm this check is expected to hold
print(f'Количество пересечений train и val = {len(np.intersect1d(train_images_paths, val_images_paths))}')
print(f'Сопвадает ли количество: {(len(train_images_paths) + len(val_images_paths)) == len_sample}')

**Добавим не спам изображения**

In [None]:
# Add the non-spam images: 80% to train, the remaining 20% to validation.
# A seeded generator keeps the split identical between runs.
nonspam_rng = np.random.default_rng(42)
sample_1 = nonspam_rng.choice(images0_filepaths_np, size=round(0.8 * len(images0_filepaths_np)), replace=False)
sample_2 = np.setdiff1d(images0_filepaths_np, sample_1, assume_unique=True)
train_images_paths = np.concatenate((train_images_paths, sample_1))
val_images_paths = np.concatenate((val_images_paths, sample_2))

In [None]:
# Check that train and val share no paths and print the size of each part
print(f'Количество пересечений train и val = {len(np.intersect1d(train_images_paths, val_images_paths))}')
print(f'Длины: train - {len(train_images_paths)}, val - {len(val_images_paths)}')

In [None]:
# Shuffle both path arrays in place using the stdlib generator seeded with 42
# NOTE(review): the paths above are sampled with NumPy's generator, which this seed does not affect
random.seed(42)
random.shuffle(train_images_paths)
random.shuffle(val_images_paths)

# CNN-модель для дальнейших эмбеддингов

**Обучим ViT, чтобы потом получить эмбеддинги**

In [None]:
# Learning rate passed to the Adam optimizer and batch size used by the train/val loaders
lr = 0.001
batch_size = 64

In [None]:
# ViT-B/16 initialised with ImageNet-1k weights
model = vit_b_16(weights=ViT_B_16_Weights.IMAGENET1K_V1)

In [None]:
# Replace the classification head with a single-output linear layer
num_ftrs = model.heads.head.in_features
model.heads.head = torch.nn.Linear(num_ftrs, 1)

# Freeze every parameter first ...
for param in model.parameters():
    param.requires_grad = False

# ... then unfreeze the last two encoder blocks, the final encoder layer norm
# and the new head
for trainable_module in (
    model.encoder.layers.encoder_layer_11,
    model.encoder.layers.encoder_layer_10,
    model.encoder.ln,
    model.heads.head,
):
    for param in trainable_module.parameters():
        param.requires_grad = True

In [None]:
# Per-layer summary of the model: input/output sizes, parameter counts and which layers are trainable
summary(model,
        input_size=(32, 3, 224, 224), # make sure this is "input_size", not "input_shape" (batch_size, color_channels, height, width)
        verbose=0,
        col_names=["input_size", "output_size", "num_params", "trainable"],
        col_width=20,
        row_settings=["var_names"]
)

In [None]:
# Move the model to the device; the loss takes raw logits (the head outputs a single value)
model = model.to(device)
criterion = nn.BCEWithLogitsLoss().to(device)
# All parameters are handed to Adam, including the frozen ones
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [None]:
# Datasets for the train and validation splits (same preprocessing for both)
train_dataset = FraudDataset(train_images_paths, transform)
val_dataset = FraudDataset(val_images_paths, transform)

In [None]:
# Data loaders over the transformed datasets; only the training loader shuffles
train_loader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(
    val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Training
def train(train_loader, model, criterion, optimizer, epoch):
    """Run one training epoch.

    Args:
        train_loader: iterable of ``(images, target)`` batches.
        model: network that returns one logit per image.
        criterion: loss applied to ``(logits, targets)``.
        optimizer: optimizer stepped once per batch.
        epoch: epoch number, used only in the progress-bar text.

    Returns:
        Tuple ``(mean loss per batch, ROC-AUC over the whole epoch)``.
    """
    model.train()
    stream = tqdm(train_loader)
    loss_sum = 0
    probs_all = []
    targets_all = []

    for i, (images, target) in enumerate(stream, start=1):
        images = images.to(device, non_blocking=True)
        target = target.to(device, non_blocking=True).float().view(-1, 1)
        output = model(images)

        loss = criterion(output, target)

        # reshape(-1) rather than squeeze(): a final batch with a single sample
        # would otherwise become a 0-dim tensor, which torch.cat cannot join.
        probs_all.append(torch.sigmoid(output).detach().cpu().reshape(-1))
        targets_all.append(target.detach().cpu().reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_sum += loss.item()
        stream.set_description(f'Epoch: {epoch}, Train: loss {round(loss_sum / i, 4)}')

    roc = roc_auc_score(torch.cat(targets_all, dim=-1), torch.cat(probs_all, dim=-1))
    print('Roc-Auc:',roc)

    return loss_sum / len(train_loader), roc

In [None]:
# Metric computation on the validation set
@torch.inference_mode()
def validate(val_loader, model, criterion, epoch, best_roc=0):
    """Evaluate the model on the validation loader.

    Args:
        val_loader: iterable of ``(images, target)`` batches.
        model: network that returns one logit per image.
        criterion: loss applied to ``(logits, targets)``.
        epoch: epoch number, used only in the progress-bar text.
        best_roc: not used by this function; kept for interface compatibility.

    Returns:
        Tuple ``(mean loss per batch, ROC-AUC over the whole loader)``.
    """
    model.eval()
    stream = tqdm(val_loader)
    loss_sum = 0
    probs_all = []
    targets_all = []
    for i, (images, target) in enumerate(stream, start=1):
        images = images.to(device, non_blocking=True)
        target = target.to(device, non_blocking=True).float().view(-1, 1)
        output = model(images)

        loss = criterion(output, target)

        # reshape(-1) rather than squeeze(): a final batch with a single sample
        # would otherwise become a 0-dim tensor, which torch.cat cannot join.
        probs_all.append(torch.sigmoid(output).detach().cpu().reshape(-1))
        targets_all.append(target.detach().cpu().reshape(-1))

        loss_sum += loss.item()
        stream.set_description(f'Epoch: {epoch}, Validate: loss {round(loss_sum / i, 4)}')

    roc = roc_auc_score(torch.cat(targets_all, dim=-1), torch.cat(probs_all, dim=-1))
    print('Roc-Auc:',roc)

    return loss_sum / len(val_loader), roc

In [None]:
# Training loop: 12 epochs, keeping per-epoch loss / ROC-AUC histories and
# saving the weights whenever the validation ROC-AUC improves
loss_train_vit, roc_train_vit = [], []
loss_val_vit, roc_val_vit = [], []
best_roc = 0

for epoch in range(1, 13):
    l_t_v, r_t_v = train(train_loader, model, criterion, optimizer, epoch)
    loss_train_vit.append(l_t_v)
    roc_train_vit.append(r_t_v)

    l_v_v, r_v_v = validate(val_loader, model, criterion, epoch)
    loss_val_vit.append(l_v_v)
    roc_val_vit.append(r_v_v)

    if r_v_v > best_roc:
        torch.save(model.state_dict(), 'Vit_weights.pth')
        best_roc = r_v_v

In [None]:
# Load the fine-tuned ViT weights. map_location remaps the stored tensors to the
# current device, so a checkpoint saved on a GPU also loads on a CPU-only machine.
model.load_state_dict(torch.load('/kaggle/input/pathes-and-reads/Vit_weights.pth', map_location=device))

# Эмбеддинги изображений

In [None]:
# Replace the classification heads with Identity so the model returns the features that precede them
model.heads = nn.Identity()

In [None]:
# Почему-то отвалился tqdm
from tqdm import tqdm

def get_vit_embeds(model, loader):
    """Compute one embedding per batch yielded by ``loader``.

    Args:
        model: network used as a feature extractor. It is switched to eval
            mode here so that layers such as dropout behave deterministically.
        loader: iterable whose items carry the image batch at position 0
            (the callers in this notebook use ``batch_size=1``).

    Returns:
        NumPy array with the squeezed model outputs stacked in loader order.
    """
    model.eval()
    vit_embeds = []
    with torch.no_grad():
        for batch in tqdm(loader):
            embeds = model(batch[0].to(device))
            # Convert to NumPy explicitly instead of relying on np.array()
            # implicitly converting a list of torch tensors.
            vit_embeds.append(embeds.squeeze().cpu().numpy())

    return np.array(vit_embeds)

In [None]:
# Reading saved image paths
def paths_scanner(filename, root=None):
    """Read image paths from a text file and re-root them under ``root``.

    Only the last two components of every stored path (class folder and file
    name) are kept and joined onto ``root``. This replaces the former fixed
    44-character prefix cut, which only worked for one specific root length.

    Args:
        filename: text file with one path per line; blank lines are skipped.
        root: directory to re-root the paths under. Defaults to the global
            ``root_directory``.

    Returns:
        List of re-rooted paths in file order.
    """
    if root is None:
        root = root_directory

    pathes = []

    with open(filename, "r") as inf:
        for line in inf:
            stored_path = line.strip()
            if not stored_path:
                continue
            tail = os.path.normpath(stored_path).split(os.sep)[-2:]
            pathes.append(os.path.join(root, *tail))

    return pathes

In [None]:
# Load the previously saved train / validation path lists
pathes_train = paths_scanner(r'/kaggle/input/pathes-and-reads/train_pathes-3.txt')
pathes_val = paths_scanner(r'/kaggle/input/pathes-and-reads/val_pathes-4.txt')

In [None]:
# Datasets built from the saved path lists
train_dataset = FraudDataset(pathes_train, transform)
val_dataset = FraudDataset(pathes_val, transform)

In [None]:
# Data loaders with batch size 1 and no shuffling, so embeddings keep the order of the path lists
vit_train_loader = DataLoader(
    train_dataset, batch_size=1, shuffle=False)
vit_val_loader = DataLoader(
    val_dataset, batch_size=1, shuffle=False)

In [None]:
# Compute ViT embeddings for the training images
vit_train_embeds = get_vit_embeds(model, vit_train_loader)

In [None]:
# Compute ViT embeddings for the validation images
vit_val_embeds = get_vit_embeds(model, vit_val_loader)

In [None]:
# Save the embeddings to the working directory
np.save('vit_train_embeds.npy', vit_train_embeds)
np.save('vit_val_embeds.npy', vit_val_embeds)

In [None]:
# Load the embeddings stored in the attached dataset (replaces the values computed above)
vit_train_embeds = np.load(r'/kaggle/input/pathes-and-reads/vit_train_embeds.npy')
vit_val_embeds = np.load(r'/kaggle/input/pathes-and-reads/vit_val_embeds.npy')

# Считывание слов с изображений

**Пропишем функции считывания слов с изображений, запись их, запись путей и чтение этих данных с файлов**

In [None]:
# Run OCR on every image and write the recognised words to a file
def words_read_write(paths, reader, filename):
    """Read text from each image with ``reader`` and dump the results to a file.

    Args:
        paths: iterable of image paths.
        reader: object exposing ``readtext(path, detail=0)`` (an easyocr
            ``Reader`` in this notebook).
        filename: output file; one line per image holding ``str()`` of that
            image's OCR result.

    The file is written as UTF-8 explicitly so the stored (Cyrillic) text does
    not depend on the platform's default encoding.
    """
    wrds_lst = [reader.readtext(str(image_path), detail=0) for image_path in tqdm(paths)]

    with open(filename, "w", encoding="utf-8") as file:
        file.writelines(f"{item}\n" for item in wrds_lst)

In [None]:
# Read back the words easyocr found on every image
def read_scanner(filename):
    """Parse a file written by ``words_read_write`` back into lists of words.

    Every line is lower-cased and then parsed as a Python literal.
    ``ast.literal_eval`` is used instead of ``eval`` so that the file content
    can never execute code.

    Args:
        filename: UTF-8 text file with one Python list literal per line.

    Returns:
        List with one parsed (lower-cased) entry per line, in file order.

    Raises:
        ValueError, SyntaxError: if a line is not a valid Python literal.
    """
    import ast  # local import: used only by this function

    wrds_list = []

    with open(filename, "r", encoding="utf-8") as inf:
        for line in inf:
            wrds_list.append(ast.literal_eval(line.strip().lower()))

    return wrds_list

In [None]:
# The paths get reshuffled on every run, so they are stored on disk
def paths_write(filename, paths):
    """Write every element of ``paths`` to ``filename``, one per line."""
    content = "".join(str(item) + "\n" for item in paths)
    with open(filename, "w") as file:
        file.write(content)

In [None]:
# Show images together with the text recognised on them
def image_and_text(words_list, path_list, indexes):
    """Display each selected image and print its recognised words.

    Args:
        words_list: per-image OCR results, aligned with ``path_list``.
        path_list: image paths.
        indexes: positions of the images to show.

    The images are read in grayscale and drawn with a gray colormap; without an
    explicit colormap matplotlib renders single-channel images in false colours.
    """
    for i in indexes:
        image = cv2.imread(path_list[i], 0)
        plt.imshow(image, cmap='gray')
        plt.show()
        print(words_list[i])

In [None]:
# OCR reader configured for Russian and English
reader = easyocr.Reader(['ru', 'en'])

In [None]:
# OCR the validation images and store both the words and the paths
words_read_write(val_images_paths, reader, r'val_reads.txt')
paths_write(r'val_pathes.txt', val_images_paths)
# NOTE(review): the words are read from the attached dataset file, not from the
# 'val_reads.txt' written above, while the paths are read from the freshly
# written 'val_pathes.txt' — confirm the two stay aligned
wrds_val = read_scanner(r'/kaggle/input/pathes-and-reads/val_reads-4.txt')
pathes_val = paths_scanner(r'val_pathes.txt')

In [None]:
# OCR the training images and store both the words and the paths
words_read_write(train_images_paths, reader, r'train_reads.txt')
paths_write(r'train_pathes.txt', train_images_paths)
# NOTE(review): the words are read from the attached dataset file, not from the
# 'train_reads.txt' written above, while the paths are read from the freshly
# written 'train_pathes.txt' — confirm the two stay aligned
wrds_train = read_scanner(r'/kaggle/input/pathes-and-reads/train_reads-3.txt')
pathes_train = paths_scanner(r'train_pathes.txt')

In [None]:
image_and_text(wrds_train, pathes_train, random.sample(range(len(wrds_train)), 5))

# Эмбеддинги текстов

In [None]:
# Derive labels from image paths
def get_labels(paths):
    """Return a float array with 1.0 for paths whose parent folder is '1' and 0.0 otherwise."""
    return np.fromiter(
        (
            1.0 if os.path.normpath(path).split(os.sep)[-2] == '1' else 0.0
            for path in paths
        ),
        dtype=float,
        count=len(paths),
    )

In [None]:
# Tokenizer and a demonstration of how it splits one validation text
tokenizer = WordPunctTokenizer()
print(' '.join(wrds_val[214]), '\n')
print(tokenizer.tokenize(' '.join(wrds_val[214])))

In [None]:
# Tokenize the texts: the words of each image are joined with spaces and then split into tokens
tok_val = [tokenizer.tokenize(' '.join(data_entry)) for data_entry in wrds_val]
tok_train = [tokenizer.tokenize(' '.join(data_entry)) for data_entry in wrds_train]

In [None]:
# Word2Vec
# Train the model on the tokenized training texts and keep only the word vectors (.wv)
word2vec = Word2Vec(tok_train,
                 vector_size=32,      # embedding vector size
                 min_count=5,  # consider words that occured at least 5 times
                 window=5).wv  # define context as a 5-word window around the target word

In [None]:
word2vec.most_similar('отзыв')

In [None]:
# Embedding of a whole text
def get_phrase_embedding(tokens, model):
    """
    Average the word vectors of the tokens that are present in ``model``.

    ``tokens`` is an already tokenized phrase. Tokens missing from the model's
    vocabulary are ignored; when none of them is known, a zero vector of
    length ``model.vector_size`` is returned.
    """
    known_tokens = [token for token in tokens if token in model]
    known_total = len(known_tokens)

    embedding = np.zeros([model.vector_size], dtype='float32')

    # Each distinct token contributes its vector weighted by its share of
    # occurrences among the known tokens
    for token, occurrences in Counter(known_tokens).items():
        embedding += model.get_vector(token) * occurrences / known_total

    return embedding

In [None]:
# Compute the text embeddings for the training and validation sets
vectors_train = np.array([get_phrase_embedding(phrase, word2vec) for phrase in tok_train])
vectors_val = np.array([get_phrase_embedding(phrase, word2vec) for phrase in tok_val])

In [None]:
# Targets derived from the folder name in each path
y_train = get_labels(pathes_train)
y_val = get_labels(pathes_val)

# Эмбеддинги из BERT

In [None]:
# Load the pre-trained BERT model and its tokenizer
# NOTE(review): 'bert-base-uncased' is used although the OCR reader is configured
# for Russian and English text — confirm this checkpoint is the intended one
model_name = 'bert-base-uncased'
model_bert = BertModel.from_pretrained(model_name)
tokenizer_bert = BertTokenizer.from_pretrained(model_name)

In [None]:
# NOTE: early draft. The `bert_embeddings` defined in the next cell replaces this
# definition and is the one used by the rest of the notebook.
def bert_embeddings(words_lst):
    """Return one flattened BERT output tensor per entry of ``words_lst``.

    For a non-empty entry the words are joined with spaces, tokenized and
    passed through ``model_bert``; the first model output is flattened, so the
    length of the result depends on the number of tokens. An empty entry
    yields an empty tensor so the result stays aligned with the input.

    Args:
        words_lst: iterable of word lists (one list per image).

    Returns:
        List of 1-D tensors, one per input entry.
    """
    bert_embeds = []
    for wrds in tqdm(words_lst):
        if wrds:
            text = ' '.join(wrds)
            tokens = tokenizer_bert.tokenize(text)
            input_ids = tokenizer_bert.convert_tokens_to_ids(tokens)
            input_ids = torch.tensor([input_ids])

            # Compute the embeddings without tracking gradients
            with torch.no_grad():
                outputs = model_bert(input_ids)
                bert_embeds.append(outputs[0].flatten())
        else:
            # No recognised words: keep a placeholder at this position
            bert_embeds.append(torch.zeros(0))

    return bert_embeds

In [None]:
def bert_embeddings(words_lst):
    """Return one BERT embedding per entry of ``words_lst``.

    The words of an entry are joined with spaces and encoded with special
    tokens (truncated to 512 tokens). The embedding is the last hidden state
    at the first token position.
    """
    def _embed(phrase):
        # Encode a single phrase and take the hidden state of its first token
        encoded = tokenizer_bert.encode_plus(phrase, add_special_tokens=True,
                                             return_tensors='pt', truncation=True,
                                             max_length=512)
        with torch.no_grad():
            model_out = model_bert(**encoded)
            first_token_state = model_out.last_hidden_state[:, 0, :].squeeze()
        return first_token_state

    return [_embed(' '.join(wrds)) for wrds in tqdm(words_lst)]

In [None]:
# BERT embeddings of the validation texts
bert_embeds_val = bert_embeddings(wrds_val)

In [None]:
# Convert the list of per-text embeddings into a single NumPy array
bert_embeds_val = np.array(bert_embeds_val)

In [None]:
# Save the embeddings locally, then load the copy stored in the attached dataset
# (the loaded array replaces the one computed above)
np.save('bert_embeds_val.npy', bert_embeds_val)
bert_embeds_val = np.load(r'/kaggle/input/pathes-and-reads/bert_embeds_val.npy')

# Композиция моделей. Градиентный бустинг

**Посмотрим на бустинг с эмбеддингами текстов + изображений**

In [None]:
# Joining image and text embeddings
def concate_embeds(embeds_vit, embeds_txts):
    """Concatenate the two embeddings of every sample into one row.

    The inputs are paired positionally with ``zip``; the result is an array
    holding one concatenated vector per pair.
    """
    return np.array(
        [np.concatenate(pair) for pair in zip(embeds_vit, embeds_txts)]
    )

In [None]:
# Image (ViT) + text (Word2Vec) features for the train and validation sets
embeds_full_train = concate_embeds(vit_train_embeds, vectors_train)
embeds_full_val = concate_embeds(vit_val_embeds, vectors_val)

In [None]:
# Baseline: CatBoost with default settings, fitted on train and scored by ROC-AUC on validation
boosting_full = catboost.CatBoostClassifier(verbose=False)
boosting_full.fit(embeds_full_train, y_train, verbose=False)
Y_pred = boosting_full.predict_proba(embeds_full_val)[:, 1]
roc_auc_score(y_val, Y_pred)

**Тьюнинг бустинга**

**Объединим трейн и вал**

In [None]:
# Stack the train and validation features / targets into single arrays
embeds_full = np.concatenate((embeds_full_train, embeds_full_val), axis = 0)
y_full = np.concatenate((y_train, y_val), axis = 0)

In [None]:
# Grid search over CatBoost hyper-parameters.
# Every candidate is fitted on the training features and scored on the
# validation features; fitting and scoring on the same validation data (as
# before) reports a training score and cannot rank the candidates.
depths = [4, 6, 8]
learning_rates = [0.01, 0.03, 0.05]
iterationss = [1500, 1250, 1000, 750]
best_roc = 0
cntr = 0
best_params = []

# The loop variables are named so that they do not overwrite the global `lr`
# used by the ViT optimizer.
for learning_rate in tqdm(learning_rates):
    for n_iterations in iterationss:
        for depth in depths:
            boosting = catboost.CatBoostClassifier(depth=depth, learning_rate=learning_rate,
                                                   iterations=n_iterations, verbose=False)
            boosting.fit(embeds_full_train, y_train, verbose=False)

            Y_pred = boosting.predict_proba(embeds_full_val)[:, 1]
            roc = roc_auc_score(y_val, Y_pred)
            print(cntr, roc)
            cntr += 1

            if roc > best_roc:
                best_roc = roc
                # Stored as [iterations, learning_rate, depth]
                best_params = [n_iterations, learning_rate, depth]

In [None]:
print(best_params)

In [None]:
# Train the final model with the selected parameters on the combined
# train + validation data (`embeds_full`, `y_full`), which the notebook builds
# for this purpose; fitting on validation only would discard the training data.
boosting_best = catboost.CatBoostClassifier(depth = 8, learning_rate = 0.03, iterations = 1500, verbose=False)
boosting_best.fit(embeds_full, y_full, verbose=False)

**ViT + BERT**

In [None]:
# Image (ViT) + text (BERT) features for the validation set
embeds_transformers_val = concate_embeds(vit_val_embeds, bert_embeds_val)

In [None]:
# Grid search over CatBoost hyper-parameters for the ViT + BERT features.
# Only validation-set BERT embeddings exist, so a stratified part of that data
# is held out for scoring. Fitting and scoring on identical data (as before)
# yields a training ROC-AUC close to 1.0 that cannot rank the candidates.
depths = [4, 6, 8]
learning_rates = [0.01, 0.03, 0.05]
iterationss = [1500, 1250, 1000, 750]
best_roc = 0
cntr = 0
best_params = []

fit_embeds, holdout_embeds, fit_targets, holdout_targets = train_test_split(
    embeds_transformers_val, y_val, test_size=0.2, stratify=y_val, random_state=42
)

# The loop variables are named so that they do not overwrite the global `lr`
# used by the ViT optimizer.
for learning_rate in tqdm(learning_rates):
    for n_iterations in iterationss:
        for depth in depths:
            boosting = catboost.CatBoostClassifier(depth=depth, learning_rate=learning_rate,
                                                   iterations=n_iterations, verbose=False)
            boosting.fit(fit_embeds, fit_targets, verbose=False)

            Y_pred = boosting.predict_proba(holdout_embeds)[:, 1]
            roc = roc_auc_score(holdout_targets, Y_pred)
            print(cntr, roc)
            cntr += 1

            if roc > best_roc:
                best_roc = roc
                # Stored as [iterations, learning_rate, depth]
                best_params = [n_iterations, learning_rate, depth]

  0%|          | 0/3 [00:00<?, ?it/s]

0 0.9914863461038899
1 0.9968225856386969
2 0.9996885193190685
3 0.9892768509841647
4 0.9953890348938812
5 0.9991792247251691
6 0.9867109145270021
7 0.9930334338600845
8 0.9978993706546142
9 0.9842157602866478
10 0.9899192758336925


 33%|███▎      | 1/3 [1:11:26<2:22:53, 4286.92s/it]

11 0.9954502800017335
12 0.9996244195663587
13 0.9999983131644024
14 0.9999999351217078
15 0.999050052045366
16 0.9999920848483496
17 0.9999998702434155
18 0.9978910013549183
19 0.9999213675098335
20 0.99999941609537
21 0.9957418430469545
22 0.9993558883148745


 67%|██████▋   | 2/3 [2:22:33<1:11:15, 4275.09s/it]

23 0.9999900087429987
24 0.9999957180327137
25 0.9999999351217078
26 0.9999999351217078
27 0.9999687286631516
28 0.9999999351217078
29 0.9999999351217078
30 0.9998029646265385
31 0.9999995458519545
32 0.9999999351217078
33 0.9990039235796001
34 0.9999913711871352


100%|██████████| 3/3 [3:33:34<00:00, 4271.65s/it]

35 0.9999998702434156





In [None]:
print(best_params)

[1500, 0.03, 8]


In [None]:
# Final ViT + BERT model: best_params is [iterations, learning_rate, depth];
# the model is fitted on all validation features
boosting_trans = catboost.CatBoostClassifier(depth = best_params[2],
                                             learning_rate = best_params[1],
                                             iterations = best_params[0],
                                             verbose=False)
boosting_trans.fit(embeds_transformers_val, y_val, verbose=False)

<catboost.core.CatBoostClassifier at 0x7d4d93736ad0>

# Submission

In [None]:
# Sorted full paths of every file in the test folder
root_test = r'/kaggle/input/wb-contest-trust-safety/test'
test_filepaths = sorted([os.path.join(root_test, f) for f in os.listdir(root_test)])

In [None]:
# Keep only the files that OpenCV is able to read
test_images_paths = [i for i in tqdm(test_filepaths) if cv2.imread(i) is not None]

100%|██████████| 3232/3232 [01:04<00:00, 50.05it/s]


In [None]:
# OCR the test images and store the words
words_read_write(test_images_paths, reader, r'test_reads.txt')
# NOTE(review): the words are read from the attached dataset file, not from the
# 'test_reads.txt' written above — confirm both hold the same content
wrds_test = read_scanner(r'/kaggle/input/pathes-and-reads/test_reads.txt')

In [None]:
# IDs are the file names without extension. They are built from the same
# filtered list (`test_images_paths`) that feeds the feature extraction, so IDs
# and predictions stay aligned even if an unreadable file was dropped; the
# former fixed slice [43:-4] also depended on the exact root and extension length.
test_ids = [os.path.splitext(os.path.basename(path))[0] for path in test_images_paths]

In [None]:
# Tokenize the test texts and compute their Word2Vec phrase embeddings
tok_test = [tokenizer.tokenize(' '.join(data_entry)) for data_entry in wrds_test]
vectors_test = np.array([get_phrase_embedding(phrase, word2vec) for phrase in tok_test])

In [None]:
# BERT embeddings of the test texts as a single NumPy array
bert_embeds_test = np.array(bert_embeddings(wrds_test))

In [None]:
# Save the embeddings locally, then load the copy stored in the attached dataset
# (the loaded array replaces the one computed above)
np.save('bert_embeds_test.npy', bert_embeds_test)
bert_embeds_test = np.load(r'/kaggle/input/pathes-and-reads/bert_embeds_test.npy')

In [None]:
# Test dataset and loader: batch size 1, no shuffling, so embeddings keep the order of the paths
test_dataset = FraudDataset(test_images_paths, transform)
vit_test_loader = DataLoader(
    test_dataset, batch_size=1, shuffle=False)

In [None]:
# Compute ViT embeddings for the test images
# (this line was missing its leading '#', which made the cell a syntax error)
vit_test_embeds = get_vit_embeds(model, vit_test_loader)

In [None]:
# Save the embeddings locally, then load the copy stored in the attached dataset
# (the loaded array replaces the one computed above)
np.save('vit_test_embeds.npy', vit_test_embeds)
vit_test_embeds = np.load(r'/kaggle/input/pathes-and-reads/vit_test_embeds.npy')

In [None]:
# Image (ViT) + text (Word2Vec) features for the test set
embeds_full_test = concate_embeds(vit_test_embeds, vectors_test)

In [None]:
# Image (ViT) + text (BERT) features for the test set
embeds_transformers_test = concate_embeds(vit_test_embeds, bert_embeds_test)

In [None]:
# NOTE(review): `fc_classifier` is not defined anywhere in the visible notebook,
# so this cell fails with NameError unless it is defined elsewhere; its result
# is also overwritten by the CatBoost prediction further down — confirm whether
# this cell is still needed
y_test_pred = fc_classifier(torch.tensor(embeds_full_test).to(device)).squeeze()

In [None]:
# Detach the prediction tensor, move it to the CPU and convert it to a NumPy array
y_test_pred = np.array(y_test_pred.detach().cpu())

In [None]:
# Predicted probability of class 1 from the ViT + BERT CatBoost model (used for the submission)
y_test_pred = boosting_trans.predict_proba(embeds_transformers_test)[:, 1]

In [None]:
# Submission table: one row per test image with its ID and predicted probability
df = pd.DataFrame({'ID' : test_ids, 'target': y_test_pred})
df.head()

Unnamed: 0,ID,target
0,121323222,0.007206
1,121388710,0.002873
2,121434035,0.001595
3,121474003,0.005757
4,121488879,0.004987


In [None]:
# Write the submission as a comma-separated file without the index column
df.to_csv('submission_wb.csv', sep = ',', index = False)