# Task understanding

Задумка в том, чтобы конвертировать картинку в текст, с последующим получением текстовых эмбеддингов. На русский язык предобученных моделей image to text толковых не увидел, поэтому использую конвертацию в английский текст. Соответственно для получения эмбеддингов уже использую BERT-ы с качественной токенизацией английского.

# Libraries

In [None]:
%%capture
# Install pinned versions of the Hugging Face packages used by this notebook.
# The %%capture cell magic above keeps pip's console output out of the notebook.
!pip install transformers==4.26.1
!pip install datasets==2.9.0
!pip install sentencepiece==0.1.97

In [None]:
import re
import os
import shutil
import random
import time
import zipfile
import pandas as pd
import numpy as np
import json
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from glob import glob
from tqdm import tqdm
from matplotlib import pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModel
from torchvision import datasets, transforms
from torch.nn import functional as F

# Helper functions

In [None]:
def _get_text_embed(text: str, model, tokenizer, max_length: int) -> np.ndarray:
    """
    Embed a single text as the L2-normalised hidden state of its first token.

    Approach taken from https://habr.com/ru/post/562064/

    Args:
        text: The text to embed.
        model: Callable encoder that exposes ``.device`` and whose output has a
            ``last_hidden_state`` tensor (e.g. a ``transformers.AutoModel``).
        tokenizer: Callable returning a mapping of tensors when invoked with
            ``return_tensors='pt'``.
        max_length: Truncation limit forwarded to the tokenizer.

    Returns:
        A 1-D numpy array holding the normalised first-position vector of the
        first (and, for a single string, only) sequence in the batch.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt', max_length=max_length)
    # Inference only: no autograd graph is needed for feature extraction.
    with torch.no_grad():
        # Move every tokenizer tensor to the device the model lives on.
        model_output = model(**{key: tensor.to(model.device) for key, tensor in encoded.items()})
        # Position 0 of the sequence axis is used as the sentence representation.
        first_token_state = model_output.last_hidden_state[:, 0, :]
        normalised = torch.nn.functional.normalize(first_token_state)
    return normalised[0].cpu().numpy()

In [None]:
def get_df_text_emb(df: pd.DataFrame, col: str, model,
                    tokenizer, prefix_column_name: str,
                    max_length: int) -> pd.DataFrame:
    """
    Turn a text column into a frame of embedding features, one row per text.

    Args:
        df: Source frame.
        col: Name of the column in ``df`` that holds the texts.
        model: Encoder forwarded to ``_get_text_embed``.
        tokenizer: Tokenizer forwarded to ``_get_text_embed``.
        prefix_column_name: Prefix of the generated column names; columns are
            named ``<prefix>_0``, ``<prefix>_1``, ...
        max_length: Truncation limit forwarded to ``_get_text_embed``.

    Returns:
        A new DataFrame with one column per embedding component. It is built
        without an explicit index, so it carries a fresh default index rather
        than ``df``'s index. An empty ``df`` yields an empty DataFrame.
    """
    vectors = [
        _get_text_embed(text, model, tokenizer, max_length)
        for text in df[col]
    ]
    if not vectors:
        # The embedding width is unknown without at least one text.
        return pd.DataFrame()
    # Stack the per-text 1-D vectors into an (n_texts, n_components) matrix.
    matrix = np.stack(vectors)
    columns = [prefix_column_name + '_' + str(i) for i in range(matrix.shape[1])]
    return pd.DataFrame(matrix, columns=columns)

# Variables

In [None]:
# Location of the data archive on Google Drive (used when running in Colab).
PATH_ZIP_FILE = '/content/drive/MyDrive/Colab Notebooks/PetProject 2023/test_kazan_express/internship_2023.zip'

# Root of the image folders, plus the train and test sub-folders below it.
prefix = '/kaggle/input/kazann/images/'
PATH_IMAGES_TRAIN = prefix + 'train/'
PATH_IMAGES_TEST = prefix + 'test/'

In [None]:
class param:
    """Notebook-wide run switches."""

    # When True, the captioning loops stop after the third image (quick check run).
    is_check_code: bool = False

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# with zipfile.ZipFile(PATH_ZIP_FILE, 'r') as zip_ref:
#     zip_ref.extractall('/content/')

In [None]:
from transformers import AutoProcessor, AutoModelForCausalLM
import requests
from PIL import Image

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The processor and the captioning model are loaded from the same hub checkpoint.
git_checkpoint = "microsoft/git-base-coco"
processor = AutoProcessor.from_pretrained(git_checkpoint)
model = AutoModelForCausalLM.from_pretrained(git_checkpoint)
model.to(device)

Downloading (…)rocessor_config.json:   0%|          | 0.00/503 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/453 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/2.82k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/707M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

GitForCausalLM(
  (git): GitModel(
    (embeddings): GitEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(1024, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (image_encoder): GitVisionModel(
      (vision_model): GitVisionTransformer(
        (embeddings): GitVisionEmbeddings(
          (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
          (position_embedding): Embedding(197, 768)
        )
        (pre_layrnorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (encoder): GitVisionEncoder(
          (layers): ModuleList(
            (0): GitVisionEncoderLayer(
              (self_attn): GitVisionAttention(
                (k_proj): Linear(in_features=768, out_features=768, bias=True)
                (v_proj): Linear(in_features=768, out_features=768, bias=True)
                (q_pr

# Train

In [None]:
# Generate one English caption per training image.
img_to_text: list = []
# NOTE(review): os.listdir returns entries in arbitrary order and the file name
# is not stored next to its caption, so rows can only be matched back to images
# by position -- confirm that downstream code relies on the same ordering.
train_image_names = os.listdir(PATH_IMAGES_TRAIN)
# tqdm wraps the list (not enumerate) so the progress bar knows the total.
for cnt, path_img in enumerate(tqdm(train_image_names)):
    # The context manager closes the file handle as soon as the pixels are read.
    with Image.open(f'{PATH_IMAGES_TRAIN}{path_img}') as image:
        pixel_values = processor(images=image, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values=pixel_values.to(device), max_length=30)
    generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    img_to_text.append(generated_caption)
    # Check mode: stop once three images (cnt 0, 1, 2) have been captioned.
    if param.is_check_code and cnt == 2:
        break

4669it [13:10,  5.81it/s]

In [None]:
# Single-column frame with the generated captions, in generation order.
caption_columns = {'img_to_text': img_to_text}
features_git = pd.DataFrame(data=caption_columns)
features_git.head()

In [None]:
# @title 'microsoft/mdeberta-v3-base'
model_path = 'microsoft/mdeberta-v3-base'
name_model = 'microsoft_mdeberta_v3_base'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model_microsoft_mdeberta_v3_base = AutoModel.from_pretrained(model_path)
model_microsoft_mdeberta_v3_base.to(device)

In [None]:
# Replace the caption frame with its embedding features (captions truncated to 128 tokens).
features_git = get_df_text_emb(
    df=features_git,
    col='img_to_text',
    model=model_microsoft_mdeberta_v3_base,
    tokenizer=tokenizer,
    prefix_column_name=f'X_img_to_text_{name_model}',
    max_length=128,
)

In [None]:
# Persist the train features without the index column.
train_features_path = '/kaggle/working/train_features_git.csv'
features_git.to_csv(train_features_path, index=False)

In [None]:
# Show the first two rows of the train feature frame.
features_git.head(2)

# Test

In [None]:
# Generate one English caption per test image.
img_to_text: list = []
# NOTE(review): os.listdir returns entries in arbitrary order and the file name
# is not stored next to its caption, so rows can only be matched back to images
# by position -- confirm that downstream code relies on the same ordering.
test_image_names = os.listdir(PATH_IMAGES_TEST)
# tqdm wraps the list (not enumerate) so the progress bar knows the total.
for cnt, path_img in enumerate(tqdm(test_image_names)):
    # The context manager closes the file handle as soon as the pixels are read.
    with Image.open(f'{PATH_IMAGES_TEST}{path_img}') as image:
        pixel_values = processor(images=image, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values=pixel_values.to(device), max_length=30)
    generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    img_to_text.append(generated_caption)
    # Check mode: stop once three images (cnt 0, 1, 2) have been captioned.
    if param.is_check_code and cnt == 2:
        break

2it [00:14,  7.20s/it]


In [None]:
# Single-column frame with the generated captions, in generation order.
caption_columns = {'img_to_text': img_to_text}
features_git = pd.DataFrame(data=caption_columns)
features_git.head()

Unnamed: 0,img_to_text
0,a set of banners for sale
1,a poster for a restaurant.
2,a pair of kitchen utensils


In [None]:
# @title 'microsoft/mdeberta-v3-base'
model_path = 'microsoft/mdeberta-v3-base'
name_model = 'microsoft_mdeberta_v3_base'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model_microsoft_mdeberta_v3_base = AutoModel.from_pretrained(model_path)
model_microsoft_mdeberta_v3_base.to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at microsoft/mdeberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.weight', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture

DebertaV2Model(
  (embeddings): DebertaV2Embeddings(
    (word_embeddings): Embedding(251000, 768, padding_idx=0)
    (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
    (dropout): StableDropout()
  )
  (encoder): DebertaV2Encoder(
    (layer): ModuleList(
      (0): DebertaV2Layer(
        (attention): DebertaV2Attention(
          (self): DisentangledSelfAttention(
            (query_proj): Linear(in_features=768, out_features=768, bias=True)
            (key_proj): Linear(in_features=768, out_features=768, bias=True)
            (value_proj): Linear(in_features=768, out_features=768, bias=True)
            (pos_dropout): StableDropout()
            (dropout): StableDropout()
          )
          (output): DebertaV2SelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
            (dropout): StableDropout()
          )
        )
        (intermediate): Deb

In [None]:
# Replace the caption frame with its embedding features (captions truncated to 128 tokens).
features_git = get_df_text_emb(
    df=features_git,
    col='img_to_text',
    model=model_microsoft_mdeberta_v3_base,
    tokenizer=tokenizer,
    prefix_column_name=f'X_img_to_text_{name_model}',
    max_length=128,
)

In [None]:
# Persist the test features without the index column.
test_features_path = '/kaggle/working/test_features_git.csv'
features_git.to_csv(test_features_path, index=False)

In [None]:
# Show the first two rows of the test feature frame.
features_git.head(2)

Unnamed: 0,X_img_to_text_microsoft_mdeberta_v3_base_0,X_img_to_text_microsoft_mdeberta_v3_base_1,X_img_to_text_microsoft_mdeberta_v3_base_2,X_img_to_text_microsoft_mdeberta_v3_base_3,X_img_to_text_microsoft_mdeberta_v3_base_4,X_img_to_text_microsoft_mdeberta_v3_base_5,X_img_to_text_microsoft_mdeberta_v3_base_6,X_img_to_text_microsoft_mdeberta_v3_base_7,X_img_to_text_microsoft_mdeberta_v3_base_8,X_img_to_text_microsoft_mdeberta_v3_base_9,...,X_img_to_text_microsoft_mdeberta_v3_base_758,X_img_to_text_microsoft_mdeberta_v3_base_759,X_img_to_text_microsoft_mdeberta_v3_base_760,X_img_to_text_microsoft_mdeberta_v3_base_761,X_img_to_text_microsoft_mdeberta_v3_base_762,X_img_to_text_microsoft_mdeberta_v3_base_763,X_img_to_text_microsoft_mdeberta_v3_base_764,X_img_to_text_microsoft_mdeberta_v3_base_765,X_img_to_text_microsoft_mdeberta_v3_base_766,X_img_to_text_microsoft_mdeberta_v3_base_767
0,-0.000919,-0.002168,-0.005272,0.000227,-0.005942,-0.004003,0.010319,0.01405,-0.006532,0.007129,...,0.02245,0.007166,-0.000511,-0.002254,0.003884,-0.000847,-0.000633,-0.008442,-0.010409,0.015712
1,-0.001136,-0.004582,-0.006688,0.001313,-0.007222,-0.004796,0.011975,0.014479,-0.006925,0.007995,...,0.016868,0.007232,0.000297,-0.002067,0.003754,0.000552,-0.004885,-0.009062,-0.011373,0.016268
