# Task understanding

Идея в том, чтобы классифицировать категории по картинкам и, если будет хоть какой-то успех, взять выходы модели в качестве признаков.

PS: Надежды, что обычный классификатор справится, у меня нет.

# Library

In [None]:
%%capture
!pip install transformers==4.26.1
!pip install datasets==2.9.0
!pip install sentencepiece==0.1.97

In [None]:
import re
import os
import shutil
import random
import time
import zipfile
import pandas as pd
import numpy as np
import json
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from glob import glob
from tqdm import tqdm
from matplotlib import pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModel
from torchvision import datasets, transforms
from torch.nn import functional as F
import torchvision as tv

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Variables

In [None]:
# Mini-batch size used by the DataLoaders in this notebook.
BATCH_SIZE = 64

prefix = ''

# Files kept on Google Drive: the data archive and the saved label-encoder classes.
PATH_ZIP_FILE = '/content/drive/MyDrive/Colab Notebooks/PetProject 2023/test_kazan_express/internship_2023.zip'
PATH_CLASSES = '/content/drive/MyDrive/Colab Notebooks/PetProject 2023/test_kazan_express/X/classes.npy'

# Flat image folders for the train and test sets.
PATH_IMAGES_TRAIN = '/content/images/train/'
PATH_IMAGES_TEST = '/content/images/test/'

# Flat image folders holding the train / validation parts of the split.
PATH_IMAGES_X_TRAIN = '/content/images/X_train/'
PATH_IMAGES_X_VAL = '/content/images/X_val/'

# Folder-per-label layouts consumed by torchvision's ImageFolder.
PATH_IMAGES_TRAIN_CLS = '/content/images/train_cls/'
PATH_IMAGES_VAL_CLS = '/content/images/val_cls/'
PATH_IMAGES_TEST_CLS = '/content/images/test_cls/'
PATH_IMAGES_FEATURE_CLS = '/content/images/feature_cls/'

In [None]:
class param:
    """Namespace of run settings; attributes are read directly, never instantiated."""

    # --- run switches ---
    # When True the data frames are truncated to their first 500 rows.
    is_check_code = False
    # Not referenced in the visible part of the notebook.
    is_pretrain_df = False

    # --- data split ---
    # Fraction of samples held out by train_test_split, and its random seed.
    test_size = 0.33
    seed = 42

    # --- optimisation (Adam) ---
    lr = 3e-4
    weight_decay = 1e-6
    num_epochs = 100

    # --- model ---
    # Width of the final Linear layer of the classifier head.
    num_classes = 874
    # NOTE(review): the classifier head hard-codes Dropout(0.3) and does not read this value.
    dropout = 0.5

# Helper functions

In [None]:
def copy_files(PATH_IMAGES_CLS: str, PATH_IMAGES: str, df: pd.DataFrame) -> None:
    """Sort entries of a flat folder into one sub-folder per ``target`` value.

    Every entry of ``PATH_IMAGES`` is expected to be named ``<product_id>.<ext>``
    (or ``<product_id>`` for a directory). It is copied to
    ``PATH_IMAGES_CLS/<target>/<entry>``, where ``target`` is looked up in ``df``
    by ``product_id``.

    Args:
        PATH_IMAGES_CLS: Destination root. Created if missing, together with one
            sub-folder per distinct value of ``df['target']``.
        PATH_IMAGES: Source folder that is scanned (non-recursively).
        df: Frame with ``product_id`` and integer ``target`` columns. If a
            ``product_id`` occurs more than once, its last row wins.

    Raises:
        OSError: If a folder cannot be created or a copy fails.
    """
    os.makedirs(PATH_IMAGES_CLS, exist_ok=True)
    for target in df['target'].unique():
        os.makedirs(os.path.join(PATH_IMAGES_CLS, str(target)), exist_ok=True)

    # Build the lookup once: filtering the frame for every file is quadratic.
    target_by_product = dict(zip(df['product_id'], df['target']))

    for entry in os.listdir(PATH_IMAGES):
        stem = entry.split('.')[0]
        try:
            target = int(target_by_product[int(stem)])
        except (ValueError, KeyError):
            # Name is not a known product id, so there is no class to file it under.
            continue

        src = os.path.join(PATH_IMAGES, entry)
        dst = os.path.join(PATH_IMAGES_CLS, str(target), entry)
        if os.path.isfile(src):
            shutil.copy(src, dst)
        elif os.path.isdir(src):
            # Portable replacement for the Windows-only ``rd /S /Q``: drop any
            # previous copy so copytree does not fail on an existing target.
            shutil.rmtree(dst, ignore_errors=True)
            shutil.copytree(src, dst)

In [None]:
def evaluate_accuracy(data_iter, net, loss, device):
    """Run ``net`` over ``data_iter`` in eval mode and collect labels, predictions and loss.

    Args:
        data_iter: Iterable of ``(X, y)`` batches.
        net: Model called as ``net(X)``; expected to return per-class scores with
            classes along dim 1.
        loss: Callable ``loss(scores, y)`` returning a scalar tensor.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Tuple ``(val_y_true, val_y_pred, val_loss, val_iters)``: two lists of 0-d
        CPU tensors (one entry per sample), the summed batch loss, and the number
        of batches. ``net`` is left in eval mode.
    """
    val_loss, val_iters = 0., 0.
    val_y_true, val_y_pred = [], []
    net.eval()
    # No gradients are needed for evaluation; without this every forward pass
    # builds and keeps an autograd graph for the trainable layers.
    with torch.no_grad():
        for X, y in data_iter:
            X, y = X.to(device), y.to(device)
            y_logits = net(X)
            val_loss += loss(y_logits, y).item()
            val_iters += 1
            # argmax of the raw scores equals argmax of their softmax.
            # Results are moved to the CPU so the lists do not pin device memory.
            val_y_true += y.cpu()
            val_y_pred += y_logits.argmax(dim=1).cpu()
    return val_y_true, val_y_pred, val_loss, val_iters

In [None]:
def training(net, train_iter, test_iter, optimizer, num_epochs, device):
    """Train ``net`` with cross-entropy and print per-epoch loss and weighted F1.

    Each epoch runs one pass over ``train_iter`` with gradient updates, then
    evaluates on ``test_iter`` through ``evaluate_accuracy`` and prints a single
    summary line. The reported time covers both training and evaluation.

    Args:
        net: Model called as ``net(X)``, returning per-class scores along dim 1.
        train_iter: Iterable of ``(X, y)`` training batches.
        test_iter: Iterable of ``(X, y)`` validation batches.
        optimizer: Optimizer already bound to the parameters to update.
        num_epochs: Number of passes over ``train_iter``.
        device: Device the batches are moved to.
    """
    loss = torch.nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        train_loss, train_iters = 0., 0.
        train_y_true, train_y_pred = [], []
        start = time.time()

        net.train()
        for X, y in train_iter:
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()
            y_hat = net(X)
            l = loss(y_hat, y)
            l.backward()
            optimizer.step()
            # One bulk transfer per batch to plain ints, instead of keeping
            # thousands of per-sample device tensors alive until the epoch ends.
            train_y_true += y.tolist()
            train_y_pred += y_hat.argmax(dim=1).tolist()
            train_loss += l.item()
            train_iters += 1

        val_y_true, val_y_pred, val_loss, val_iters = evaluate_accuracy(test_iter, net, loss, device)
        # evaluate_accuracy returns per-sample entries; int() accepts both
        # 0-d tensors and plain numbers.
        val_y_true = [int(t) for t in val_y_true]
        val_y_pred = [int(t) for t in val_y_pred]

        train_f1 = f1_score(train_y_true, train_y_pred, average='weighted')
        val_f1 = f1_score(val_y_true, val_y_pred, average='weighted')

        message = (f"ep: {epoch}, taked: {time.time() - start:.3f},"
            f" train_loss: {train_loss / train_iters:.3f},"
            f" train_f1_score: {train_f1:.3f},"
            f" val_loss: {val_loss / val_iters:.3f},"
            f" val_f1_score: {val_f1:.3f}"
            )
        print(message)

In [None]:
def prepare_img_files(PATH_IMAGES_CLS: str, PATH_IMAGES: str, df: pd.DataFrame) -> None:
    """Copy entries of a flat folder into one sub-folder per ``target_product_id``.

    Every entry of ``PATH_IMAGES`` is expected to be named ``<product_id>.<ext>``
    (or ``<product_id>`` for a directory). It is copied to
    ``PATH_IMAGES_CLS/<target_product_id>/<entry>``, where the id is looked up in
    ``df`` by ``product_id``.

    Args:
        PATH_IMAGES_CLS: Destination root. Created if missing, together with one
            sub-folder per distinct value of ``df['target_product_id']``.
        PATH_IMAGES: Source folder that is scanned (non-recursively).
        df: Frame with ``product_id`` and integer ``target_product_id`` columns.
            If a ``product_id`` occurs more than once, its last row wins.

    Raises:
        OSError: If a folder cannot be created or a copy fails.
    """
    # Create folders one by one. A single try-block around all mkdir calls
    # skipped every sub-folder as soon as the root already existed.
    os.makedirs(PATH_IMAGES_CLS, exist_ok=True)
    for encoded_id in df['target_product_id'].unique():
        os.makedirs(os.path.join(PATH_IMAGES_CLS, str(encoded_id)), exist_ok=True)

    # Build the lookup once: filtering the frame for every file is quadratic.
    encoded_by_product = dict(zip(df['product_id'], df['target_product_id']))

    for entry in os.listdir(PATH_IMAGES):
        stem = entry.split('.')[0]
        try:
            encoded_id = int(encoded_by_product[int(stem)])
        except (ValueError, KeyError):
            # Name is not a known product id, so there is no folder to file it under.
            continue

        src = os.path.join(PATH_IMAGES, entry)
        dst = os.path.join(PATH_IMAGES_CLS, str(encoded_id), entry)
        if os.path.isfile(src):
            shutil.copy(src, dst)
        elif os.path.isdir(src):
            # Portable replacement for the Windows-only ``rd /S /Q``: drop any
            # previous copy so copytree does not fail on an existing target.
            shutil.rmtree(dst, ignore_errors=True)
            shutil.copytree(src, dst)

In [None]:
def get_df_vgg16_emb(features : np.ndarray, prefix_column_name : str) -> pd.DataFrame:
    """Turn a ``(n_features, n_samples)`` matrix into a frame with one row per sample.

    Args:
        features: 2-D array-like laid out as ``(n_features, n_samples)``.
        prefix_column_name: Column names are ``f"{prefix_column_name}_{i}"`` for
            ``i`` in ``range(n_features)``.

    Returns:
        DataFrame of shape ``(n_samples, n_features)``. Zero samples give an
        empty frame that still has all columns.

    Raises:
        ValueError: If ``features`` is not 2-D.
    """
    matrix = np.asarray(features)
    if matrix.ndim != 2:
        raise ValueError(
            f'features must be 2-D (n_features, n_samples), got shape {matrix.shape}'
        )
    columns = [prefix_column_name + '_' + str(i) for i in range(matrix.shape[0])]
    # One vectorised transpose replaces the nested Python loops; copy so the
    # frame never aliases the caller's array.
    return pd.DataFrame(matrix.T.copy(), columns=columns)

# Load data

In [None]:
# Mount Google Drive so the files under /content/drive become readable (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Unpack the data archive into the Colab working directory.
with zipfile.ZipFile(PATH_ZIP_FILE) as archive:
    archive.extractall(path='/content/')

In [None]:
df = pd.read_parquet('/content/train.parquet', engine='pyarrow')
le = LabelEncoder()
le.classes_ = np.load(PATH_CLASSES)
# Use transform, not fit_transform: fit_transform refits the encoder on this
# frame and silently discards the classes loaded on the previous line. With
# transform, a category missing from the saved classes raises instead of
# shifting every label.
df['target'] = le.transform(df['category_id'])
# Smoke-test mode: keep only the first 500 rows.
if param.is_check_code:
    df = df.head(500)
df.head(2)

Unnamed: 0,product_id,category_id,sale,shop_id,shop_title,rating,text_fields,category_name,target
0,325286,12171,False,9031,Aksik,5.0,"{""title"": ""Зарядный кабель Borofone BX1 Lightn...",Все категории->Электроника->Смартфоны и телефо...,251
1,888134,14233,False,18305,Sela,5.0,"{""title"": ""Трусы Sela"", ""description"": ""Трусы-...",Все категории->Одежда->Женская одежда->Белье и...,748


## Split train and val

In [None]:
print(df.shape)
# Classes with exactly one sample are set aside before the split
# (the stratified split needs at least two samples per class).
target_counts = df['target'].value_counts()
singleton_targets = target_counts[target_counts == 1].index.values
X_one_example = df[df['target'].isin(singleton_targets)]
df_for_train = df.drop(index=list(X_one_example.index.values))
print(df_for_train.shape)

(91120, 9)
(91116, 9)


In [None]:
# X is the full frame (target column included); y is the target as a one-column frame.
X = df_for_train
y = X[['target']]
X.shape, y.shape

((91116, 9), (91116, 1))

In [None]:
# Stratified hold-out split; the seed keeps the partition reproducible.
split_parts = train_test_split(
    X,
    y,
    test_size=param.test_size,
    random_state=param.seed,
    shuffle=True,
    stratify=y,
)
X_train, X_val, y_train, y_val = split_parts
tuple(len(part) for part in split_parts)

(61047, 30069, 61047, 30069)

In [None]:
# Put the single-sample classes back: they go into the training part only.
y_one_example = X_one_example[['target']]

X_train = pd.concat([X_train, X_one_example])
y_train = pd.concat([y_train, y_one_example])

print(X_train.shape)
print(y_train.shape)

(61051, 9)
(61051, 1)


In [None]:
# Copy the images of the training part into their own flat folder.
dest = PATH_IMAGES_X_TRAIN
# exist_ok makes re-running the cell safe without hiding real failures
# (permissions, missing parent), which the bare `except: pass` swallowed.
os.makedirs(dest, exist_ok=True)
for img_name in X_train['product_id']:
    src = f'{PATH_IMAGES_TRAIN}{str(img_name)}.jpg'
    shutil.copy2(src, dest)
len(os.listdir(PATH_IMAGES_X_TRAIN))

61051

In [None]:
# Copy the images of the validation part into their own flat folder.
dest = PATH_IMAGES_X_VAL
# exist_ok makes re-running the cell safe without hiding real failures
# (permissions, missing parent), which the bare `except: pass` swallowed.
os.makedirs(dest, exist_ok=True)
for img_name in X_val['product_id']:
    src = f'{PATH_IMAGES_TRAIN}{str(img_name)}.jpg'
    shutil.copy2(src, dest)
len(os.listdir(PATH_IMAGES_X_VAL))

30069

In [None]:
# Arrange the training images into one folder per class, then count the class folders.
copy_files(PATH_IMAGES_CLS=PATH_IMAGES_TRAIN_CLS, PATH_IMAGES=PATH_IMAGES_X_TRAIN, df=X_train)
len(os.listdir(PATH_IMAGES_TRAIN_CLS))

874

In [None]:
# Total number of files across all training class folders.
k = sum(
    len(os.listdir(f'/content/images/train_cls/{i}'))
    for i in os.listdir('/content/images/train_cls/')
)
k

61051

In [None]:
# Arrange the validation images into one folder per class, then count the class folders.
copy_files(PATH_IMAGES_CLS=PATH_IMAGES_VAL_CLS, PATH_IMAGES=PATH_IMAGES_X_VAL, df=X_val)
len(os.listdir(PATH_IMAGES_VAL_CLS))

870

In [None]:
# Total number of files across all validation class folders.
k = sum(
    len(os.listdir(f'/content/images/val_cls/{i}'))
    for i in os.listdir('/content/images/val_cls/')
)
k

30069

# Data preparation

In [None]:
# Preprocessing applied to every image: resize, centre-crop to 224x224,
# convert to a tensor and normalise each channel.
base_steps = [
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
data_transforms = {'base': transforms.Compose(base_steps)}

In [None]:
train_dataset = datasets.ImageFolder(PATH_IMAGES_TRAIN_CLS, data_transforms['base'])
val_dataset = datasets.ImageFolder(PATH_IMAGES_VAL_CLS, data_transforms['base'])

# ImageFolder numbers classes by position in its own sorted folder list. The
# validation folder does not contain every training class, so the same index
# would mean a different class in the two datasets. Re-express validation
# labels in the training numbering so they match the model's output units.
val_idx_to_train_idx = [train_dataset.class_to_idx[name] for name in val_dataset.classes]
val_dataset.target_transform = val_idx_to_train_idx.__getitem__

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Shuffling has no effect on validation metrics, so keep the order fixed.
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Modeling and evaluation

In [None]:
# Load torchvision's VGG16 with pretrained weights (downloaded on first use);
# the bare expression below displays the architecture.
model_vgg16_fine = tv.models.vgg16(pretrained=True)
model_vgg16_fine

Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth


  0%|          | 0.00/528M [00:00<?, ?B/s]

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

In [None]:
# Freeze every pretrained weight; only layers created after this point will train.
for frozen in model_vgg16_fine.parameters():
    frozen.requires_grad = False

In [None]:
# Replace the stock VGG16 head with a fresh, trainable one that ends in
# param.num_classes outputs. Dropout is fixed at 0.3 here.
classifier_layers = [
    nn.Dropout(p=0.3),
    nn.Linear(25088, 4096),
    nn.BatchNorm1d(4096),
    nn.ReLU(),
    nn.Dropout(p=0.3),
    nn.Linear(4096, 2048),
    nn.BatchNorm1d(2048),
    nn.ReLU(),
    nn.Linear(2048, param.num_classes),
]
model_vgg16_fine.classifier = nn.Sequential(*classifier_layers)
model_vgg16_fine.to(device)

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

In [None]:
# Collect the parameters that still require gradients and list their names.
print("Params to learn:")
trainable = [
    (name, par)
    for name, par in model_vgg16_fine.named_parameters()
    if par.requires_grad
]
params_to_update = [par for _, par in trainable]
for name, _ in trainable:
    print("\t",name)

Params to learn:
	 classifier.1.weight
	 classifier.1.bias
	 classifier.2.weight
	 classifier.2.bias
	 classifier.5.weight
	 classifier.5.bias
	 classifier.6.weight
	 classifier.6.bias
	 classifier.8.weight
	 classifier.8.bias


In [None]:
# Optimise only the trainable parameters with Adam, then run the training loop.
num_epochs = param.num_epochs
trainer = torch.optim.Adam(
    params=params_to_update,
    lr=param.lr,
    weight_decay=param.weight_decay,
)
training(
    net=model_vgg16_fine,
    train_iter=train_loader,
    test_iter=val_loader,
    optimizer=trainer,
    num_epochs=num_epochs,
    device=device,
)

KeyboardInterrupt: ignored

In [None]:
# Persist only the weights (state_dict), as described in
# https://pytorch.org/tutorials/beginner/saving_loading_models.html
PATH = '/content/model.torch'
weights = model_vgg16_fine.state_dict()
torch.save(weights, PATH)

# Inference train

In [None]:
# Encode product ids as consecutive integers; they become folder names
# and are decoded again after inference.
le_product_id = LabelEncoder().fit(df['product_id'])
df['target_product_id'] = le_product_id.transform(df['product_id'])

In [None]:
# One folder per encoded product id, filled from the training images.
prepare_img_files(
    PATH_IMAGES_CLS=PATH_IMAGES_TEST_CLS,
    PATH_IMAGES=PATH_IMAGES_TRAIN,
    df=df,
)

In [None]:
# Unshuffled loader over the per-product folders; the label of each sample
# identifies its folder.
image_datasets = datasets.ImageFolder(
    root=PATH_IMAGES_TEST_CLS,
    transform=data_transforms['base'],
)
dloader = torch.utils.data.DataLoader(
    dataset=image_datasets,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

In [None]:
# Collect, per image, the dataset label and the model's output scores.
prod_id_logits : list = []
prod_id : list = []

model_vgg16_fine.eval()
# Inference only: without no_grad every stored output keeps its autograd
# graph (and device memory) alive for the whole loop.
with torch.no_grad():
    for X, y in tqdm(dloader):
        X = X.to(device)
        y_logits = model_vgg16_fine(X)
        prod_id += y.tolist()
        # One CPU row tensor per sample.
        prod_id_logits += y_logits.cpu()

  0%|          | 1/1424 [01:33<36:54:42, 93.38s/it]


KeyboardInterrupt: ignored

In [None]:
# The tensors already exist, so convert them directly; re-wrapping them in
# torch.tensor(...) copies them and emits a UserWarning.
features_vgg16 : list = [i.detach().cpu().numpy() for i in prod_id_logits]

# An ImageFolder label is an index into its string-sorted folder names
# ('0', '1', '10', '100', ...), not the encoded product id itself. Map each
# label back to its folder name before decoding, otherwise the features are
# attached to the wrong products.
folder_labels = [int(image_datasets.classes[int(idx)]) for idx in prod_id]
product_id = le_product_id.inverse_transform(folder_labels)

  features_vgg16.append(torch.tensor(i).detach().cpu().numpy())


In [None]:
# Stack the per-image score vectors, turn them into named columns and
# append the matching product ids as a last column.
stacked_scores = np.array(features_vgg16).T
features_vgg16 : pd.DataFrame = get_df_vgg16_emb(stacked_scores, 'vgg16_fine')
product_id_column = pd.DataFrame({'product_id': product_id})
features_vgg16 = pd.concat([features_vgg16, product_id_column], axis=1)

In [None]:
# Keep only products that received features, in the row order of df.
df1 = df[['product_id']].reset_index(drop=True)
result = df1.merge(features_vgg16, how='inner', on='product_id')
result.shape

(64, 875)

In [None]:
# Write the training-set features without the index column.
result.to_csv(path_or_buf='train_features_vgg16_fine.csv', index=False)

# Inference test

In [None]:
# Load the test set; in smoke-test mode keep only the first 500 rows.
df = pd.read_parquet(path='/content/test.parquet', engine='pyarrow')
if param.is_check_code:
    df = df.head(500)
df.head(2)

Unnamed: 0,product_id,sale,shop_id,shop_title,rating,text_fields
1,1997646,False,22758,Sky_Electronics,5.0,"{""title"": ""Светодиодная лента Smart led Strip ..."
2,927375,False,17729,Di-Di Market,4.405941,"{""title"": ""Стекло ПЛЕНКА керамик матовое Honor..."


In [None]:
# Encode the test product ids as consecutive integers; they become folder
# names and are decoded again after inference.
le_product_id = LabelEncoder().fit(df['product_id'])
df['target_product_id'] = le_product_id.transform(df['product_id'])

In [None]:
# One folder per encoded product id, filled from the test images.
PATH_IMAGES_FEATURE_CLS = '/content/images/feature_cls/'
prepare_img_files(
    PATH_IMAGES_CLS=PATH_IMAGES_FEATURE_CLS,
    PATH_IMAGES=PATH_IMAGES_TEST,
    df=df,
)

In [None]:
# Unshuffled loader over the per-product folders; the label of each sample
# identifies its folder.
image_datasets = datasets.ImageFolder(
    root=PATH_IMAGES_FEATURE_CLS,
    transform=data_transforms['base'],
)
dloader = torch.utils.data.DataLoader(
    dataset=image_datasets,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

In [None]:
# Collect, per image, the dataset label and the model's output scores.
prod_id_logits : list = []
prod_id : list = []

model_vgg16_fine.eval()
# Inference only: without no_grad every stored output keeps its autograd
# graph (and device memory) alive for the whole loop.
with torch.no_grad():
    for X, y in tqdm(dloader):
        X = X.to(device)
        y_logits = model_vgg16_fine(X)
        prod_id += y.tolist()
        # One CPU row tensor per sample.
        prod_id_logits += y_logits.cpu()

  1%|          | 3/264 [02:40<3:53:07, 53.59s/it]


KeyboardInterrupt: ignored

In [None]:
# The tensors already exist, so convert them directly; re-wrapping them in
# torch.tensor(...) copies them and emits a UserWarning.
features_vgg16 : list = [i.detach().cpu().numpy() for i in prod_id_logits]

# An ImageFolder label is an index into its string-sorted folder names
# ('0', '1', '10', '100', ...), not the encoded product id itself. Map each
# label back to its folder name before decoding, otherwise the features are
# attached to the wrong products.
folder_labels = [int(image_datasets.classes[int(idx)]) for idx in prod_id]
product_id = le_product_id.inverse_transform(folder_labels)

  features_vgg16.append(torch.tensor(i).detach().cpu().numpy())


In [None]:
# Stack the per-image score vectors, turn them into named columns and
# append the matching product ids as a last column.
stacked_scores = np.array(features_vgg16).T
features_vgg16 : pd.DataFrame = get_df_vgg16_emb(stacked_scores, 'vgg16_fine')
product_id_column = pd.DataFrame({'product_id': product_id})
features_vgg16 = pd.concat([features_vgg16, product_id_column], axis=1)

In [None]:
# Keep only products that received features, in the row order of df.
df1 = df[['product_id']].reset_index(drop=True)
result = df1.merge(features_vgg16, how='inner', on='product_id')
result.shape

(192, 875)

In [None]:
# Write the test-set features without the index column.
result.to_csv(path_or_buf='test_features_vgg16_fine.csv', index=False)