# Fine-Tuning + Inference API 

## 1. Подготовка данных. Для обучения использован поднабор RVL-CDIP (тестовая часть), так как официальный train set недоступен. Разделение выполнено стратифицированно

### Создание датасета для тренировки

In [1]:
import os
import shutil

In [2]:
classes = ['invoice', 'letter', 'email', 'news_article']
number_of_files = 2000

In [None]:
# os.chdir('data')
# os.mkdir('train')

In [3]:
train_path = os.path.join('data', 'train')
test_path = os.path.join('data', 'test')

In [None]:
# os.chdir(os.path.join('..', '..'))
# os.getcwd()

'/home/ilgiz/ml-engineering-portfolio/doc_image_classification'

In [None]:
# os.chdir(dest_path)
# for i in range(len(classes)):
#     os.mkdir(classes[i])

# os.chdir(os.path.join('..', '..'))

In [25]:
os.chdir(os.path.join('..', '..'))
print(os.getcwd())

/home/ilgiz/ml-engineering-portfolio/doc_image_classification


In [38]:
# os.listdir(os.path.join(src_path, classes[0]))[0]
os.getcwd()

'/home/ilgiz/ml-engineering-portfolio/doc_image_classification'

In [None]:
os.path.join(train_path, classes[0])

In [None]:
file = os.listdir(os.path.join(test_path, classes[0]))[0]
shutil.copy(os.path.join(test_path, classes[0], file), os.path.join(train_path, classes[0]))

'data/train/invoice/ti16311152.tif'

In [None]:
# for c in classes:
#     for n in range(number_of_files):
#         file = os.listdir(os.path.join(src_path, c))[n]
#         shutil.copy(os.path.join(src_path, c, file), os.path.join(dest_path, c))

In [None]:
for c in classes:
    print(f'Class: {c} | amount of objects: {len(os.listdir(os.path.join(train_path, c)))}')

Class: invoice | amount of objects: 2000
Class: letter | amount of objects: 2000
Class: email | amount of objects: 2000
Class: news_article | amount of objects: 2000


In [None]:
os.listdir(os.path.join(test_path, classes[0]))[0]

'ti16311152.tif'

### Создание `transform` и своего класса `Dataset`

In [8]:
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import pandas as pd
from torchvision.transforms import v2

In [9]:
transform_train = v2.Compose([
    v2.ToImage(),    # Конвертируем PIL -> image
    v2.ToDtype(torch.float32, scale=True),   # Масштабируем [0, 255] -> [0, 1]
    v2.Resize((224, 224), antialias=True), 
    v2.RandomAffine(degrees=3, translate=(.05, .05), scale=(.95, 1.05)),    # лёгкий поворот + сдвиг + масштаб
    v2.GaussianNoise(mean=.0, sigma=.01),
    v2.Normalize([.5, .5, .5], [.5, .5, .5])
])

transform_val = v2.Compose([
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True),
    v2.Resize((224, 224), antialias=True),
    v2.Normalize([.5, .5, .5], [.5, .5, .5])
])

In [None]:
class doc_ds(Dataset):
    def __init__(self, image_paths, labels, transform=None):
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform
        
    def __len__(self):
        return len(self.image_paths)
    
    def __getitem__(self, index):
        img = Image.open(self.image_paths[index]).convert('RGB')
        if self.transform: img = self.transform(img)
        label = torch.tensor(self.labels[index], dtype=torch.long)
        
        return img, label

In [11]:
df_docs = pd.DataFrame()

In [12]:
df_t1 = pd.DataFrame(columns=['doc_path', 'label'])
df_t2 = pd.DataFrame(columns=df_t1.columns)

In [17]:
classes

['invoice', 'letter', 'email', 'news_article']

In [36]:
class_rpath = os.path.relpath(os.path.join(train_path, classes[0]))
class_rpath

'data/train/invoice'

### Загрузка файлов и разделение на `train`, `validation`, `test`

In [None]:
tvt_files_df = pd.DataFrame(columns=['path', 'labels'])
for c in classes:
    class_rpath = os.path.relpath(os.path.join(train_path, c))
    temp_df = pd.DataFrame(columns=['path', 'labels'])
    temp_df['path'] = class_rpath + '/' + pd.Series(os.listdir(os.path.join(train_path, c)))
    temp_df['labels'] = c
    
    tvt_files_df = pd.concat((tvt_files_df, temp_df), axis=0, ignore_index=True)

tvt_files_df

# tvt_files_df # train, validate, test - data
# tvt_files_df


# invoice_files = pd.Series(os.listdir(os.path.join(train_path, classes[0])))
# letter_files = pd.Series(os.listdir(os.path.join(train_path, classes[1])))

# pd.concat((invoice_files, letter_files), ignore_index=True)

Unnamed: 0,path,labels
0,data/train/invoice/ti16311152.tif,invoice
1,data/train/invoice/2084022630.tif,invoice
2,data/train/invoice/2063235294.tif,invoice
3,data/train/invoice/83545557.tif,invoice
4,data/train/invoice/2029370755.tif,invoice
...,...,...
7995,data/train/news_article/tob14401.20.tif,news_article
7996,data/train/news_article/1003289799.tif,news_article
7997,data/train/news_article/2048367429.tif,news_article
7998,data/train/news_article/1002402701a.tif,news_article


In [146]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [150]:
tvt_files_df['label_id'] = le.fit_transform(tvt_files_df['labels'])
tvt_files_df

Unnamed: 0,path,labels,label_id
0,data/train/invoice/ti16311152.tif,invoice,1
1,data/train/invoice/2084022630.tif,invoice,1
2,data/train/invoice/2063235294.tif,invoice,1
3,data/train/invoice/83545557.tif,invoice,1
4,data/train/invoice/2029370755.tif,invoice,1
...,...,...,...
7995,data/train/news_article/tob14401.20.tif,news_article,3
7996,data/train/news_article/1003289799.tif,news_article,3
7997,data/train/news_article/2048367429.tif,news_article,3
7998,data/train/news_article/1002402701a.tif,news_article,3


In [151]:
tvt_files_df['label_id'].unique()

array([1, 2, 0, 3])

In [None]:
from torch.utils.data import random_split

In [152]:
full_doc_ds = doc_ds(tvt_files_df.path.to_list(), tvt_files_df.label_id.to_list(), transform=transform_train)

In [153]:
train_ds, val_ds, test_ds = random_split(full_doc_ds, [.7, .15, .15])

In [154]:
batch_size = 64

train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=True)

## fine-tuning модели

### Загрузка модели, заморозка слоев, добавление поледнего линейного слоя

In [117]:
import timm
import torch.nn as nn

In [118]:
model = timm.create_model('efficientnet_b0', pretrained=True)


for param in model.parameters():    # Замораживаем слои
    param.requires_grad = False

model.classifier = nn.Linear(model.classifier.in_features, 4)     # Добавляем новый полносвязный слой с активными градиентами


# Обображаем параметры модели
def show_model_info(model):
    for k, v in model.default_cfg.items():
        print(f'{k}: {v}')


show_model_info(model)

url: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b0_ra-3dd342df.pth
hf_hub_id: timm/efficientnet_b0.ra_in1k
architecture: efficientnet_b0
tag: ra_in1k
custom_load: False
input_size: (3, 224, 224)
fixed_input_size: False
interpolation: bicubic
crop_pct: 0.875
crop_mode: center
mean: (0.485, 0.456, 0.406)
std: (0.229, 0.224, 0.225)
num_classes: 1000
pool_size: (7, 7)
first_conv: conv_stem
classifier: classifier
license: apache-2.0


In [119]:
print([p.requires_grad for p in model.parameters()][-5:])  # последние 5 тензоров — должны быть True (голова)

[False, False, False, True, True]


### Обучение модели

In [132]:
epochs = 6
learning_rate = 1e-3    # 1e-3 - 3e-4
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.classifier.parameters(), lr=learning_rate)

In [130]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [131]:
model = model.to(device)

In [159]:
def train_model(model, loader, optimizer, criterion):
    total_loss, correct_preds = 0, 0
    for imgs, targets in loader:
        imgs = imgs.to(device); targets = targets.to(device)
        
        optimizer.zero_grad()
        
        preds = model(imgs)
        loss = criterion(preds, targets)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item() * imgs.size(0)
        correct_preds += (preds.argmax(dim=1) == targets).sum().item()
        
    return total_loss / len(loader.dataset), correct_preds / len(loader.dataset)            

In [160]:
def evaluate_model(model, loader, criterion):
    total_loss, correct_preds = 0, 0
    torch.inference_mode()
    for imgs, targets in loader:
        imgs = imgs.to(device); targets = targets.to(device)
        
        preds = model(imgs)
        loss = criterion(preds, targets)
        
        total_loss += loss.item() * imgs.size(0)
        correct_preds += (preds.argmax(dim=1) == targets).sum().item()  
        
    
    return total_loss / len(loader.dataset), correct_preds / len(loader.dataset)   

### Обучение модели

In [139]:
from tqdm import tqdm    # Полоска загрузки

In [None]:
total_train_acc, total_val_acc = [], []
total_train_loss, total_val_loss = [], []

for e in range(epochs):
    train_loss, train_acc = train_model(model, train_dl, optimizer, criterion)
    val_loss, val_acc = evaluate_model(model, val_dl, criterion)
    
    total_train_acc.append(train_acc); total_train_loss.append(train_loss)
    total_val_acc.append(val_acc); total_val_loss.append(val_acc)
    
    print(f"Epoch: {e+1}/{epochs} | Train accuracy: {train_acc:.4f}%, Train loss: {train_loss:.4f} | Validation accuracy: {val_acc:.4f}%, Validation loss: {val_loss:.4f}")

Epoch: 1/6 | Train accuracy: 0.0000%, Train loss: 0.7872 | Validation accuracy: 0.0000%, Validation loss: 0.6482


KeyboardInterrupt: 