# Проект `Movie by frame`

In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [1]:
import os
import random
import shutil
import torch
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import CLIPModel, CLIPProcessor
from PIL import Image
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim

import warnings

warnings.filterwarnings("ignore")

In [2]:
# Функция фиксирует seed для корректного сравнения оптимизаторов
def seed_everything(seed):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_everything(13)

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"

device

'cuda'

## Подготовка датасета (`train`, `val`)

In [5]:
!gdown 'https://drive.google.com/uc?export=download&id=1-AfgwyKszlHKuAUeqrApnq5A844IcIuR' -O "/content/"

Downloading...
From (original): https://drive.google.com/uc?export=download&id=1-AfgwyKszlHKuAUeqrApnq5A844IcIuR
From (redirected): https://drive.google.com/uc?export=download&id=1-AfgwyKszlHKuAUeqrApnq5A844IcIuR&confirm=t&uuid=cae397e2-4ec4-40ae-980b-1201d0ff228b
To: /content/movies_all_processed.zip
100% 3.51G/3.51G [01:36<00:00, 36.3MB/s]


In [6]:
%%capture

!unzip -q "/content/movies_all_processed.zip" -d "/content/movies_all_processed"

In [4]:
DATASET_ROOT = Path("/content/movies_all_processed")
TRAIN_DIR = Path("/content/dataset_train")
VAL_DIR = Path("/content/dataset_val")

In [8]:
MOVIE_FOLDERS = [p for p in DATASET_ROOT.iterdir() if p.is_dir()]
train_files, val_files = [], []

for movie_folder in MOVIE_FOLDERS:
    images = list(movie_folder.rglob("*"))
    train_imgs, val_imgs = train_test_split(images, test_size=0.2, random_state=42)
    train_files.extend([(img, movie_folder.name) for img in train_imgs])
    val_files.extend([(img, movie_folder.name) for img in val_imgs])

def copy_files(file_list, dest_dir):
    dest_dir.mkdir(parents=True, exist_ok=True)
    for img_path, movie_name in file_list:
        movie_dest = dest_dir / movie_name
        movie_dest.mkdir(parents=True, exist_ok=True)
        shutil.copy(img_path, movie_dest / img_path.name)

# Копируем файлы
copy_files(train_files, TRAIN_DIR)
copy_files(val_files, VAL_DIR)

print(f"Train: {len(train_files)} images, Val: {len(val_files)} images")

Train: 59195 images, Val: 14928 images


## Подготовка класса `Dataset`

In [5]:
class MovieDataset(Dataset):
    def __init__(self, root_dir, processor):
        self.root_dir = Path(root_dir)
        self.samples = []
        self.processor = processor
        self.label2idx = {label: idx for idx, label in enumerate(sorted(os.listdir(self.root_dir)))}
        self.idx2label = {idx: label for label, idx in self.label2idx.items()}

        for movie_folder in self.root_dir.iterdir():
            for img_path in movie_folder.rglob("*"):
                self.samples.append((img_path, movie_folder.name))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        img_path, movie_name = self.samples[idx]
        image = Image.open(img_path).convert("RGB")
        text = f"This is a movie {movie_name}."

        inputs = self.processor(text=text, images=image, return_tensors="pt", padding='max_length')
        return inputs


## Дообучение `CLIP`

In [6]:
import wandb

wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mlolbl4[0m ([33mlolbl4-hse[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [7]:
!pip install pytorch-lightning



In [17]:
import pytorch_lightning as pl

class CLIPFineTuner(pl.LightningModule):
    def __init__(self, *, model_name, lr=1e-6):
        super().__init__()
        self.save_hyperparameters()
        self.model = CLIPModel.from_pretrained(f"openai/{model_name}")
        self.model.to(device)
        self.lr = lr

    def forward(self, images, input_ids, attention_mask):
        return self.model(input_ids=input_ids, attention_mask=attention_mask, pixel_values=images)

    def accuracy_at_k(self, logits_per_image, k=1):
        top_k_preds = torch.topk(logits_per_image, k, dim=1).indices
        correct = top_k_preds == torch.arange(logits_per_image.size(0), device=device).unsqueeze(1)
        return correct.float().mean()

    def training_step(self, batch, batch_idx):
        input_ids = batch['input_ids'].squeeze(1).to(device)
        attention_mask = batch['attention_mask'].squeeze(1).to(device)
        pixel_values = batch['pixel_values'].squeeze(1).to(device)

        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values)
        logits_per_image = outputs.logits_per_image
        logits_per_text = outputs.logits_per_text

        labels = torch.arange(logits_per_image.size(0), device=device)
        loss_img = nn.CrossEntropyLoss()(logits_per_image, labels)
        loss_txt = nn.CrossEntropyLoss()(logits_per_text, labels)

        loss = (loss_img + loss_txt) / 2
        acc = self.accuracy_at_k(logits_per_image)
        self.log('train_loss', loss, prog_bar=True, on_epoch=True, logger=True)
        self.log('train_acc', acc, prog_bar=True, on_epoch=True, logger=True)

        return loss

    def validation_step(self, batch, batch_idx):
        input_ids = batch['input_ids'].squeeze(1).to(device)
        attention_mask = batch['attention_mask'].squeeze(1).to(device)
        pixel_values = batch['pixel_values'].squeeze(1).to(device)

        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values)
        logits_per_image = outputs.logits_per_image
        logits_per_text = outputs.logits_per_text

        labels = torch.arange(logits_per_image.size(0), device=device)
        loss_img = nn.CrossEntropyLoss()(logits_per_image, labels)
        loss_txt = nn.CrossEntropyLoss()(logits_per_text, labels)

        loss = (loss_img + loss_txt) / 2
        acc = self.accuracy_at_k(logits_per_image)
        self.log('val_loss', loss, prog_bar=True, on_epoch=True, logger=True)
        self.log('val_acc', acc, prog_bar=True, on_epoch=True, logger=True)

        return loss

    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), lr=self.lr, weight_decay=1e-2)
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)
        return [optimizer], [scheduler]


In [14]:
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint

BATCH_SIZE = 64

def train_model(run_name, model_name, epochs=5):
    seed_everything(13)

    processor = CLIPProcessor.from_pretrained(f"openai/{model_name}")
    train_dataset = MovieDataset(TRAIN_DIR, processor)
    val_dataset = MovieDataset(VAL_DIR, processor)
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

    model = CLIPFineTuner(model_name=model_name)

    wandb_logger = WandbLogger(log_model='all', project="movie-by-frame", name=run_name)
    checkpoint_callback = ModelCheckpoint(
        monitor="val_acc", mode="max",
        save_top_k=1, filename=f"{run_name}-best",
        save_last=True
    )

    trainer = pl.Trainer(
        max_epochs=epochs, accelerator=device,
        logger=wandb_logger, enable_model_summary=False,
        callbacks=[checkpoint_callback], log_every_n_steps=15
    )
    trainer.fit(model, train_dataloader, val_dataloader)

    best_model_path = checkpoint_callback.best_model_path
    best_model = CLIPFineTuner.load_from_checkpoint(best_model_path)

    print(trainer.validate(best_model, val_dataloader))
    model_weights_path = f"/content/drive/MyDrive/weights/{run_name}-best-weights.pt"
    torch.save(best_model.state_dict(), model_weights_path)

    wandb.finish()


In [18]:
run_name = "clip-vit-b32-movies-all"
train_model(run_name, "clip-vit-base-patch32")

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation: |          | 0/? [00:00<?, ?it/s]

[{'val_loss': 4.5998334884643555, 'val_acc': 0.03242229297757149}]


0,1
epoch,▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▄▄▄▄▄▅▅▅▅▅▅▅▅▅▅▇▇▇▇▇█
train_acc_epoch,▁▆▇██
train_acc_step,▁▁▂▃▃▂▄▆▅▆▆▅▅▅▇▆▆▇▆█▆▆▇▅▆▇▆▆▆▇▇▇▇▇▆▇▇█▇▇
train_loss_epoch,█▃▂▁▁
train_loss_step,▇██▅▅▆▄▄▅▄▃▂▃▃▃▂▂▂▂▃▂▂▁▂▃▂▂▁▂▂▂▁▂▁▂▂▁▂▁▂
trainer/global_step,▁▁▁▁▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇████
val_acc,▁███▆█
val_loss,▁▆▄▆█▆

0,1
epoch,5.0
train_acc_epoch,0.84663
train_acc_step,0.82812
train_loss_epoch,0.33253
train_loss_step,0.38082
trainer/global_step,4625.0
val_acc,0.03242
val_loss,4.59983
