# **main.py**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import re
import random

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import GradScaler, autocast
from torchvision import transforms
from torch.utils.data import DataLoader

from transformers import ViTModel, BertModel, BertTokenizer

import time
import gc
import numpy as np
from PIL import Image
import pandas as pd
from statistics import mode

In [4]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [5]:
!mkdir data

In [6]:
import zipfile
import os

import shutil

# コピー元のファイルパス
train_file_path = '/content/drive/MyDrive/Colab Notebooks/DLBasics2024_colab/dl_lecture_competition_pub/data/train.json'
valid_file_path = '/content/drive/MyDrive/Colab Notebooks/DLBasics2024_colab/dl_lecture_competition_pub/data/valid.json'
# コピー先のファイルパス
destination_train_file_path = 'data/train.json'
destination_valid_file_path = 'data/valid.json'

shutil.copyfile(train_file_path, destination_train_file_path)
shutil.copyfile(valid_file_path, destination_valid_file_path)

# アップロードしたzipファイルのパス
train_zip_file_path = '/content/drive/MyDrive/Colab Notebooks/DLBasics2024_colab/dl_lecture_competition_pub/data/train.zip'
valid_zip_file_path = '/content/drive/MyDrive/Colab Notebooks/DLBasics2024_colab/dl_lecture_competition_pub/data/valid.zip'

# 解凍先のディレクトリ
extract_dir = 'data'

# 解凍処理
with zipfile.ZipFile(train_zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)
with zipfile.ZipFile(valid_zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print(f"ファイルを {extract_dir} に解凍しました。")

ファイルを data に解凍しました。


In [7]:
!pip install nltk
# !pip install pyspellchecker
!pip install transformers
!pip install efficientnet_pytorch

Collecting efficientnet_pytorch
  Downloading efficientnet_pytorch-0.7.1.tar.gz (21 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->efficientnet_pytorch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->efficientnet_pytorch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->efficientnet_pytorch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->efficientnet_pytorch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->efficientnet_pytorch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.

In [8]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
# from spellchecker import SpellChecker
from nltk.corpus import stopwords

# スペルチェッカーとストップワードの準備
# spell = SpellChecker()
stop_words = set(stopwords.words('english'))

def process_text(text):
    # lowercase
    text = text.lower()

    # 数詞を数字に変換
    num_word_to_digit = {
        r'\bzero\b': '0', r'\bone\b': '1', r'\btwo\b': '2', r'\bthree\b': '3', r'\bfour\b': '4',
        r'\bfive\b': '5', r'\bsix\b': '6', r'\bseven\b': '7', r'\beight\b': '8', r'\bnine\b': '9',
        r'\bten\b': '10'
    }
    for word, digit in num_word_to_digit.items():
        text = re.sub(word, digit, text)

    # 小数点のピリオドを削除
    text = re.sub(r'(?<!\d)\.(?!\d)', '', text)

    # 冠詞の削除
    text = re.sub(r'\b(a|an|the)\b', '', text)

    # 短縮形のカンマの追加
    contractions = {
        r'\bdont\b': "don't", r'\bisnt\b': "isn't", r'\barent\b': "aren't", r'\bwont\b': "won't",
        r'\bcant\b': "can't", r'\bwouldnt\b': "wouldn't", r'\bcouldnt\b': "couldn't"
    }
    for contraction, correct in contractions.items():
        text = re.sub(contraction, correct, text)

    # スペルチェックと修正
#     corrected_text = []
#     for word in text.split():
#         corrected_word = spell.correction(word)
#         if corrected_word is None:
#             corrected_word = word
#         corrected_text.append(corrected_word)
#     text = ' '.join(corrected_text)

    # 句読点をスペースに変換
    text = re.sub(r"[^\w\s':]", ' ', text)

    # ストップワードの削除
    text = ' '.join([word for word in text.split() if word not in stop_words])

    # 連続するスペースを1つに変換
    text = re.sub(r'\s+', ' ', text).strip()

    return text


## 1. データローダーの作成

In [10]:
from transformers import BertTokenizer
import torch
from PIL import Image
import pandas as pd
import numpy as np
from statistics import mode

class VQADataset(torch.utils.data.Dataset):
    def __init__(self, df_path, image_dir, transform=None, answer=True, max_length=128):
        self.transform = transform
        self.image_dir = image_dir
        self.df = pd.read_json(df_path)
        self.answer = answer
        self.max_length = max_length

        # BERTトークナイザーの初期化
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        # answerの辞書を作成
        self.answer2idx = {}
        self.idx2answer = {}

        if self.answer:
            for answers in self.df["answers"]:
                for answer in answers:
                    word = answer["answer"]
                    word = process_text(word)
                    if word not in self.answer2idx:
                        self.answer2idx[word] = len(self.answer2idx)
            self.idx2answer = {v: k for k, v in self.answer2idx.items()}

    def update_dict(self, dataset):
        self.answer2idx = dataset.answer2idx
        self.idx2answer = dataset.idx2answer

    def __getitem__(self, idx):
        image = Image.open(f"{self.image_dir}/{self.df['image'][idx]}")
        image = self.transform(image) if self.transform else image

        question = self.df["question"][idx]
        question = process_text(question)

        # BERTトークナイザーを使用して質問をエンコード
        question_encoded = self.tokenizer(question,
                                          padding='max_length',
                                          max_length=self.max_length,
                                          truncation=True,
                                          return_tensors="pt")

        # テンソルをスクイーズして余分な次元を削除
        question_encoded = {k: v.squeeze(0) for k, v in question_encoded.items()}

        if self.answer:
            answers = [self.answer2idx[process_text(answer["answer"])] for answer in self.df["answers"][idx]]
            mode_answer_idx = mode(answers)
            return image, question_encoded, torch.LongTensor(answers), torch.tensor(int(mode_answer_idx))
        else:
            return image, question_encoded

    def __len__(self):
        return len(self.df)

## 2. 評価指標の実装

In [11]:
# 簡単にするならBCEを利用する
def VQA_criterion(batch_pred: torch.Tensor, batch_answers: torch.Tensor):
    total_acc = 0.

    for pred, answers in zip(batch_pred, batch_answers):
        acc = 0.
        for i in range(len(answers)):
            num_match = 0
            for j in range(len(answers)):
                if i == j:
                    continue
                if pred == answers[j]:
                    num_match += 1
            acc += min(num_match / 3, 1)
        total_acc += acc / 10

    return total_acc / len(batch_pred)

## 3. モデルの実装

In [12]:
import torch
import torch.nn as nn
from efficientnet_pytorch import EfficientNet
from transformers import BertModel, BertTokenizer

class ImprovedVQAModel(nn.Module):
    def __init__(self, n_answer, efficientnet_version='efficientnet-b0'):
        super().__init__()

        # EfficientNet for image feature extraction
        self.efficientnet = EfficientNet.from_pretrained(efficientnet_version)
        self.efficientnet._fc = nn.Identity()  # Remove the last fully connected layer

        # Get the output features of EfficientNet
        self.efficientnet_out_features = self.efficientnet._conv_head.out_channels

        # BERT for question encoding
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.bert_out_features = self.bert.config.hidden_size

        # Projection layers
        self.image_projection = nn.Linear(self.efficientnet_out_features, 768)
        self.question_projection = nn.Linear(self.bert_out_features, 768)

        # Fusion and classification layers
        self.fusion = nn.MultiheadAttention(embed_dim=768, num_heads=8, batch_first=True)
        self.classifier = nn.Linear(768, n_answer)

        self.dropout = nn.Dropout(0.5)

    def forward(self, image, question_encoded):
        # Image encoding
        image_features = self.efficientnet(image)
        image_features = self.image_projection(image_features).unsqueeze(1)  # [batch_size, 1, 768]

        # Question encoding
        question_features = self.bert(**question_encoded).last_hidden_state
        question_features = self.question_projection(question_features)  # [batch_size, seq_len, 768]

        # Fusion
        fused_features, _ = self.fusion(image_features, question_features, question_features)
        fused_features = self.dropout(fused_features)

        # Classification
        output = self.classifier(fused_features.mean(dim=1))

        return output

## 4. 学習の実装

In [13]:
import time
import torch
import gc
from torch.cuda.amp import GradScaler, autocast

scaler = GradScaler()

def train(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    total_acc = 0
    simple_acc = 0
    n = 0
    start = time.time()
    for image, question_encoded, answers, mode_answer in dataloader:
        image = image.to(device)
        question_encoded = {k: v.to(device) for k, v in question_encoded.items()}
        answers = answers.to(device)
        mode_answer = mode_answer.to(device)

        optimizer.zero_grad()
        with autocast():
            pred = model(image, question_encoded)
            loss = criterion(pred, mode_answer)
        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()

        # デバッグ情報
        # for name, param in model.named_parameters():
        #     if param.grad is not None:
        #         print(f"{name} - grad norm: {param.grad.norm()}")

        total_loss += loss.item()
        total_acc += VQA_criterion(pred.argmax(1), answers)  # VQA accuracy
        simple_acc += (pred.argmax(1) == mode_answer).float().mean().item()  # simple accuracy
        n += 1
        print(f" [{n}/{len(dataloader)}] ", end="\r")

        # メモリをクリア
        torch.cuda.empty_cache()
        gc.collect()

    return total_loss / len(dataloader), total_acc / len(dataloader), simple_acc / len(dataloader), time.time() - start

def eval(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    total_acc = 0
    simple_acc = 0
    start = time.time()
    with torch.no_grad():
        for image, question_encoded, answers, mode_answer in dataloader:
            image = image.to(device)
            question_encoded = {k: v.to(device) for k, v in question_encoded.items()}
            answers = answers.to(device)
            mode_answer = mode_answer.to(device)

            with autocast():
                pred = model(image, question_encoded)
                loss = criterion(pred, mode_answer)

            total_loss += loss.item()
            total_acc += VQA_criterion(pred.argmax(1), answers)  # VQA accuracy
            simple_acc += (pred.argmax(1) == mode_answer).mean().item()  # simple accuracy

            # メモリをクリア
            torch.cuda.empty_cache()
            gc.collect()

    return total_loss / len(dataloader), total_acc / len(dataloader), simple_acc / len(dataloader), time.time() - start

In [14]:
# deviceの設定
set_seed(42)
device = "cuda" if torch.cuda.is_available() else "cpu"

# 画像のデータ拡張
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),  # ランダムにリサイズしてクロップ
    transforms.RandomHorizontalFlip(),  # 水平方向にランダムに反転
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),  # カラージッタ
    transforms.RandomRotation(10),  # ランダムに回転
    transforms.ToTensor(),  # テンソルに変換
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # 正規化
])

test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# データセットの初期化
train_dataset = VQADataset(df_path="/content/data/train.json", image_dir="/content/data/train", transform=train_transform)
test_dataset = VQADataset(df_path="/content/data/valid.json", image_dir="/content/data/valid", transform=test_transform, answer=False)
test_dataset.update_dict(train_dataset)

# データローダーの初期化
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=4)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=128, shuffle=False, num_workers=4)

# モデルの初期化
model = ImprovedVQAModel(n_answer=len(train_dataset.answer2idx)).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading: "https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b0-355c32eb.pth" to /root/.cache/torch/hub/checkpoints/efficientnet-b0-355c32eb.pth
100%|██████████| 20.4M/20.4M [00:00<00:00, 412MB/s]


Loaded pretrained weights for efficientnet-b0


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [17]:
# model = VQAModel(vocab_size=len(train_dataset.question2idx)+1, n_answer=len(train_dataset.answer2idx)).to(device)
# model = VQARes101Model(vocab_size=len(train_dataset.question2idx)+1, n_answer=len(train_dataset.answer2idx)).to(device)
# model = ImprovedVQAModel(vocab_size=len(train_dataset.question2idx)+1, n_answer=len(train_dataset.answer2idx)).to(device)

# optimizer / criterion
num_epoch = 10
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)

In [18]:
# train model
for epoch in range(num_epoch):
    train_loss, train_acc, train_simple_acc, train_time = train(model, train_loader, optimizer, criterion, device)
    print(f"【{epoch + 1}/{num_epoch}】\n"
            f"train time: {train_time:.2f} [s]\n"
            f"train loss: {train_loss:.4f}\n"
            f"train acc: {train_acc:.4f}\n"
            f"train simple acc: {train_simple_acc:.4f}")

# 提出用ファイルの作成
model.eval()
submission = []
with torch.no_grad():
    for image, question_encoded in test_loader:
        image = image.to(device)
        question_encoded = {k: v.to(device) for k, v in question_encoded.items()}

        pred = model(image, question_encoded)
        pred = pred.argmax(1).cpu().tolist()  # バッチ処理に対応
        submission.extend(pred)

submission = [train_dataset.idx2answer[id] for id in submission]
submission = np.array(submission)
torch.save(model.state_dict(), "model.pth")
np.save("submission.npy", submission)

【1/10】
train time: 232.92 [s]
train loss: 4.3260
train acc: 0.4941
train simple acc: 0.4255
【2/10】
train time: 233.35 [s]
train loss: 4.2952
train acc: 0.4923
train simple acc: 0.4249
【3/10】
train time: 232.95 [s]
train loss: 4.2415
train acc: 0.4911
train simple acc: 0.4247
【4/10】
train time: 233.33 [s]
train loss: 4.1956
train acc: 0.4914
train simple acc: 0.4267
【5/10】
train time: 232.59 [s]
train loss: 4.1600
train acc: 0.4929
train simple acc: 0.4294
【6/10】
train time: 232.93 [s]
train loss: 4.1094
train acc: 0.4935
train simple acc: 0.4308
【7/10】
train time: 233.68 [s]
train loss: 4.0710
train acc: 0.4928
train simple acc: 0.4316
【8/10】
train time: 233.69 [s]
train loss: 4.0362
train acc: 0.4923
train simple acc: 0.4320
【9/10】
train time: 233.09 [s]
train loss: 4.0552
train acc: 0.4885
train simple acc: 0.4283
【10/10】
train time: 233.82 [s]
train loss: 4.0581
train acc: 0.4856
train simple acc: 0.4258
