# **main.py**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import re
import random
import time
from statistics import mode

from PIL import Image
import numpy as np
import pandas
import torch
import torch.nn as nn
import torchvision
from torchvision import transforms
import torchvision.models as models

In [68]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import GradScaler, autocast
from torchvision import transforms
from torch.utils.data import DataLoader

from transformers import ViTModel, BertModel, BertTokenizer

import time
import gc
import numpy as np
from PIL import Image
import pandas as pd
from statistics import mode

In [69]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [10]:
import zipfile
import os

# アップロードしたzipファイルのパス
train_zip_file_path = 'data/train.zip'
valid_zip_file_path = 'data/valid.zip'

# 解凍先のディレクトリ
extract_dir = 'data'

# 解凍処理
with zipfile.ZipFile(train_zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)
with zipfile.ZipFile(valid_zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print(f"ファイルを {extract_dir} に解凍しました。")

ファイルを data に解凍しました。


In [3]:
!pip install nltk
!pip install pyspellchecker
# !pip install torch==1.11.0 torchvision==0.12.0 transformers==4.16.0

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: nltk
Successfully installed nltk-3.8.1
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting pyspellchecker
  Downloading pyspellchecker-0.8.1-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-

In [4]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
from spellchecker import SpellChecker
from nltk.corpus import stopwords

# スペルチェッカーとストップワードの準備
spell = SpellChecker()
stop_words = set(stopwords.words('english'))

def process_text(text):
    # lowercase
    text = text.lower()

    # 数詞を数字に変換
    num_word_to_digit = {
        r'\bzero\b': '0', r'\bone\b': '1', r'\btwo\b': '2', r'\bthree\b': '3', r'\bfour\b': '4',
        r'\bfive\b': '5', r'\bsix\b': '6', r'\bseven\b': '7', r'\beight\b': '8', r'\bnine\b': '9',
        r'\bten\b': '10'
    }
    for word, digit in num_word_to_digit.items():
        text = re.sub(word, digit, text)

    # 小数点のピリオドを削除
    text = re.sub(r'(?<!\d)\.(?!\d)', '', text)

    # 冠詞の削除
    text = re.sub(r'\b(a|an|the)\b', '', text)

    # 短縮形のカンマの追加
    contractions = {
        r'\bdont\b': "don't", r'\bisnt\b': "isn't", r'\barent\b': "aren't", r'\bwont\b': "won't",
        r'\bcant\b': "can't", r'\bwouldnt\b': "wouldn't", r'\bcouldnt\b': "couldn't"
    }
    for contraction, correct in contractions.items():
        text = re.sub(contraction, correct, text)

    # スペルチェックと修正
#     corrected_text = []
#     for word in text.split():
#         corrected_word = spell.correction(word)
#         if corrected_word is None:
#             corrected_word = word
#         corrected_text.append(corrected_word)
#     text = ' '.join(corrected_text)

    # 句読点をスペースに変換
    text = re.sub(r"[^\w\s':]", ' ', text)

    # ストップワードの削除
    text = ' '.join([word for word in text.split() if word not in stop_words])

    # 連続するスペースを1つに変換
    text = re.sub(r'\s+', ' ', text).strip()

    return text


## 1. データローダーの作成

In [6]:
class VQADataset(torch.utils.data.Dataset):
    def __init__(self, df_path, image_dir, transform=None, answer=True):
        self.transform = transform  # 画像の前処理
        self.image_dir = image_dir  # 画像ファイルのディレクトリ
        self.df = pandas.read_json(df_path)  # 画像ファイルのパス，question, answerを持つDataFrame
        self.answer = answer

        # question / answerの辞書を作成
        self.question2idx = {}
        self.answer2idx = {}
        self.idx2question = {}
        self.idx2answer = {}

        # 質問文に含まれる単語を辞書に追加
        for question in self.df["question"]:
            question = process_text(question)
            words = question.split(" ")
            for word in words:
                if word not in self.question2idx:
                    self.question2idx[word] = len(self.question2idx)
        self.idx2question = {v: k for k, v in self.question2idx.items()}  # 逆変換用の辞書(question)

        if self.answer:
            # 回答に含まれる単語を辞書に追加
            for answers in self.df["answers"]:
                for answer in answers:
                    word = answer["answer"]
                    word = process_text(word)
                    if word not in self.answer2idx:
                        self.answer2idx[word] = len(self.answer2idx)
            self.idx2answer = {v: k for k, v in self.answer2idx.items()}  # 逆変換用の辞書(answer)

    def update_dict(self, dataset):
        """
        検証用データ，テストデータの辞書を訓練データの辞書に更新する．

        Parameters
        ----------
        dataset : Dataset
            訓練データのDataset
        """
        self.question2idx = dataset.question2idx
        self.answer2idx = dataset.answer2idx
        self.idx2question = dataset.idx2question
        self.idx2answer = dataset.idx2answer

    def __getitem__(self, idx):
        """
        対応するidxのデータ（画像，質問，回答）を取得．

        Parameters
        ----------
        idx : int
            取得するデータのインデックス

        Returns
        -------
        image : torch.Tensor  (C, H, W)
            画像データ
        question : torch.Tensor  (vocab_size)
            質問文をone-hot表現に変換したもの
        answers : torch.Tensor  (n_answer)
            10人の回答者の回答のid
        mode_answer_idx : torch.Tensor  (1)
            10人の回答者の回答の中で最頻値の回答のid
        """
        image = Image.open(f"{self.image_dir}/{self.df['image'][idx]}")
        image = self.transform(image)
        question = np.zeros(len(self.idx2question) + 1)  # 未知語用の要素を追加
        question_words = self.df["question"][idx].split(" ")
        for word in question_words:
            try:
                question[self.question2idx[word]] = 1  # one-hot表現に変換
            except KeyError:
                question[-1] = 1  # 未知語

        if self.answer:
            answers = [self.answer2idx[process_text(answer["answer"])] for answer in self.df["answers"][idx]]
            mode_answer_idx = mode(answers)  # 最頻値を取得（正解ラベル）

            return image, torch.Tensor(question), torch.Tensor(answers), int(mode_answer_idx)

        else:
            return image, torch.Tensor(question)

    def __len__(self):
        return len(self.df)

### 改善版の場合

In [53]:
from transformers import BertTokenizer
import torch
from PIL import Image
import pandas as pd
import numpy as np
from statistics import mode

class VQADataset(torch.utils.data.Dataset):
    def __init__(self, df_path, image_dir, transform=None, answer=True, max_length=128):
        self.transform = transform
        self.image_dir = image_dir
        self.df = pd.read_json(df_path)
        self.answer = answer
        self.max_length = max_length

        # BERTトークナイザーの初期化
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        # answerの辞書を作成
        self.answer2idx = {}
        self.idx2answer = {}

        if self.answer:
            for answers in self.df["answers"]:
                for answer in answers:
                    word = answer["answer"]
                    word = process_text(word)
                    if word not in self.answer2idx:
                        self.answer2idx[word] = len(self.answer2idx)
            self.idx2answer = {v: k for k, v in self.answer2idx.items()}

    def update_dict(self, dataset):
        self.answer2idx = dataset.answer2idx
        self.idx2answer = dataset.idx2answer

    def __getitem__(self, idx):
        image = Image.open(f"{self.image_dir}/{self.df['image'][idx]}")
        image = self.transform(image) if self.transform else image

        question = self.df["question"][idx]
        question = process_text(question)
    
        # BERTトークナイザーを使用して質問をエンコード
        question_encoded = self.tokenizer(question, 
                                          padding='max_length', 
                                          max_length=self.max_length, 
                                          truncation=True, 
                                          return_tensors="pt")
    
        # テンソルをスクイーズして余分な次元を削除
        question_encoded = {k: v.squeeze(0) for k, v in question_encoded.items()}

        if self.answer:
            answers = [self.answer2idx[process_text(answer["answer"])] for answer in self.df["answers"][idx]]
            mode_answer_idx = mode(answers)
            return image, question_encoded, torch.LongTensor(answers), torch.tensor(int(mode_answer_idx))
        else:
            return image, question_encoded

    def __len__(self):
        return len(self.df)

## 2. 評価指標の実装

In [54]:
# 簡単にするならBCEを利用する
def VQA_criterion(batch_pred: torch.Tensor, batch_answers: torch.Tensor):
    total_acc = 0.

    for pred, answers in zip(batch_pred, batch_answers):
        acc = 0.
        for i in range(len(answers)):
            num_match = 0
            for j in range(len(answers)):
                if i == j:
                    continue
                if pred == answers[j]:
                    num_match += 1
            acc += min(num_match / 3, 1)
        total_acc += acc / 10

    return total_acc / len(batch_pred)

## 3. モデルの実装

In [16]:
class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_channels: int, out_channels: int, stride: int = 1):
        super().__init__()

        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride),
                nn.BatchNorm2d(out_channels)
            )

    def forward(self, x):
        residual = x
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        out += self.shortcut(residual)
        out = self.relu(out)

        return out


class BottleneckBlock(nn.Module):
    expansion = 4

    def __init__(self, in_channels: int, out_channels: int, stride: int = 1):
        super().__init__()

        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.conv3 = nn.Conv2d(out_channels, out_channels * self.expansion, kernel_size=1, stride=1)
        self.bn3 = nn.BatchNorm2d(out_channels * self.expansion)
        self.relu = nn.ReLU(inplace=True)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != out_channels * self.expansion:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels * self.expansion, kernel_size=1, stride=stride),
                nn.BatchNorm2d(out_channels * self.expansion)
            )

    def forward(self, x):
        residual = x
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        out += self.shortcut(residual)
        out = self.relu(out)

        return out


class ResNet(nn.Module):
    def __init__(self, block, layers):
        super().__init__()
        self.in_channels = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layer1 = self._make_layer(block, layers[0], 64)
        self.layer2 = self._make_layer(block, layers[1], 128, stride=2)
        self.layer3 = self._make_layer(block, layers[2], 256, stride=2)
        self.layer4 = self._make_layer(block, layers[3], 512, stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, 512)

    def _make_layer(self, block, blocks, out_channels, stride=1):
        layers = []
        layers.append(block(self.in_channels, out_channels, stride))
        self.in_channels = out_channels * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.in_channels, out_channels))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.relu(self.bn1(self.conv1(x)))
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x


def ResNet18():
    return ResNet(BasicBlock, [2, 2, 2, 2])


def ResNet50():
    return ResNet(BottleneckBlock, [3, 4, 6, 3])


class VQAModel(nn.Module):
    def __init__(self, vocab_size: int, n_answer: int):
        super().__init__()
        self.resnet = ResNet50()
        self.text_encoder = nn.Linear(vocab_size, 512)

        self.fc = nn.Sequential(
            nn.Linear(1024, 512),
            nn.ReLU(inplace=True),
            nn.Linear(512, n_answer)
        )

    def forward(self, image, question):
        image_feature = self.resnet(image)  # 画像の特徴量
        question_feature = self.text_encoder(question)  # テキストの特徴量

        x = torch.cat([image_feature, question_feature], dim=1)
        x = self.fc(x)

        return x

In [16]:
import torch
import torch.nn as nn
import torchvision.models as models

class VQARes101Model(nn.Module):
    def __init__(self, vocab_size: int, n_answer: int):
        super().__init__()
        # 画像特徴抽出モデル（ResNet101に変更）
        self.resnet = models.resnet101(weights=models.ResNet101_Weights.IMAGENET1K_V1)
        self.resnet.fc = nn.Sequential(
            nn.Linear(self.resnet.fc.in_features, 512)
        )
        
        self.text_encoder = nn.Linear(vocab_size, 512)

        self.fc = nn.Sequential(
            nn.Linear(1024, 512),
            nn.ReLU(inplace=True),
            nn.Linear(512, n_answer)
        )

    def forward(self, image, question):
        image_feature = self.resnet(image)  # 画像の特徴量
        question_feature = self.text_encoder(question)  # テキストの特徴量

        x = torch.cat([image_feature, question_feature], dim=1)
        x = self.fc(x)

        return x

In [63]:
import torch
import torch.nn as nn
from transformers import ViTModel, BertModel

class ImprovedVQAModel(nn.Module):
    def __init__(self, n_answer):
        super().__init__()
        self.vit = ViTModel.from_pretrained('google/vit-base-patch16-224', add_pooling_layer=False)
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        
        # ViTとBERTの出力サイズを取得
        self.vit_dim = self.vit.config.hidden_size
        self.bert_dim = self.bert.config.hidden_size
        
        # 特徴量のサイズを調整するための線形層
        self.vit_proj = nn.Linear(self.vit_dim, 768)
        self.bert_proj = nn.Linear(self.bert_dim, 768)
        
        # Fusion and classification layers
        self.fusion = nn.MultiheadAttention(768, 8)
        self.classifier = nn.Linear(768, n_answer)

    def forward(self, image, question_encoded):
        # Image encoding
        image_features = self.vit(image).last_hidden_state
        image_features = self.vit_proj(image_features)
        
        # Question encoding
        question_features = self.bert(**{k: v.squeeze(1) for k, v in question_encoded.items()}).last_hidden_state
        question_features = self.bert_proj(question_features)
        
        # Ensure the sequence length is the same for both features
        max_len = max(image_features.size(1), question_features.size(1))
        image_features = F.pad(image_features, (0, 0, 0, max_len - image_features.size(1)))
        question_features = F.pad(question_features, (0, 0, 0, max_len - question_features.size(1)))
        
        # Fusion
        fused_features, _ = self.fusion(image_features, question_features, question_features)
        
        # Classification
        output = self.classifier(fused_features.mean(dim=1))
        
        return output

## 4. 学習の実装

In [9]:
import time
import torch
import gc
from torch.cuda.amp import GradScaler, autocast

scaler = GradScaler()

def train(model, dataloader, optimizer, criterion, device):
    model.train()

    total_loss = 0
    total_acc = 0
    simple_acc = 0
    n = 0

    start = time.time()
    for image, question, answers, mode_answer in dataloader:
        image, question, answer, mode_answer = \
            image.to(device), question.to(device), answers.to(device), mode_answer.to(device)

        optimizer.zero_grad()

        with autocast():
            pred = model(image, question)
            loss = criterion(pred, mode_answer.squeeze())

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()
        total_acc += VQA_criterion(pred.argmax(1), answers)  # VQA accuracy
        simple_acc += (pred.argmax(1) == mode_answer).float().mean().item()  # simple accuracy
        n += 1
        print(f" [{n}/{len(dataloader)}] ", end="\r")

        # メモリをクリア
        torch.cuda.empty_cache()
        gc.collect()

    return total_loss / len(dataloader), total_acc / len(dataloader), simple_acc / len(dataloader), time.time() - start


def eval(model, dataloader, criterion, device):
    model.eval()

    total_loss = 0
    total_acc = 0
    simple_acc = 0

    start = time.time()
    with torch.no_grad():
        for image, question, answers, mode_answer in dataloader:
            image, question, answer, mode_answer = \
                image.to(device), question.to(device), answers.to(device), mode_answer.to(device)

            with autocast():
                pred = model(image, question)
                loss = criterion(pred, mode_answer.squeeze())

            total_loss += loss.item()
            total_acc += VQA_criterion(pred.argmax(1), answers)  # VQA accuracy
            simple_acc += (pred.argmax(1) == mode_answer).mean().item()  # simple accuracy

            # メモリをクリア
            torch.cuda.empty_cache()
            gc.collect()

    return total_loss / len(dataloader), total_acc / len(dataloader), simple_acc / len(dataloader), time.time() - start


In [64]:
# 改善版

import time
import torch
import gc
from torch.cuda.amp import GradScaler, autocast

scaler = GradScaler()

def train(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    total_acc = 0
    simple_acc = 0
    n = 0
    start = time.time()
    for image, question_encoded, answers, mode_answer in dataloader:
        image = image.to(device)
        question_encoded = {k: v.to(device) for k, v in question_encoded.items()}
        answers = answers.to(device)
        mode_answer = mode_answer.to(device)

        optimizer.zero_grad()
        with autocast():
            # デバッグ情報
            print(f"Image shape: {image.shape}")
            print(f"Question encoded shapes: {[(k, v.shape) for k, v in question_encoded.items()]}")
            
            pred = model(image, question_encoded)
            
            # デバッグ情報
            print(f"Pred shape: {pred.shape}")
            print(f"Mode answer shape: {mode_answer.shape}")
            
            loss = criterion(pred, mode_answer)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()
        total_acc += VQA_criterion(pred.argmax(1), answers)  # VQA accuracy
        simple_acc += (pred.argmax(1) == mode_answer).float().mean().item()  # simple accuracy
        n += 1
        print(f" [{n}/{len(dataloader)}] ", end="\r")

        # メモリをクリア
        torch.cuda.empty_cache()
        gc.collect()

    return total_loss / len(dataloader), total_acc / len(dataloader), simple_acc / len(dataloader), time.time() - start

def eval(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    total_acc = 0
    simple_acc = 0
    start = time.time()
    with torch.no_grad():
        for image, question_encoded, answers, mode_answer in dataloader:
            image = image.to(device)
            question_encoded = {k: v.to(device) for k, v in question_encoded.items()}
            answers = answers.to(device)
            mode_answer = mode_answer.to(device)

            with autocast():
                pred = model(image, question_encoded)
                loss = criterion(pred, mode_answer)

            total_loss += loss.item()
            total_acc += VQA_criterion(pred.argmax(1), answers)  # VQA accuracy
            simple_acc += (pred.argmax(1) == mode_answer).mean().item()  # simple accuracy

            # メモリをクリア
            torch.cuda.empty_cache()
            gc.collect()

    return total_loss / len(dataloader), total_acc / len(dataloader), simple_acc / len(dataloader), time.time() - start

In [50]:
# deviceの設定
set_seed(42)
device = "cuda" if torch.cuda.is_available() else "cpu"

# dataloader / model
# 画像のデータ拡張
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),  # ランダムにリサイズしてクロップ
    transforms.RandomHorizontalFlip(),  # 水平方向にランダムに反転
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),  # カラージッタ
    transforms.RandomRotation(10),  # ランダムに回転
    transforms.ToTensor(),  # テンソルに変換
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # 正規化
])

test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

train_dataset = VQADataset(df_path="./data/train.json", image_dir="./data/train", transform=train_transform)
test_dataset = VQADataset(df_path="./data/valid.json", image_dir="./data/valid", transform=test_transform, answer=False)
test_dataset.update_dict(train_dataset)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False)

In [65]:
#改善版

# deviceの設定
set_seed(42)
device = "cuda" if torch.cuda.is_available() else "cpu"

# 画像のデータ拡張
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),  # ランダムにリサイズしてクロップ
    transforms.RandomHorizontalFlip(),  # 水平方向にランダムに反転
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),  # カラージッタ
    transforms.RandomRotation(10),  # ランダムに回転
    transforms.ToTensor(),  # テンソルに変換
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # 正規化
])

test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# データセットの初期化
train_dataset = VQADataset(df_path="./data/train.json", image_dir="./data/train", transform=train_transform)
test_dataset = VQADataset(df_path="./data/valid.json", image_dir="./data/valid", transform=test_transform, answer=False)
test_dataset.update_dict(train_dataset)

# データローダーの初期化
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=4)

# モデルの初期化
model = ImprovedVQAModel(n_answer=len(train_dataset.answer2idx)).to(device)

In [66]:
# model = VQAModel(vocab_size=len(train_dataset.question2idx)+1, n_answer=len(train_dataset.answer2idx)).to(device)
# model = VQARes101Model(vocab_size=len(train_dataset.question2idx)+1, n_answer=len(train_dataset.answer2idx)).to(device)
# model = ImprovedVQAModel(vocab_size=len(train_dataset.question2idx)+1, n_answer=len(train_dataset.answer2idx)).to(device)

# optimizer / criterion
num_epoch = 20
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

In [37]:
# train model
for epoch in range(num_epoch):
    train_loss, train_acc, train_simple_acc, train_time = train(model, train_loader, optimizer, criterion, device)
    print(f"【{epoch + 1}/{num_epoch}】\n"
            f"train time: {train_time:.2f} [s]\n"
            f"train loss: {train_loss:.4f}\n"
            f"train acc: {train_acc:.4f}\n"
            f"train simple acc: {train_simple_acc:.4f}")

# 提出用ファイルの作成
model.eval()
submission = []
for image, question in test_loader:
    image, question = image.to(device), question.to(device)
    pred = model(image, question)
    pred = pred.argmax(1).cpu().item()
    submission.append(pred)

submission = [train_dataset.idx2answer[id] for id in submission]
submission = np.array(submission)
torch.save(model.state_dict(), "model.pth")
np.save("submission.npy", submission)

ValueError: too many values to unpack (expected 2)

In [70]:
# 改善版
# train model
for epoch in range(num_epoch):
    train_loss, train_acc, train_simple_acc, train_time = train(model, train_loader, optimizer, criterion, device)
    print(f"【{epoch + 1}/{num_epoch}】\n"
            f"train time: {train_time:.2f} [s]\n"
            f"train loss: {train_loss:.4f}\n"
            f"train acc: {train_acc:.4f}\n"
            f"train simple acc: {train_simple_acc:.4f}")

# 提出用ファイルの作成
model.eval()
submission = []
with torch.no_grad():
    for image, question_encoded in test_loader:
        image = image.to(device)
        question_encoded = {k: v.to(device) for k, v in question_encoded.items()}
        
        pred = model(image, question_encoded)
        pred = pred.argmax(1).cpu().tolist()  # バッチ処理に対応
        submission.extend(pred)

submission = [train_dataset.idx2answer[id] for id in submission]
submission = np.array(submission)
torch.save(model.state_dict(), "model.pth")
np.save("submission.npy", submission)

Image shape: torch.Size([32, 3, 224, 224])
Question encoded shapes: [('input_ids', torch.Size([32, 128])), ('token_type_ids', torch.Size([32, 128])), ('attention_mask', torch.Size([32, 128]))]


OutOfMemoryError: CUDA out of memory. Tried to allocate 30.00 MiB. GPU 0 has a total capacty of 14.58 GiB of which 17.56 MiB is free. Process 1804 has 14.56 GiB memory in use. Of the allocated memory 14.02 GiB is allocated by PyTorch, and 387.82 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF