In [23]:
import csv
import cv2
import numpy as np
import random
import os
import pandas as pd
from tqdm import tqdm
from PIL import Image
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, models, transforms, utils
import torchvision

In [24]:
TRAIN_PATH = "./captcha-hacker-2023-spring/dataset/train"
TEST_PATH = "./captcha-hacker-2023-spring/dataset/test"
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [25]:
alphabets = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
alphabets2index = {alphabet:i for i, alphabet in enumerate(alphabets)}
alphabets_length = len(alphabets)


In [26]:
#one hot encode
def one_hot_encoding(alphabet):
    one_hot_vector = np.zeros(alphabets_length)
    idx = alphabets2index[alphabet]
    one_hot_vector[idx] = 1
    return one_hot_vector

In [27]:
##dataset1
class Task1Dataset(Dataset):
    def __init__(self, data, root, return_filename=False,mode = "train"):
        self.data = [sample for sample in data if sample[0].startswith("task1")]
        self.return_filename = return_filename
        self.root = root
        self.train_transform = transforms.Compose([
                transforms.Resize([224, 224]),
                #transforms.RandomResizedCrop(size = (224, 224),scale=(0.98, 1.0)),
                transforms.ToTensor()])
        self.test_transform = transforms.Compose([
                transforms.Resize([224, 224]),
                transforms.ToTensor()])
        
        self.mode = "train"
        
    def __getitem__(self, index):
        filename, label = self.data[index]
        img = Image.open(f"{self.root}/{filename}")
        
        if self.mode == "train":
            img = self.train_transform(img)
        else:
            img = self.test_transform(img)
        
        label_all = np.zeros(alphabets_length)
        for index, a in enumerate(label):
            label_all = one_hot_encoding(a)
                
        if self.return_filename:
            return img, filename
        else:
            return img, label_all
        
    def __len__(self):
        return len(self.data)
##dataset2
class Task2Dataset(Dataset):
    def __init__(self, data, root, return_filename=False,mode = "train"):
        self.data = [sample for sample in data if sample[0].startswith("task2")]
        self.return_filename = return_filename
        self.root = root
        self.train_transform = transforms.Compose([
                transforms.Resize([224, 224]),
                #transforms.RandomResizedCrop(size = (224, 224),scale=(0.98, 1.0)),
                transforms.ToTensor()])
        self.test_transform = transforms.Compose([
                transforms.Resize([224, 224]),
                transforms.ToTensor()])
        
        self.mode = "train"
        
    def __getitem__(self, index):
        filename, label = self.data[index]
        img = Image.open(f"{self.root}/{filename}")
        
        if self.mode == "train":
            img = self.train_transform(img)
        else:
            img = self.test_transform(img)
        
        label_all = np.zeros((alphabets_length,2))
        for index, a in enumerate(label):
            label_all[:,index] = one_hot_encoding(a)
                
        if self.return_filename:
            return img, filename
        else:
            return img, np.reshape(label_all,-1)
        
    def __len__(self):
        return len(self.data)

##dataset3
class Task3Dataset(Dataset):
    def __init__(self, data, root, return_filename=False,mode = "train"):
        self.data = [sample for sample in data if sample[0].startswith("task3")]
        self.return_filename = return_filename
        self.root = root
        self.train_transform = transforms.Compose([
                transforms.Resize([224, 224]),
                #transforms.RandomResizedCrop(size = (224, 224),scale=(0.98, 1.0)),
                transforms.ToTensor()])
        self.test_transform = transforms.Compose([
                transforms.Resize([224, 224]),
                transforms.ToTensor()])
        
        self.mode = "train"
        
    def __getitem__(self, index):
        filename, label = self.data[index]
        img = Image.open(f"{self.root}/{filename}")
        
        if self.mode == "train":
            img = self.train_transform(img)
        else:
            img = self.test_transform(img)
        
        label_all = np.zeros((alphabets_length,4))
        for index, a in enumerate(label):
            label_all[:,index] = one_hot_encoding(a)
                
        if self.return_filename:
            return img, filename
        else:
            return img, np.reshape(label_all,-1)
        
    def __len__(self):
        return len(self.data)

In [28]:
#hyperparameter
batch_size = 32

train_data = []
val_data = []

with open(f'{TRAIN_PATH}/annotations.csv', newline='') as csvfile:
    for row in csv.reader(csvfile, delimiter=','):
        if random.random() < 1.5:
            train_data.append(row)
        else:
            val_data.append(row)

train_ds = Task1Dataset(train_data, root=TRAIN_PATH, mode = "train")
train_dl = DataLoader(train_ds, batch_size=batch_size, drop_last=True, shuffle=True)

val_ds = Task1Dataset(val_data, root=TRAIN_PATH,mode = "val")
val_dl = DataLoader(val_ds, batch_size=1, drop_last=False, shuffle=False)

train_ds_2 = Task2Dataset(train_data, root=TRAIN_PATH, mode = "train")
train_dl_2 = DataLoader(train_ds_2, batch_size=batch_size, drop_last=True, shuffle=True)

val_ds_2 = Task2Dataset(val_data, root=TRAIN_PATH,mode = "val")
val_dl_2 = DataLoader(val_ds_2, batch_size=1, drop_last=False, shuffle=False)

train_ds_3 = Task3Dataset(train_data, root=TRAIN_PATH, mode = "train")
train_dl_3 = DataLoader(train_ds_3, batch_size=batch_size, drop_last=True, shuffle=True)

val_ds_3 = Task3Dataset(val_data, root=TRAIN_PATH,mode = "val")
val_dl_3 = DataLoader(val_ds_3, batch_size=1, drop_last=False, shuffle=False)

In [29]:
class ResNet(nn.Module):
    def __init__(self, Layer=18, word_count=1,Pretrained=True):
        super(ResNet, self).__init__()
        self.word_count = word_count
        
        if Layer==18:
            self.classify = nn.Linear(512, alphabets_length * word_count)
        if Layer==50:
            self.classify = nn.Linear(2048, alphabets_length * word_count)
        
        pretrained_model = torchvision.models.__dict__['resnet{}'.format(Layer)](pretrained=Pretrained)
        self.conv1 = pretrained_model._modules['conv1']
        self.bn1 = pretrained_model._modules['bn1']
        self.relu = pretrained_model._modules['relu']
        self.maxpool = pretrained_model._modules['maxpool']

        self.layer1 = pretrained_model._modules['layer1']
        self.layer2 = pretrained_model._modules['layer2']
        self.layer3 = pretrained_model._modules['layer3']
        self.layer4 = pretrained_model._modules['layer4']

        self.avgpool = nn.AdaptiveAvgPool2d(1)

        del pretrained_model

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        # print(x.shape)
        x = torch.flatten(x,start_dim=1)
        
        x = self.classify(x)

        return x

In [30]:
def set_paremeter_requires_grads(layer,is_required):
    if is_required:
        for param in layer.parameters():
            param.requires_grad = True
    else:
        for param in layer.parameters():
            param.requires_grad = False
            
def append_model_grad(model_grad,model):
    for param in model.parameters():
        if param.requires_grad == True:
            model_grad.append(param)

In [31]:

model = ResNet(Layer=50,word_count=1).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=7e-4)
loss_fn = nn.CrossEntropyLoss()

epochs = 300

min_loss = 999999999999999

for epoch in range(epochs):
    #pretrained
    ct = 0
    model_grad=[]
    
    for children in model.children():
        ct += 1
        if (ct > 2) & (epoch<10):
            set_paremeter_requires_grads(children,0)
        else:
            set_paremeter_requires_grads(children,1)   
            
    append_model_grad(model_grad,model)
        
    print(f"Epoch [{epoch}]")
    model.train()
    sum_loss = 0
    for image, label in train_dl:
        image = image.to(device)
        label = label.to(device)
        
        pred = model(image)
        loss = loss_fn(pred, label)
        sum_loss += loss  
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print("loss: ",sum_loss)
    if min_loss > sum_loss:
        min_loss = sum_loss
        torch.save(model.state_dict(), "Task1_model_all3")
'''        
    sample_count = 0
    correct_count = 0
    model.eval()
    
    for image, label in val_dl:
        image = image.to(device)
        label = label.to(device)
        
        pred = model(image)
        loss = loss_fn(pred, label)
        pred = torch.argmax(pred, dim=1)
        label_argmax = torch.argmax(label, dim=1)

        sample_count += len(image)
        correct_count += (label_argmax == pred)
        
    print("accuracy (validation):", correct_count / sample_count)
'''

Epoch [0]
loss:  tensor(263.9333, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
Epoch [1]
loss:  tensor(224.1207, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
Epoch [2]
loss:  tensor(187.9024, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
Epoch [3]
loss:  tensor(157.4590, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
Epoch [4]
loss:  tensor(128.9881, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
Epoch [5]
loss:  tensor(110.7874, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
Epoch [6]
loss:  tensor(91.2885, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
Epoch [7]
loss:  tensor(79.4662, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
Epoch [8]
loss:  tensor(69.6095, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
Epoch [9]
loss:  tensor(60.0956, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
Epoch [10]
loss:  tensor(52.2927, device='cu

'        \n    sample_count = 0\n    correct_count = 0\n    model.eval()\n    \n    for image, label in val_dl:\n        image = image.to(device)\n        label = label.to(device)\n        \n        pred = model(image)\n        loss = loss_fn(pred, label)\n        pred = torch.argmax(pred, dim=1)\n        label_argmax = torch.argmax(label, dim=1)\n\n        sample_count += len(image)\n        correct_count += (label_argmax == pred)\n        \n    print("accuracy (validation):", correct_count / sample_count)\n'

In [32]:

model_2 = ResNet(Layer=50,word_count=2).to(device)
optimizer = torch.optim.Adam(model_2.parameters(), lr=7e-4)
loss_fn = nn.MultiLabelSoftMarginLoss()

epochs = 300

min_loss = 999999999999999

for epoch in range(epochs):
    #pretrained
    ct = 0
    model_2_grad=[]
    
    for children in model_2.children():
        ct += 1
        if (ct > 2) & (epoch<10):
            set_paremeter_requires_grads(children,0)
        else:
            set_paremeter_requires_grads(children,1)   
            
    append_model_grad(model_2_grad,model_2)
        
    print(f"Epoch [{epoch}]")
    model_2.train()
    sum_loss = 0
    for image, label in train_dl_2:
        image = image.to(device)
        label = label.to(device)
        
        pred = model_2(image)
        loss = loss_fn(pred, label)
        sum_loss += loss  
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print("loss: ",sum_loss)
    if min_loss > sum_loss:
        min_loss = sum_loss
        torch.save(model_2.state_dict(), "Task2_model_all3")
'''    
    sample_count = 0
    correct_count = 0
    model_2.eval()
    
    for image, label in val_dl_2:
        image = image.to(device)
        label = label.to(device)
        
        pred = model_2(image)
        loss = loss_fn(pred, label)
        
        pred = torch.reshape(pred,(-1,2))
        label = torch.reshape(label,(-1,2))

        label_argmax = torch.argmax(label,dim = 0)
        pred_argmax = torch.argmax(pred,dim = 0)

        sample_count += len(image)
        correct_count += torch.equal(label_argmax,pred_argmax)
        
    print("accuracy (validation):", correct_count / sample_count)
'''

Epoch [0]
loss:  tensor(8.3506, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
Epoch [1]
loss:  tensor(6.4151, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
Epoch [2]
loss:  tensor(6.1616, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
Epoch [3]
loss:  tensor(5.8396, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
Epoch [4]
loss:  tensor(5.4959, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
Epoch [5]
loss:  tensor(5.1462, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
Epoch [6]
loss:  tensor(4.8552, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
Epoch [7]
loss:  tensor(4.6116, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
Epoch [8]
loss:  tensor(4.4101, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
Epoch [9]
loss:  tensor(4.1917, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
Epoch [10]
loss:  tensor(3.1795, device='cuda:0', dtype=torc

'    \n    sample_count = 0\n    correct_count = 0\n    model_2.eval()\n    \n    for image, label in val_dl_2:\n        image = image.to(device)\n        label = label.to(device)\n        \n        pred = model_2(image)\n        loss = loss_fn(pred, label)\n        \n        pred = torch.reshape(pred,(-1,2))\n        label = torch.reshape(label,(-1,2))\n\n        label_argmax = torch.argmax(label,dim = 0)\n        pred_argmax = torch.argmax(pred,dim = 0)\n\n        sample_count += len(image)\n        correct_count += torch.equal(label_argmax,pred_argmax)\n        \n    print("accuracy (validation):", correct_count / sample_count)\n'

In [33]:
model_3 = ResNet(Layer=50,word_count=4).to(device)
optimizer = torch.optim.Adam(model_3.parameters(), lr=7e-4)
loss_fn = nn.MultiLabelSoftMarginLoss()

epochs = 300

min_loss = 999999999999999

for epoch in range(epochs):
    #pretrained
    ct = 0
    model_3_grad=[]
    
    for children in model_3.children():
        ct += 1
        if (ct > 2) & (epoch<10):
            set_paremeter_requires_grads(children,0)
        else:
            set_paremeter_requires_grads(children,1)   
            
    append_model_grad(model_3_grad,model_3)
        
    print(f"Epoch [{epoch}]")
    model_3.train()
    sum_loss = 0
    for image, label in train_dl_3:
        image = image.to(device)
        label = label.to(device)
        pred = model_3(image)
        loss = loss_fn(pred, label)
        sum_loss += loss  
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print("loss: ",sum_loss)
    if min_loss > sum_loss:
        min_loss = sum_loss
        torch.save(model_3.state_dict(), "Task3_model_all3")
'''    
    sample_count = 0
    correct_count = 0
    model_3.eval()
    
    for image, label in val_dl_3:
        image = image.to(device)
        label = label.to(device)
        
        pred = model_3(image)
        loss = loss_fn(pred, label)
        
        pred = torch.reshape(pred,(-1,4))
        label = torch.reshape(label,(-1,4))

        label_argmax = torch.argmax(label,dim = 0)
        pred_argmax = torch.argmax(pred,dim = 0)

        sample_count += len(image)
        correct_count += torch.equal(label_argmax,pred_argmax)
        
    print("accuracy (validation):", correct_count / sample_count)
'''

Epoch [0]
loss:  tensor(12.0011, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
Epoch [1]
loss:  tensor(9.9670, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
Epoch [2]
loss:  tensor(9.6687, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
Epoch [3]
loss:  tensor(9.3591, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
Epoch [4]
loss:  tensor(9.0318, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
Epoch [5]
loss:  tensor(8.7730, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
Epoch [6]
loss:  tensor(8.5095, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
Epoch [7]
loss:  tensor(8.2898, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
Epoch [8]
loss:  tensor(8.0820, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
Epoch [9]
loss:  tensor(7.8825, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
Epoch [10]
loss:  tensor(6.6196, device='cuda:0', dtype=tor

'    \n    sample_count = 0\n    correct_count = 0\n    model_3.eval()\n    \n    for image, label in val_dl_3:\n        image = image.to(device)\n        label = label.to(device)\n        \n        pred = model_3(image)\n        loss = loss_fn(pred, label)\n        \n        pred = torch.reshape(pred,(-1,4))\n        label = torch.reshape(label,(-1,4))\n\n        label_argmax = torch.argmax(label,dim = 0)\n        pred_argmax = torch.argmax(pred,dim = 0)\n\n        sample_count += len(image)\n        correct_count += torch.equal(label_argmax,pred_argmax)\n        \n    print("accuracy (validation):", correct_count / sample_count)\n'