In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
import torch.optim as optim
import cv2
import os
import gc
from sklearn.model_selection import train_test_split
from tqdm import tqdm

## Caption Generator Model

In [2]:
class Encoder(nn.Module):
    """Image encoder built on a pretrained ResNet-152.

    The network's classification head (``fc``) is replaced by a linear layer
    producing ``embed_size`` features. Only that new head is trainable; every
    other ResNet parameter is frozen.
    """

    def __init__(self, embed_size):
        super().__init__()
        backbone = models.resnet152(weights=models.ResNet152_Weights.DEFAULT)
        backbone.fc = nn.Linear(backbone.fc.in_features, embed_size)
        self.resnet = backbone
        self.relu = nn.ReLU()
        # Registered on the module but not applied in forward().
        self.dropout = nn.Dropout(0.5)

        # Freeze everything except the replacement head.
        for param_name, param in self.resnet.named_parameters():
            param.requires_grad = "fc" in param_name

    def forward(self, x):
        """Return the ReLU-activated ``embed_size`` features of ``x``."""
        return self.relu(self.resnet(x))


class Decoder(nn.Module):
    """Predicts the next caption token from image features and a token prefix.

    The token prefix is embedded, run through an LSTM, and the LSTM output at
    the last time step is added element-wise to the image features. The sum is
    passed through a hidden layer and projected to vocabulary logits.

    Args:
        embed_size: Size of the token embeddings and of the image feature
            vector supplied to ``forward``.
        vocab_size: Number of tokens; also the width of the returned logits.
        hidden_size: LSTM hidden size. Because the LSTM output is added to the
            image features and then fed to a layer expecting ``embed_size``
            inputs, this has to be compatible with ``embed_size``.
    """

    def __init__(self, embed_size, vocab_size, hidden_size):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # batch_first=True: token batches arrive as (batch, seq_len), so the
        # embeddings are (batch, seq_len, embed). With the default layout the
        # LSTM would treat the batch axis as time and leak state between the
        # samples of a batch.
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(embed_size, embed_size)
        self.output = nn.Linear(embed_size, vocab_size)
        self.relu = nn.ReLU()

    def forward(self, x, y_in):
        """Return next-token logits of shape (batch, vocab_size).

        Args:
            x: Image features, one row per sample.
            y_in: Integer token ids of shape (batch, seq_len).
        """
        embeddings = self.dropout(self.embedding(y_in))
        outputs, _ = self.lstm(embeddings)
        # Output at the final time step of every sequence.
        last_timestep_output = outputs[:, -1]
        sum_vector = x + last_timestep_output
        x = self.dropout(self.relu(self.fc(sum_vector)))
        return self.output(x)



class CaptionGenerator(nn.Module):
    """Encoder/decoder pair that scores the next caption token for an image.

    The constructor arguments are forwarded to ``Encoder`` (``embed_size``)
    and ``Decoder`` (all three).
    """

    def __init__(self, embed_size, vocab_size, hidden_size):
        super().__init__()
        self.encoder = Encoder(embed_size)
        self.decoder = Decoder(embed_size, vocab_size, hidden_size)

    def forward(self, image, y_in):
        """Encode ``image`` and decode it together with the prefix ``y_in``."""
        return self.decoder(self.encoder(image), y_in)


In [3]:
# Length to which every caption prefix is left-padded with zeros in
# generate_y_input_output.
MAX_LEN = 34
# Vocabulary size handed to the model: number of embedding rows and width of
# the decoder's output layer.
VOCAB_SIZE = 8767

In [4]:
# embed_size and hidden_size are equal because Decoder.forward adds the encoder
# features to the LSTM output element-wise.
model = CaptionGenerator(embed_size=256,
                         vocab_size=VOCAB_SIZE,
                         hidden_size=256)

## Veri setinin yuklenmesi

In [5]:
# X entries are later used as indices into image_list; y rows are the token
# sequences that generate_y_input_output expands into training samples.
# NOTE(review): torch.load unpickles its input - only load files from a trusted source.
X = torch.load('X.pt')
y = torch.load('y.pt')

In [6]:
# Sanity check: X and y should have the same number of rows.
X.shape, y.shape

(torch.Size([40455]), torch.Size([40455, 34]))

In [7]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
# NOTE(review): rows are split individually - if several rows share the same
# image index, one image can end up in both splits; consider a grouped split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Sanity check: shapes of the training and held-out splits.
(X_train.shape, y_train.shape), (X_test.shape, y_test.shape)

((torch.Size([32364]), torch.Size([32364, 34])),
 (torch.Size([8091]), torch.Size([8091, 34])))

In [9]:
# One image filename per line. Blank lines are kept as empty strings so that
# list positions stay aligned with the indices used to look images up.
image_list = []
with open('image_list.txt', 'r') as f:
    image_list.extend(line.rstrip('\n') for line in f)

## Raw veri setinin evrilmesinde yardimci olacak fonksiyonlar

In [10]:
def load_img(filename):
    """Load ``Images/<filename>`` and return it as a channel-first tensor.

    The image is read with OpenCV, converted from BGR to RGB, resized to
    224x224 and divided by 255.

    Args:
        filename: Name of the image file inside the ``Images`` directory.

    Returns:
        A float64 tensor of shape (3, 224, 224).

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the file exists but OpenCV cannot decode it.
    """
    file_path = os.path.join('Images', filename)
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f'Image file not found: {file_path}')

    img = cv2.imread(file_path)
    if img is None:
        # cv2.imread reports an unreadable or corrupt file by returning None
        # instead of raising; fail here with a clear message.
        raise ValueError(f'Could not decode image: {file_path}')

    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (224, 224))
    img = img / 255  # scale pixel values; the division yields float64

    # NOTE(review): the pretrained ResNet weights are documented to expect
    # ImageNet mean/std normalization, which is not applied here - confirm
    # whether that is intentional.
    # On a float array ToTensor only reorders HWC -> CHW; it does not rescale.
    return transforms.ToTensor()(img)


In [11]:
def generate_y_input_output(x1, x2):
    """Expand one (image index, caption) pair into next-token training samples.

    Starting at the first occurrence of token ``1`` in the caption, every
    prefix becomes one sample: the prefix, left-padded with zeros to
    ``MAX_LEN``, is the input and the token following it is the target.

    Args:
        x1: Index into ``image_list`` selecting the image.
        x2: 1-D tensor of token ids for the caption.

    Returns:
        ``(x, y_in, y_out)``: a list with the image tensor repeated once per
        sample, the list of padded prefixes, and the list of target tokens.

    Raises:
        ValueError: If the caption does not contain token ``1``.
    """
    tokens = x2.tolist()
    start_token_index = tokens.index(1)

    y_in = []
    y_out = []
    for i in range(start_token_index + 1, len(tokens)):
        prefix = tokens[start_token_index:i]
        # A non-positive pad length yields no padding, as in list repetition.
        y_in.append([0] * (MAX_LEN - len(prefix)) + prefix)
        y_out.append(tokens[i])

    if not y_out:
        # No samples, so the image is never needed.
        return [], [], []

    # Every sample of this caption uses the same image: read and decode it
    # once instead of once per prefix.
    image = load_img(image_list[x1])
    x = [image] * len(y_out)

    return x, y_in, y_out


In [12]:
def generate_batch_y_input_output(x1, x2, device):
    """Build model-ready tensors for a batch of (image index, caption) pairs.

    Each pair is expanded with ``generate_y_input_output`` and the resulting
    samples are concatenated in order.

    Args:
        x1: Iterable of image indices.
        x2: Iterable of caption token tensors, aligned with ``x1``.
        device: Device the returned tensors are placed on.

    Returns:
        ``(x, y_in, y_out)``: float32 image tensor stacked along a new first
        axis, tensor of padded prefixes, and tensor of target tokens.
    """
    images = []
    y_in = []
    y_out = []

    for _x1, _x2 in zip(x1, x2):
        _x, _y_in, _y_out = generate_y_input_output(_x1, _x2)
        # extend() appends in place; repeated `a = a + b` re-copies the lists.
        images.extend(_x)
        y_in.extend(_y_in)
        y_out.extend(_y_out)

    if images:
        # Stack the tensors directly rather than round-tripping through NumPy.
        x = torch.stack(images).to(device=device, dtype=torch.float32)
    else:
        # torch.stack rejects an empty list; keep returning an empty tensor.
        x = torch.empty(0, dtype=torch.float32, device=device)
    y_in = torch.tensor(y_in).to(device)
    y_out = torch.tensor(y_out).to(device)

    return x, y_in, y_out

## Model Egitimini yapacak fonksiyon

In [13]:
def evaluate(model, test_loader, device):
    """Return the model's average cross-entropy loss over ``test_loader``.

    The model is switched to eval mode and gradients are disabled. The result
    is the unweighted mean of the per-batch losses.

    Args:
        model: Callable as ``model(x, y_in)`` returning logits.
        test_loader: Iterable of ``(image indices, captions)`` batches.
        device: Device on which the batch tensors are created.
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        print("Evaluating...")

        for image_ids, captions in tqdm(test_loader):
            images, prefixes, targets = generate_batch_y_input_output(
                image_ids, captions, device
            )
            logits = model(images, prefixes)
            batch_loss = nn.functional.cross_entropy(logits, targets)
            batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(batch_losses)


In [14]:
def train(model, 
          X_train, y_train,
          X_val, y_val,
          epochs=5, 
          batch_size=8,
          device=torch.device('cpu'),
          learning_rate=1e-5):
    """Train ``model`` with SGD and validate it after every epoch.

    Each batch of (image index, caption) pairs is expanded into next-token
    samples by ``generate_batch_y_input_output`` and optimized with
    cross-entropy loss. After each epoch the whole model object is saved to
    ``model.pt``.

    Args:
        model: Module called as ``model(x, y_in)``.
        X_train, y_train: Training image indices and caption tensors.
        X_val, y_val: Validation image indices and caption tensors.
        epochs: Number of passes over the training data.
        batch_size: Number of (image, caption) pairs per loader batch.
        device: Device used for the model and the batch tensors.
        learning_rate: SGD learning rate.

    Returns:
        ``(average_losses, validation_losses)``: two lists with one entry per
        epoch - the mean training batch loss and the validation loss.
    """
    train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True)

    # Validation data is not shuffled: evaluate() averages per-batch losses,
    # so a fixed batch composition makes the metric reproducible for a given
    # model state.
    test_dataset = torch.utils.data.TensorDataset(X_val, y_val)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=batch_size,
                                              shuffle=False)

    # Move the model to its device before building the optimizer, as the
    # torch.optim documentation recommends.
    model = model.to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    average_losses = []  # the average losses of each epoch
    validation_losses = []  # the validation losses after each epoch
    for epoch in range(epochs):

        # evaluate() leaves the model in eval mode; switch back every epoch.
        model.train()

        print(f'Epoch {epoch+1}')
        print('---------------')

        losses = []
        for x1, x2 in tqdm(train_loader):
            gc.collect()

            x, y_in, y_out = generate_batch_y_input_output(x1, x2, device)
            outputs = model(x, y_in)
            loss = nn.functional.cross_entropy(outputs, y_out)
            losses.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        validation_loss = evaluate(model, test_loader, device)
        validation_losses.append(validation_loss)
        average_loss = sum(losses) / len(losses)
        average_losses.append(average_loss)
        print(f'The end of epoch {epoch+1}, the average loss is {average_loss}, the validation loss is {validation_loss}')

        # saving model in each epoch
        torch.save(model, 'model.pt')

    return average_losses, validation_losses


## Modelin Egitildigi yer

In [15]:
# Train for a single epoch on the CPU, validating on the held-out split.
# To train on a GPU, swap the two device lines below.
average_losses, validation_losses = train(
    model,
    X_train, y_train, X_test, y_test,
    epochs=1,
    batch_size=8,
    # device=torch.device('cuda'),
    device=torch.device('cpu'),
    learning_rate=0.01
)

Epoch 1
---------------


  0%|          | 1/4046 [00:24<28:03:12, 24.97s/it]

torch.Size([71, 256])
tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.1694, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.1694, 0.0000],
        [0.0000, 0.0000, 0.4217,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.5084, 0.4316, 0.2372,  ..., 0.2734, 0.5556, 0.0000],
        [0.5084, 0.0000, 0.2372,  ..., 0.2734, 0.5556, 0.0000],
        [0.5084, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0279]],
       grad_fn=<MulBackward0>)


  0%|          | 2/4046 [00:48<27:15:54, 24.27s/it]

torch.Size([69, 256])
tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.4447],
        [0.0000, 0.0000, 0.0000,  ..., 0.8624, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.8624, 0.0000, 0.4447],
        ...,
        [0.5795, 0.0000, 0.0000,  ..., 0.0000, 0.5903, 0.0000],
        [0.5795, 0.0000, 0.8256,  ..., 0.0000, 0.5903, 0.0000],
        [0.0000, 0.0000, 0.8256,  ..., 0.0000, 0.0000, 0.0000]],
       grad_fn=<MulBackward0>)


  0%|          | 3/4046 [01:22<32:16:17, 28.74s/it]

torch.Size([94, 256])
tensor([[0.5623, 0.1360, 0.2896,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.1360, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0984, 0.0000, 0.7667],
        [0.0000, 0.0000, 0.0000,  ..., 0.0984, 0.0000, 0.7667],
        [0.0000, 0.0000, 0.0000,  ..., 0.0984, 0.0000, 0.7667]],
       grad_fn=<MulBackward0>)


  0%|          | 4/4046 [01:52<32:54:36, 29.31s/it]

torch.Size([77, 256])
tensor([[0.3015, 0.0000, 0.1195,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.1195,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.1195,  ..., 0.1888, 0.0000, 0.0000],
        ...,
        [0.3476, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.3836],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.3476, 0.0000, 0.0000,  ..., 0.3992, 0.0000, 0.3836]],
       grad_fn=<MulBackward0>)


  0%|          | 5/4046 [02:26<34:28:13, 30.71s/it]

torch.Size([87, 256])
tensor([[0.3366, 0.0000, 0.6681,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.2436, 0.0000,  ..., 0.1632, 0.0000, 0.4869],
        [0.3366, 0.2436, 0.6681,  ..., 0.0000, 0.4835, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.4985, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       grad_fn=<MulBackward0>)


  0%|          | 6/4046 [02:53<33:07:18, 29.51s/it]

torch.Size([68, 256])
tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.3841],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.3841],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.3695, 0.0000, 0.0000,  ..., 0.1277, 0.0000, 0.1404],
        [0.3695, 0.0000, 0.0000,  ..., 0.1277, 0.0000, 0.0000],
        [0.3695, 0.0000, 0.0000,  ..., 0.1277, 0.0000, 0.1404]],
       grad_fn=<MulBackward0>)


  0%|          | 7/4046 [03:23<33:15:51, 29.65s/it]

torch.Size([82, 256])
tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.2308, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.2192, 0.2308, 0.0000],
        [0.0000, 0.0000, 0.1474,  ..., 0.0000, 0.2308, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.2993, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.2993, 0.0000]],
       grad_fn=<MulBackward0>)


  0%|          | 8/4046 [03:52<33:12:52, 29.61s/it]

torch.Size([81, 256])
tensor([[0.4558, 0.6381, 0.4634,  ..., 0.0000, 0.0000, 0.0094],
        [0.4558, 0.0000, 0.0000,  ..., 0.0000, 0.6874, 0.0000],
        [0.4558, 0.6381, 0.0000,  ..., 0.0000, 0.6874, 0.0094],
        ...,
        [0.0000, 0.0000, 0.4256,  ..., 0.0000, 0.0000, 0.0000],
        [0.3046, 0.1318, 0.4256,  ..., 0.0000, 0.0000, 0.2022],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       grad_fn=<MulBackward0>)


  0%|          | 9/4046 [04:21<32:54:32, 29.35s/it]

torch.Size([77, 256])
tensor([[1.0315, 0.5775, 0.0000,  ..., 0.0000, 0.2030, 0.0000],
        [1.0315, 0.0000, 0.8254,  ..., 0.0296, 0.2030, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.1224, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.1224, 0.0000, 0.1701,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       grad_fn=<MulBackward0>)


  0%|          | 10/4046 [04:54<34:14:39, 30.55s/it]

torch.Size([92, 256])
tensor([[0.0000, 0.0000, 0.3014,  ..., 0.0000, 0.0000, 0.6114],
        [0.6166, 0.0000, 0.0000,  ..., 0.0000, 0.2594, 0.6114],
        [0.6166, 0.0000, 0.3014,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.3384,  ..., 0.0000, 0.1338, 0.0000],
        [0.9507, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.9507, 0.0000, 0.3384,  ..., 0.0000, 0.1338, 0.0000]],
       grad_fn=<MulBackward0>)


  0%|          | 11/4046 [05:24<34:01:35, 30.36s/it]

torch.Size([83, 256])
tensor([[0.0000, 0.0000, 0.0000,  ..., 0.1070, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.1070, 0.0000, 0.0000],
        [0.6763, 0.0000, 0.0445,  ..., 0.1070, 0.0000, 0.0000],
        ...,
        [0.7001, 0.0000, 0.1695,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.6296, 0.1695,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.1695,  ..., 0.0000, 0.2925, 0.0000]],
       grad_fn=<MulBackward0>)


  0%|          | 12/4046 [05:55<34:17:50, 30.61s/it]

torch.Size([78, 256])
tensor([[0.0298, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.3451],
        [0.0298, 0.0000, 0.0000,  ..., 0.0000, 0.5982, 0.3451],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.5982, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.4146, 0.6630],
        [0.6215, 0.0000, 0.3785,  ..., 0.0000, 0.4146, 0.0000],
        [0.6215, 0.0000, 0.0000,  ..., 0.0000, 0.4146, 0.6630]],
       grad_fn=<MulBackward0>)


  0%|          | 13/4046 [06:30<35:28:51, 31.67s/it]

torch.Size([81, 256])
tensor([[0.2610, 0.0098, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.2610, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.2610, 0.0098, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.6273],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.5695, 0.6273],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       grad_fn=<MulBackward0>)


  0%|          | 14/4046 [07:05<36:43:02, 32.78s/it]

torch.Size([79, 256])
tensor([[0.6785, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.1379],
        [0.6785, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.1379],
        ...,
        [0.0000, 0.0000, 0.1113,  ..., 0.0000, 0.0000, 0.0000],
        [0.1302, 0.0000, 0.1113,  ..., 0.1813, 0.0000, 0.1095],
        [0.1302, 0.0000, 0.1113,  ..., 0.0000, 0.0000, 0.1095]],
       grad_fn=<MulBackward0>)


  0%|          | 15/4046 [07:35<35:50:13, 32.01s/it]

torch.Size([79, 256])
tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.4805],
        [0.0000, 0.1897, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.4805],
        ...,
        [0.2277, 0.0097, 0.0000,  ..., 0.0000, 0.0000, 0.2822],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.3257, 0.0000],
        [0.2277, 0.0097, 0.0000,  ..., 0.0000, 0.0000, 0.2822]],
       grad_fn=<MulBackward0>)


  0%|          | 16/4046 [08:02<34:11:32, 30.54s/it]

torch.Size([74, 256])
tensor([[0.1324, 0.0000, 0.0000,  ..., 0.0000, 0.1279, 0.1426],
        [0.1324, 0.0000, 0.0000,  ..., 0.0000, 0.1279, 0.0000],
        [0.0000, 0.0000, 0.0312,  ..., 0.0000, 0.1279, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.5059, 0.1286, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.6121, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       grad_fn=<MulBackward0>)


  0%|          | 16/4046 [08:32<35:52:08, 32.04s/it]


KeyboardInterrupt: 

: 

In [None]:
# Per-epoch training loss (blue) against validation loss (purple).
plt.plot(average_losses, color='blue', label='average_loss')
plt.plot(validation_losses, color='purple', label='validation_loss')
plt.title('losses')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.show()

## Modeli C++ formatinda kaydetme

In [None]:
# Export the model as TorchScript so it can be loaded outside Python.
# Switch to eval mode first: the scripted module keeps the training flag, and
# an interrupted training run leaves the model in train mode, which would
# export it with dropout active.
model.eval()
model_scripted = torch.jit.script(model)
model_scripted.save('model_scripted.pt')