<a href="https://colab.research.google.com/github/Nikelroid/DeepLearning_Project/blob/main/Phase-1/Part2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading data onto the disk

In [None]:
# Mount Google Drive at /content/drive so the project files stored there are
# reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Make the project folder the working directory. The path is relative, so this
# only works when the current directory is the one containing `drive/`.
%cd drive/My Drive/deep_learning/Project

/content/drive/My Drive/deep_learning/Project


**Run this just for the first time**

In [None]:
# One-time setup: stage the image archives and the MSCTD annotation files,
# unpack the archives, then sort everything into dataset/{train,test,dev}.
# NOTE(review): `%%bash` below is a cell magic, which IPython normally only
# honours on the first line of a cell; it appears mid-cell here, so this cell
# probably needs to be split before it can be re-run -- confirm.
# NOTE(review): the unzip loop globs dataset/*.zip, but `dataset` is only
# created afterwards and the archives are copied to `.` -- verify where the
# archives are really expected to live.
!cp train_ende.zip .
!cp test.zip .
# Fetch the MSCTD repository and copy its English text, image-index and
# sentiment files into the working directory.
! git clone https://github.com/XL2248/MSCTD
!cp MSCTD/MSCTD_data/ende/english_*.txt .
!cp MSCTD/MSCTD_data/ende/image_index_*.txt .
!cp MSCTD/MSCTD_data/ende/sentiment_*.txt .
# !pip install --upgrade --no-cache-dir gdown
# !gdown --id 1GAZgPpTUBSfhne-Tp0GDkvSHuq6EMMbj
# !gdown --id 1B9ZFmSTqfTMaqJ15nQDrRNLqBvo-B39W
%%bash
for x in dataset/*.zip
do
  unzip -qq $x
done;
# Create the split folders and move every entry whose name contains
# train/test/dev into the matching folder.
!mkdir dataset
!cd dataset; mkdir train test dev
!mv *train* dataset/train
!mv *test* dataset/test
!mv *dev* dataset/dev

cp: 'train_ende.zip' and './train_ende.zip' are the same file
cp: cannot stat 'test.zip': No such file or directory


# Dataset and Dataloader

In [None]:
import torch
from torchvision import transforms as T
from torchvision.io import read_image
from torch.utils.data import Dataset
import torch.nn as nn
from torchvision.models import efficientnet_b2, EfficientNet_B2_Weights

from PIL import Image
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from itertools import groupby
import seaborn as sns
# import linecache

In [None]:
class MSCTD_Dataset (Dataset):
    """Image/sentiment dataset for one MSCTD split.

    Each sample is the image ``<images_dir>/<idx>.jpg`` paired with the integer
    sentiment label found on line ``idx`` of the sentiment file.

    Args:
        dataset_dir: Root folder of the split; every other path is relative to it.
        images_dir: Sub-folder holding the ``<idx>.jpg`` images.
        conversation_dir: File whose lines are kept verbatim in
            ``self.conversations`` (loaded but not used by ``__getitem__``).
        texts: File with one utterance per line, kept in ``self.texts``
            (loaded but not used by ``__getitem__``).
        sentiments: File with one integer label per line.
        transform: Optional callable applied to the image array; may be ``None``.
    """

    def __init__(self, dataset_dir, images_dir, conversation_dir, texts, sentiments, transform):
        self.dataset_path = Path(dataset_dir)
        self.images_path = self.dataset_path / images_dir
        self.sentiment_path = self.dataset_path / sentiments
        self.text_path = self.dataset_path / texts
        self.conversations_path = self.dataset_path / conversation_dir
        self.transform = transform

        # Explicit encoding so the result does not depend on the platform default.
        with open(self.text_path, 'r', encoding='utf-8') as f:
            self.texts = f.read().splitlines()

        with open(self.sentiment_path, 'r', encoding='utf-8') as f:
            self.sentiments = np.array(f.read().splitlines()).astype("int32")

        with open(self.conversations_path, 'r', encoding='utf-8') as f:
            self.conversations = np.array(f.read().splitlines())

        # One label per sample: the label count is the dataset size, so the
        # sentiment file does not have to be read a second time.
        self.length = len(self.sentiments)

    def __len__(self):
        """Return the number of samples (lines in the sentiment file)."""
        return self.length

    def __getitem__(self, idx):
        """Return ``(image, sentiment)`` for sample ``idx``.

        The image is loaded as RGB and scaled to ``[0, 1]`` as a float array of
        shape ``(H, W, 3)`` before the optional transform is applied.
        """
        img_path = self.images_path / f'{idx}.jpg'
        # The context manager closes the file handle deterministically.
        # Forcing RGB guarantees three channels even for grayscale/CMYK files,
        # which a three-channel normalisation and batching both require.
        with Image.open(img_path) as img:
            image = np.divide(np.array(img.convert("RGB")), 255)

        if self.transform:
            image = self.transform(image)

        sentiment = self.sentiments[idx]
        return image, sentiment

In [None]:
import torchvision.transforms as transforms

# Preprocessing applied to every image: convert to a tensor, resize to
# 288x288 with bicubic interpolation, then normalise each channel with the
# given mean / std.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Resize(
            size=(288, 288),
            interpolation=transforms.InterpolationMode("bicubic"),
        ),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Train and test splits share the same preprocessing pipeline.
trainset = MSCTD_Dataset(
    dataset_dir='dataset/train',
    images_dir='train_ende',
    conversation_dir='image_index_train.txt',
    texts='english_train.txt',
    sentiments='sentiment_train.txt',
    transform=transform,
)
testset = MSCTD_Dataset(
    dataset_dir='dataset/test',
    images_dir='test',
    conversation_dir='image_index_test.txt',
    texts='english_test.txt',
    sentiments='sentiment_test.txt',
    transform=transform,
)

In [None]:
# Sanity check: pull a single test sample (index 10) through the dataset and
# its transform pipeline.
image, sentiment = testset[10]


In [None]:
# Show the transformed image of the sample fetched in the previous cell.
image

In [None]:

# Prefer the GPU when one is visible to PyTorch, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using {device} for inference')

Using cuda for inference


In [None]:
import torchvision
# NOTE(review): `data_dir` is not referenced by any visible cell -- confirm it is still needed.
data_dir = './data'

# Batches of 64 with two worker processes each; only the training loader shuffles.
train_loader = torch.utils.data.DataLoader(
    dataset=trainset,
    batch_size=64,
    shuffle=True,
    num_workers=2,
)
test_loader = torch.utils.data.DataLoader(
    dataset=testset,
    batch_size=64,
    shuffle=False,
    num_workers=2,
)

In [None]:
from tqdm import tqdm
def train_epoch(net: nn.Module, criterion: nn.Module, optimizer: torch.optim.Optimizer, dataloader: torch.utils.data.DataLoader,   accs_train ,loss_train):
    """Train ``net`` for one pass over ``dataloader`` and record epoch metrics.

    Args:
        net: Model to optimise; batches are moved to the device of its parameters.
        criterion: Loss called as ``criterion(logits, labels)``.
        optimizer: Optimiser stepped once per batch.
        dataloader: Sized iterable yielding ``(inputs, labels)`` batches.
        accs_train: List that receives this epoch's accuracy (fraction in [0, 1]).
        loss_train: List that receives this epoch's mean per-batch loss.

    Returns:
        The same ``(accs_train, loss_train)`` lists, each extended by one entry.
        If the loader yields no samples both entries are ``nan`` instead of
        raising ``ZeroDivisionError``.
    """
    # Follow the model's own device rather than a notebook-level global, so
    # inputs and weights can never end up on different devices.
    first_param = next(net.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    epoch_loss = 0.0
    epoch_true = 0
    epoch_all = 0
    num_batches = 0

    net.train()
    optimizer.zero_grad()

    with tqdm(enumerate(dataloader), total=len(dataloader)) as pbar:
        for i, (x, y) in pbar:
            x = x.to(run_device).float()
            y = y.to(run_device).to(torch.int64)

            p = net(x).float()
            loss = criterion(p, y)
            epoch_loss += loss.item()
            predictions = p.argmax(-1)
            epoch_all += len(predictions)
            # .item() keeps the running count a plain Python int instead of a
            # tensor that lives on the training device.
            epoch_true += (predictions == y).sum().item()
            num_batches = i + 1
            pbar.set_description(f'Loss: {epoch_loss / num_batches:.3e} - Acc: {epoch_true * 100. / epoch_all:.2f}%')
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

    if epoch_all == 0:
        # Nothing was seen: the metrics are undefined, so record NaN.
        accs_train.append(float("nan"))
        loss_train.append(float("nan"))
    else:
        accs_train.append(epoch_true / epoch_all)
        loss_train.append(epoch_loss / num_batches)
    return accs_train,loss_train

def eval_epoch(net: nn.Module, criterion: nn.Module, dataloader: torch.utils.data.DataLoader,    accs_test ,loss_test ):
    """Evaluate ``net`` on ``dataloader`` without gradients and record metrics.

    Args:
        net: Model to evaluate; batches are moved to the device of its parameters.
        criterion: Loss called as ``criterion(logits, labels)``.
        dataloader: Sized iterable yielding ``(inputs, labels)`` batches.
        accs_test: List that receives this pass's accuracy (fraction in [0, 1]).
        loss_test: List that receives this pass's mean per-batch loss.

    Returns:
        The same ``(accs_test, loss_test)`` lists, each extended by one entry.
        If the loader yields no samples both entries are ``nan`` instead of
        raising ``ZeroDivisionError``.
    """
    # Follow the model's own device rather than a notebook-level global, so
    # inputs and weights can never end up on different devices.
    first_param = next(net.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    epoch_loss = 0.0
    epoch_true = 0
    epoch_all = 0
    num_batches = 0

    net.eval()
    with torch.no_grad(), tqdm(enumerate(dataloader), total=len(dataloader)) as pbar:
        for i, (x,y) in pbar:
            x = x.to(run_device).float()
            y = y.to(run_device).to(torch.int64)
            p = net(x).float()
            loss = criterion(p, y)
            epoch_loss += loss.item()

            # Top-1 prediction per sample.
            predictions = p.argmax(-1)
            epoch_all += len(predictions)
            # .item() keeps the running count a plain Python int instead of a
            # tensor that lives on the evaluation device.
            epoch_true += (predictions == y).sum().item()
            num_batches = i + 1

            pbar.set_description(f'Loss: {epoch_loss / num_batches:.3e} - Acc: {epoch_true * 100. / epoch_all:.2f}% ')

    if epoch_all == 0:
        # Nothing was seen: the metrics are undefined, so record NaN.
        accs_test.append(float("nan"))
        loss_test.append(float("nan"))
    else:
        accs_test.append(epoch_true / epoch_all)
        loss_test.append(epoch_loss / num_batches)
    return accs_test,loss_test


In [None]:
class lastLayer(nn.Module):
    """Feature extractor followed by a small dropout/linear classification head.

    Args:
        pretrained: Module whose output is a ``(batch, in_features)`` tensor.
        in_features: Width of the features produced by ``pretrained``. The
            default of 1408 is the width this notebook's head was built for.
        num_classes: Number of output logits (default 3).

    The head keeps the layer layout ``Dropout -> Linear`` three times, so the
    trainable parameters stay named ``last.1``, ``last.3`` and ``last.5``.

    NOTE(review): there is no non-linearity between the linear layers, so apart
    from dropout the head is equivalent to a single linear map. Adding
    activations would shift the ``nn.Sequential`` indices and break previously
    saved state dicts, so it is left as is.
    """

    def __init__(self, pretrained, in_features=1408, num_classes=3):
        super().__init__()
        self.pretrained = pretrained
        self.last = nn.Sequential(
            nn.Dropout(p=0.2, inplace=True),
            nn.Linear(in_features, 90),
            nn.Dropout(p=0.3, inplace=True),
            nn.Linear(90, 30),
            nn.Dropout(p=0.1, inplace=True),
            nn.Linear(30, num_classes),
        )

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for input ``x``."""
        features = self.pretrained(x)
        return self.last(features)



# Backbone: ImageNet-pretrained EfficientNet-B2 whose classifier is swapped
# for an empty Sequential, so the network hands back whatever previously fed
# the classifier.
net = efficientnet_b2(weights=EfficientNet_B2_Weights.IMAGENET1K_V1)
net.classifier = nn.Sequential()

# Freeze every backbone weight; only the head attached below gets trained.
net.requires_grad_(False)

net = lastLayer(net).to(device)
criterion = nn.CrossEntropyLoss().to(device)

# Collect (and list) the parameters that still require gradients.
print("Params to learn:")
params_to_update = []
for name, param in net.named_parameters():
    if not param.requires_grad:
        continue
    params_to_update.append(param)
    print("\t",name)

optimizer = torch.optim.RMSprop(params_to_update, lr=2e-4)

epochs = 20
from time import time
accs_train = []
loss_train = []
accs_test = []
loss_test = []

# One training pass and one evaluation pass per epoch; the weights are saved
# whenever the latest test accuracy equals the best seen so far.
for e in range(epochs):
    start_time = time()
    accs_train, loss_train = train_epoch(
        net, criterion, optimizer, train_loader, accs_train, loss_train
    )
    accs_test, loss_test = eval_epoch(
        net, criterion, test_loader, accs_test, loss_test
    )
    if accs_test[-1] == max(accs_test):
        torch.save(net.state_dict(), 'scene_modal_en.pth')
    end_time = time()

    print(f'Epoch {e+1:3} finished in {end_time - start_time:.2f}s')

# Draw the loss and accuracy curves (test in red, train in blue) and save each
# figure next to the notebook.
for test_curve, train_curve, title, ylabel, filename in (
    (loss_test, loss_train, 'Model loss', 'Loss', 'loss4.jpg'),
    (accs_test, accs_train, 'Model Accuracy', 'Accuracy', 'acc4.jpg'),
):
    plt.plot(np.array(test_curve), 'r')
    plt.plot(np.array(train_curve), 'b')
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('Epoch')
    plt.legend(['Test', 'Train'])
    plt.savefig(filename)
    plt.show()


print(f'Best Accuracy :{max(accs_test) * 100.:.2f}%')

Params to learn:
	 last.1.weight
	 last.1.bias
	 last.3.weight
	 last.3.bias
	 last.5.weight
	 last.5.bias


Loss: 1.088e+00 - Acc: 39.13%: 100%|██████████| 317/317 [1:22:54<00:00, 15.69s/it]
Loss: 1.093e+00 - Acc: 40.71% : 100%|██████████| 80/80 [34:45<00:00, 26.07s/it]


Epoch   1 finished in 7060.13s


Loss: 1.080e+00 - Acc: 40.72%: 100%|██████████| 317/317 [08:33<00:00,  1.62s/it]
Loss: 1.096e+00 - Acc: 39.04% : 100%|██████████| 80/80 [01:56<00:00,  1.45s/it]

Epoch   2 finished in 630.07s



Loss: 1.074e+00 - Acc: 41.82%: 100%|██████████| 317/317 [08:24<00:00,  1.59s/it]
Loss: 1.100e+00 - Acc: 36.33% : 100%|██████████| 80/80 [01:56<00:00,  1.45s/it]


Epoch   3 finished in 620.97s


Loss: 1.073e+00 - Acc: 42.08%: 100%|██████████| 317/317 [08:32<00:00,  1.62s/it]
Loss: 1.099e+00 - Acc: 39.75% : 100%|██████████| 80/80 [01:58<00:00,  1.48s/it]

Epoch   4 finished in 631.22s



Loss: 1.069e+00 - Acc: 42.50%: 100%|██████████| 317/317 [08:37<00:00,  1.63s/it]
Loss: 1.097e+00 - Acc: 40.32% : 100%|██████████| 80/80 [01:58<00:00,  1.48s/it]

Epoch   5 finished in 635.35s



Loss: 1.068e+00 - Acc: 42.82%: 100%|██████████| 317/317 [08:36<00:00,  1.63s/it]
Loss: 1.102e+00 - Acc: 39.16% : 100%|██████████| 80/80 [01:58<00:00,  1.48s/it]

Epoch   6 finished in 634.83s



Loss: 1.066e+00 - Acc: 42.87%: 100%|██████████| 317/317 [08:33<00:00,  1.62s/it]
Loss: 1.111e+00 - Acc: 37.25% :  62%|██████▎   | 50/80 [01:13<00:40,  1.35s/it]

# Part 3 question


No, this approach is not suitable for this task. Emotions and feelings in movies are conveyed mainly by faces, tone of voice, and the conversation itself. Judging the mood and emotion of a scene without considering the characters' faces and states therefore does not give very good results on this task.<br>
This accuracy (43%) is higher than the chance baseline (33%) only because the model detects general properties of an image, such as brightness and camera angle, and is trained on them.
This model uses EfficientNet_B2, with about 8M parameters, followed by three fully connected layers with 90, 30, and 3 neurons.<br>
During training, the model overfits the training set strongly after epoch 2: training accuracy keeps rising while validation accuracy drops sharply. This suggests that the model could achieve a better result, for example with stronger regularization or early stopping.