In [None]:
import os
import io
import requests
from PIL import Image
import pandas as pd

In [None]:
# Load the cleaned test split, using the "Id" column as the row index, and preview it.
test_df = pd.read_csv(filepath_or_buffer="./test_clean.csv", index_col="Id")
test_df.head(n=5)

In [None]:
image_directory_test = "./drive/MyDrive/Factify_Images_Test/test"

# Overwrite the two image-path columns (positions 0 and 3) with local file paths
# of the form <root>/claim/<Id>.jpg and <root>/document/<Id>.jpg.
# Each path is built from the row's own Id (the frame index) and the whole column
# is assigned at once, so this does not rely on the Ids being exactly 1..N in row
# order and avoids a Python-level loop over every row.
claim_image_column = test_df.columns[0]
document_image_column = test_df.columns[3]
test_df[claim_image_column] = [
    image_directory_test + "/claim/" + str(row_id) + ".jpg" for row_id in test_df.index
]
test_df[document_image_column] = [
    image_directory_test + "/document/" + str(row_id) + ".jpg" for row_id in test_df.index
]

In [None]:
# Preview the frame again now that the image-path columns have been rewritten.
test_df.head(n=5)

In [None]:
# Class name -> integer label used by the model.
category_to_ind = dict(
    Support_Multimodal=0,
    Support_Text=1,
    Insufficient_Multimodal=2,
    Insufficient_Text=3,
    Refute=4,
)
# Integer label -> class name; derived from the table above so the two cannot drift apart.
ind_to_category = {index: name for name, index in category_to_ind.items()}

In [None]:
!pip install transformers

In [None]:
!pip install albumentations

In [None]:
from transformers import BertTokenizer, AdamW, get_linear_schedule_with_warmup
import os
import cv2
from PIL import Image
import pdb
import time
import copy
import warnings
import random
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim
import torch.backends.cudnn as cudnn
from torch.utils.data import DataLoader, Dataset, sampler
from matplotlib import pyplot as plt
#from albumentations import (HorizontalFlip,VerticalFlip,RandomScale,CenterCrop, Rotate, ShiftScaleRotate, Normalize, Resize, Compose, GaussNoise,RandomRotate90,Transpose,RandomBrightnessContrast,RandomCrop)
from albumentations.pytorch import ToTensorV2
import albumentations as A
import matplotlib.image as mpi
from pathlib import Path
from sklearn.metrics import recall_score,f1_score
from sklearn.model_selection import StratifiedKFold
import gc
# Silence library warnings so cell output stays readable.
warnings.filterwarnings("ignore")

# Seed every random number generator the notebook relies on so reruns are repeatable.
seed = 53
random.seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)
np.random.seed(seed)
# torch.manual_seed seeds the default (CPU) generator; without it weight init,
# dropout and shuffling on the CPU side stay unseeded.
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
# Ask cuDNN for deterministic kernels and disable its auto-tuner, which may
# otherwise pick different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Claim/document pairs served as images, tokenized text and a label.

    The base class is referenced by its full path so that re-running this cell
    does not make the class inherit from its own previous definition.

    Args:
        df: DataFrame read through the columns ``claim_image``, ``document_image``,
            ``claim``, ``document`` and, unless ``phase == 'test'``, ``Category``.
        mean: Per-channel mean forwarded to ``get_transforms``.
        std: Per-channel standard deviation forwarded to ``get_transforms``.
        phase: ``'train'``, ``'val'`` or ``'test'``; selects the transform
            pipeline and whether a real label is read.

    Each item is a dict with keys:
        ``"text"``: ``(claim_encoding, document_encoding)`` tokenizer outputs whose
            ``input_ids`` / ``attention_mask`` / ``token_type_ids`` have the
            leading batch dimension squeezed away.
        ``"images"``: ``(claim_image, document_image)`` transformed tensors.
        ``"labels"``: a long tensor holding ``Category``, or the int ``0`` in
            the test phase.
    """

    # Length every text is padded / truncated to.
    MAX_TEXT_LENGTH = 64
    # Height and width of the all-black stand-in used for a missing document image.
    BLANK_IMAGE_SHAPE = (256, 256)

    def __init__(self, df, mean, std, phase):
        self.df = df
        self.mean = mean
        self.std = std
        self.phase = phase
        self.transforms = get_transforms(phase, mean, std)
        self.fnames = self.df.index
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                       do_lower_case=True)

    def _load_image(self, path, allow_missing):
        """Read ``path`` as RGB and run it through the transform pipeline.

        When the file cannot be read, an all-black image is substituted if
        ``allow_missing`` is true; otherwise ``FileNotFoundError`` is raised so
        a wrong image directory is reported with the offending path.
        """
        image = cv2.imread(path) if isinstance(path, str) else None
        if image is None:
            if not allow_missing:
                raise FileNotFoundError(f"Could not read image: {path!r}")
            height, width = self.BLANK_IMAGE_SHAPE
            image = np.zeros((height, width, 3), dtype=np.uint8)
        else:
            # OpenCV decodes to BGR; the rest of the pipeline works on RGB.
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        return self.transforms(image=image)['image']

    def _encode_text(self, text):
        """Tokenize one text to a fixed length; non-string values (e.g. NaN) become " "."""
        if not isinstance(text, str):
            text = " "
        encoded = self.tokenizer(
            text,
            add_special_tokens=True,
            return_attention_mask=True,
            # `padding='max_length'` is the supported spelling of the deprecated
            # `pad_to_max_length=True`.
            padding='max_length',
            max_length=self.MAX_TEXT_LENGTH,
            return_tensors='pt',
            truncation=True,
        )
        # return_tensors='pt' adds a leading batch axis of size 1; drop it so the
        # DataLoader can stack samples into a batch itself.
        for key in ("input_ids", "attention_mask", "token_type_ids"):
            encoded[key] = encoded[key].squeeze(0)
        return encoded

    def __getitem__(self, idx):
        # Image data.
        # The claim image is required. The document image is now actually loaded
        # from its path (the path used to be read and then ignored in favour of a
        # blank image); the blank image is kept only as a fallback for files that
        # cannot be read.
        img_claim = self._load_image(self.df['claim_image'].iloc[idx], allow_missing=False)
        img_document = self._load_image(self.df['document_image'].iloc[idx], allow_missing=True)

        # Text data.
        encoded_text_claim = self._encode_text(self.df['claim'].iloc[idx])
        encoded_text_document = self._encode_text(self.df['document'].iloc[idx])

        # Label: the test phase has no ground truth, so a placeholder 0 is used.
        if self.phase == 'test':
            label = 0
        else:
            label = torch.tensor(self.df['Category'].iloc[idx], dtype=torch.long)

        return {
            "text": (encoded_text_claim, encoded_text_document),
            "images": (img_claim, img_document),
            "labels": label,
        }

    def __len__(self):
        return len(self.fnames)

def get_transforms(phase, mean, std):
    """Build the albumentations pipeline for one phase.

    For ``phase == 'train'`` three augmentations come first (RGB shift,
    horizontal flip, rotation of up to 10 degrees). Every phase then resizes to
    256x256, normalizes with ``mean`` / ``std`` and converts to a tensor.

    Returns:
        An ``A.Compose`` object; call it as ``pipeline(image=array)['image']``.
    """
    if phase == 'train':
        augmentations = [
            A.RGBShift(r_shift_limit=15, g_shift_limit=15, b_shift_limit=15, p=0.5),
            A.HorizontalFlip(),
            A.Rotate(limit=10, p=.5),
        ]
    else:
        augmentations = []

    # Steps shared by all phases; interpolation flag 1 is passed through to Resize unchanged.
    common_steps = [
        A.Resize(256, 256, interpolation=1),
        A.Normalize(mean=mean, std=std, p=1),
        ToTensorV2(),
    ]
    return A.Compose(augmentations + common_steps)
def provider(
    data_frame,
    phase,
    mean=None,
    std=None,
    batch_size=8,
    num_workers=0,
    split_size = 0.2,
    random_state=53,
):
    """Return a DataLoader for the requested phase.

    Args:
        data_frame: Full frame. For ``phase == 'test'`` it is used as is; for any
            other phase it is split into train/validation parts, stratified on
            its ``Category`` column.
        phase: ``'train'``, ``'test'``, or anything else for the validation part.
        mean, std: Normalization statistics forwarded to ``Dataset``.
        batch_size: Samples per batch.
        num_workers: DataLoader worker processes.
        split_size: Fraction of ``data_frame`` held out for validation; ignored
            for the test phase.
        random_state: Seed for the train/validation split. It is fixed so that a
            ``phase='train'`` call and a later ``phase='val'`` call partition the
            frame identically; an unseeded split would draw a new partition on
            every call and let validation rows leak into training.

    Returns:
        A ``DataLoader`` over ``Dataset``. Batches are shuffled for the training
        phase only, so validation and test order stays aligned with the frame.
    """
    if phase == "test":
        df = data_frame
    else:
        train_df, val_df = train_test_split(
            data_frame,
            test_size=split_size,
            stratify=data_frame['Category'],
            random_state=random_state,
        )
        df = train_df if phase == "train" else val_df

    image_dataset = Dataset(df, mean, std, phase)
    return DataLoader(
        image_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=False,
        shuffle=(phase == "train"),
    )

In [None]:
# Build the loader for the test frame. The mean/std triples are presumably the
# usual ImageNet channel statistics (TODO confirm against the backbone's training setup).
test_data_loader = provider(
    data_frame=test_df,
    phase='test',
    batch_size=4,
    num_workers=8,
    mean=(0.485, 0.456, 0.406),
    std=(0.229, 0.224, 0.225),
    split_size=0.02,  # provider() does not split when phase == 'test'
)

In [None]:
!pip install efficientnet_pytorch

In [None]:
# Model definition and training

import torch.nn as nn
import transformers
from transformers import BertTokenizer, BertModel
import torch
from efficientnet_pytorch import EfficientNet

In [None]:
# Text_X2_Image_X2

class TwoInputTextModel(nn.Module):
    """Shared BERT encoder applied to two tokenized texts.

    Both texts go through the same BERT weights, dropout, linear projection
    (768 -> ``final_layer_size``) and batch norm.
    """

    def __init__(self, final_layer_size):
        super().__init__()
        # The tokenizer is kept as an attribute; forward() itself expects inputs
        # that are already tokenized.
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            "bert-base-uncased", do_lower_case=True
        )
        # return_dict=False makes BERT return a tuple, unpacked in _project().
        self.bert = transformers.BertModel.from_pretrained(
            "bert-base-uncased", return_dict=False
        )
        self.bert_drop = nn.Dropout(0.1)
        self.out1 = nn.Linear(768, final_layer_size)
        self.norm = nn.BatchNorm1d(final_layer_size)

    def _project(self, encoded_text):
        """Encode one tokenized batch and project BERT's second output."""
        _, pooled = self.bert(**encoded_text)
        projected = self.out1(self.bert_drop(pooled))
        return self.norm(projected)

    def forward(self, inputs):
        """Return ``[features(inputs[0]), features(inputs[1])]``."""
        first_text, second_text = inputs[0], inputs[1]
        first_features = self._project(first_text)
        second_features = self._project(second_text)
        return [first_features, second_features]

class TwoInputImgModel(nn.Module):
    """Shared EfficientNet-b3 encoder applied to two image batches.

    Each batch is turned into a feature map, globally average-pooled, flattened,
    passed through dropout, projected from 1536 to ``hidden_layer_size`` and
    batch-normalized. Both inputs use the same weights.
    """

    def __init__(self, hidden_layer_size):
        super().__init__()
        self.effnet = EfficientNet.from_pretrained('efficientnet-b3')
        self._avg_pooling = nn.AdaptiveAvgPool2d(1)
        self.drop = nn.Dropout(0.1)
        self.dense_layer1 = nn.Linear(1536, hidden_layer_size)
        self.norm2 = nn.BatchNorm1d(hidden_layer_size)

    def _embed(self, image_batch):
        """Map one image batch to its normalized feature vectors."""
        feature_map = self.effnet.extract_features(image_batch)
        # Pool each channel to a single value, then flatten to (batch, channels).
        pooled = torch.flatten(self._avg_pooling(feature_map), start_dim=1)
        projected = self.dense_layer1(self.drop(pooled))
        return self.norm2(projected)

    def forward(self, inputs):
        """Return ``[features(inputs[0]), features(inputs[1])]``."""
        first_images, second_images = inputs[0], inputs[1]
        first_features = self._embed(first_images)
        second_features = self._embed(second_images)
        return [first_features, second_features]

class BigModel(nn.Module):
    """Multimodal classifier that fuses image and text per side (claim, document).

    The first image feature is concatenated with the first text feature and the
    second image feature with the second text feature. Each pair is projected to
    256 units and batch-normalized; the two results are concatenated and
    classified.

    Args:
        hidden_layer_size_text: Width of each text feature vector.
        hidden_layer_size_image: Width of each image feature vector.
        num_classes: Number of output classes.
    """

    def __init__(self, hidden_layer_size_text, hidden_layer_size_image, num_classes):
        super().__init__()
        self.image_model = TwoInputImgModel(hidden_layer_size_image)
        self.text_model = TwoInputTextModel(hidden_layer_size_text)
        # Each fused vector is one image feature plus one text feature, so both
        # projections take image + text inputs. Sizing them as image*2 / text*2
        # only matches when the two hidden sizes happen to be equal.
        fused_size = hidden_layer_size_image + hidden_layer_size_text
        self.dense1 = nn.Linear(fused_size, 256)
        self.dense2 = nn.Linear(fused_size, 256)
        self.norm1 = nn.BatchNorm1d(256)
        self.norm2 = nn.BatchNorm1d(256)
        self.out = nn.Linear(256*2, num_classes)

    def forward(self, inputs):
        """Return class probabilities of shape ``(batch, num_classes)``.

        ``inputs`` is a mapping with ``"images"`` and ``"text"`` entries, each a
        pair that is handed to the image and text sub-models respectively.
        """
        img_op1, img_op2 = self.image_model(inputs["images"])
        text_op1, text_op2 = self.text_model(inputs["text"])

        # Every tensor is flattened using its own batch size.
        combined1 = torch.cat((img_op1.view(img_op1.size(0), -1),
                               text_op1.view(text_op1.size(0), -1)),
                              dim=1)
        combined2 = torch.cat((img_op2.view(img_op2.size(0), -1),
                               text_op2.view(text_op2.size(0), -1)),
                              dim=1)

        combined1 = self.norm1(self.dense1(combined1))
        combined2 = self.norm2(self.dense2(combined2))

        combined = torch.cat((combined1.view(combined1.size(0), -1),
                              combined2.view(combined2.size(0), -1)),
                             dim=1)

        final_output = self.out(combined)
        # Normalize over the class axis explicitly; relying on the implicit
        # softmax dimension is deprecated.
        return torch.softmax(final_output, dim=1)

In [None]:
# Text_Image_X2

class TwoInputTextModel1(nn.Module):
    """Text branch for ``BigModel1``: one BERT encoder shared by two texts.

    Each tokenized text is encoded by BERT, its second output goes through
    dropout, a 768 -> ``final_layer_size`` linear layer and batch norm.
    """

    def __init__(self, final_layer_size):
        super().__init__()
        # Stored for convenience; forward() receives inputs that are already tokenized.
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            "bert-base-uncased", do_lower_case=True
        )
        # Tuple outputs (return_dict=False) are unpacked in forward().
        self.bert = transformers.BertModel.from_pretrained(
            "bert-base-uncased", return_dict=False
        )
        self.bert_drop = nn.Dropout(0.1)
        self.out1 = nn.Linear(768, final_layer_size)
        self.norm = nn.BatchNorm1d(final_layer_size)

    def forward(self, inputs):
        """Return a two-element list of features, one per entry of ``inputs``."""
        output = []
        # Process the two texts in order with the same layers.
        for position in range(2):
            _, pooled = self.bert(**inputs[position])
            features = self.out1(self.bert_drop(pooled))
            output.append(self.norm(features))
        return output

class TwoInputImgModel1(nn.Module):
    """Image branch for ``BigModel1``: one EfficientNet-b3 shared by two image batches.

    Feature maps are globally average-pooled, flattened, passed through dropout,
    projected from 1536 to ``hidden_layer_size`` and batch-normalized.
    """

    def __init__(self, hidden_layer_size):
        super().__init__()
        self.effnet = EfficientNet.from_pretrained('efficientnet-b3')
        self._avg_pooling = nn.AdaptiveAvgPool2d(1)
        self.drop = nn.Dropout(0.1)
        self.dense_layer1 = nn.Linear(1536, hidden_layer_size)
        self.norm2 = nn.BatchNorm1d(hidden_layer_size)

    def forward(self, inputs):
        """Return a two-element list of features, one per entry of ``inputs``."""
        output = []
        # Process the two image batches in order with the same layers.
        for position in range(2):
            feature_map = self.effnet.extract_features(inputs[position])
            pooled = self._avg_pooling(feature_map).flatten(start_dim=1)
            features = self.dense_layer1(self.drop(pooled))
            output.append(self.norm2(features))
        return output

class BigModel1(nn.Module):
    """Multimodal classifier that fuses per modality (images, texts).

    The two image features are concatenated and projected to 256 units, the two
    text features likewise; both projections are batch-normalized, concatenated
    and classified.

    Args:
        hidden_layer_size_text: Width of each text feature vector.
        hidden_layer_size_image: Width of each image feature vector.
        num_classes: Number of output classes.
    """

    def __init__(self, hidden_layer_size_text, hidden_layer_size_image, num_classes):
        super().__init__()
        self.image_model = TwoInputImgModel1(hidden_layer_size_image)
        self.text_model = TwoInputTextModel1(hidden_layer_size_text)
        self.dense1 = nn.Linear((hidden_layer_size_image)*2, 256)
        self.dense2 = nn.Linear((hidden_layer_size_text)*2, 256)
        self.norm1 = nn.BatchNorm1d(256)
        self.norm2 = nn.BatchNorm1d(256)
        self.out = nn.Linear(256*2, num_classes)

    def forward(self, inputs):
        """Return class probabilities of shape ``(batch, num_classes)``.

        ``inputs`` is a mapping with ``"images"`` and ``"text"`` entries, each a
        pair that is handed to the image and text sub-models respectively.
        """
        img_op1, img_op2 = self.image_model(inputs["images"])
        text_op1, text_op2 = self.text_model(inputs["text"])

        # Image pair -> one vector, text pair -> one vector.
        combined1 = torch.cat((img_op1.view(img_op1.size(0), -1),
                               img_op2.view(img_op2.size(0), -1)),
                              dim=1)
        combined2 = torch.cat((text_op1.view(text_op1.size(0), -1),
                               text_op2.view(text_op2.size(0), -1)),
                              dim=1)

        combined1 = self.norm1(self.dense1(combined1))
        combined2 = self.norm2(self.dense2(combined2))

        combined = torch.cat((combined1.view(combined1.size(0), -1),
                              combined2.view(combined2.size(0), -1)),
                             dim=1)

        final_output = self.out(combined)
        # Normalize over the class axis explicitly; relying on the implicit
        # softmax dimension is deprecated.
        return torch.softmax(final_output, dim=1)

In [None]:
class F1_Loss(nn.Module):
    """Soft macro-F1 loss: ``1 - mean over classes of F1``.

    True/false positives and false negatives are accumulated from the predicted
    scores themselves rather than from hard decisions, which keeps the loss
    differentiable.

    Args:
        epsilon: Small constant added to denominators and used to clamp the
            per-class F1 into ``[epsilon, 1 - epsilon]``.
    """

    def __init__(self, epsilon=1e-7):
        super().__init__()
        self.epsilon = epsilon

    def forward(self, y_pred, y_true):
        """Compute the loss.

        Args:
            y_pred: ``(batch, num_classes)`` scores. No softmax is applied here,
                so the caller is expected to pass probabilities.
            y_true: ``(batch,)`` integer class indices (``F.one_hot`` requires
                an int64 tensor).

        Returns:
            Scalar tensor ``1 - mean(F1 per class)``.
        """
        assert y_pred.ndim == 2
        assert y_true.ndim == 1
        # Take the class count from the predictions instead of hard-coding 5,
        # so the loss works for any number of classes.
        num_classes = y_pred.size(1)
        y_true = F.one_hot(y_true, num_classes).to(torch.float32)

        # Per-class soft counts, summed over the batch.
        tp = (y_true * y_pred).sum(dim=0).to(torch.float32)
        fp = ((1 - y_true) * y_pred).sum(dim=0).to(torch.float32)
        fn = (y_true * (1 - y_pred)).sum(dim=0).to(torch.float32)

        precision = tp / (tp + fp + self.epsilon)
        recall = tp / (tp + fn + self.epsilon)

        f1 = 2 * (precision * recall) / (precision + recall + self.epsilon)
        f1 = f1.clamp(min=self.epsilon, max=1 - self.epsilon)
        return 1 - f1.mean()


In [None]:
from tqdm import tqdm

# Notebook-level batch size (same value as the batch_size=4 passed to provider() for the test loader).
batch_size = 4
def train_fn(data_loader, model, optimizer, device, scheduler):
    """Run one training epoch with the soft-F1 loss.

    Args:
        data_loader: Yields dict batches with ``"text"`` (pair of tokenizer
            outputs), ``"images"`` (pair of tensors) and ``"labels"``.
        model: Module called as ``model(batch)`` and returning per-class scores.
        optimizer: Stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Accepted for interface compatibility; it is not stepped here.

    Returns:
        ``(mean loss per batch, accuracy)``. Accuracy is correct predictions
        divided by the number of samples actually seen, so a smaller final batch
        or a loader built with a different batch size no longer skews it.
        Both values are 0 for an empty loader.
    """
    model.train()
    criterion = F1_Loss()  # holds no parameters, so it needs no device placement
    running_loss = 0.0
    num_correct = 0
    num_seen = 0
    num_batches = 0
    loader = tqdm(data_loader, total=len(data_loader))
    for data in loader:
        # Move both sides (claim, document) of the batch to the target device.
        for i in range(2):
            for k, v in data["text"][i].items():
                data["text"][i][k] = v.to(device)
            data["images"][i] = data["images"][i].to(device)
        data["labels"] = data["labels"].to(device)

        optimizer.zero_grad()
        op = model(data)
        loss = criterion(op, data["labels"])
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1
        num_seen += data["labels"].size(0)
        num_correct += (op.max(1)[1] == data["labels"]).sum().item()
        loader.set_description(f"loss - {running_loss/num_batches}, acc - {num_correct/num_seen}")
    return running_loss / max(num_batches, 1), num_correct / max(num_seen, 1)

def eval_fn(data_loader, model, device):
    """Evaluate ``model`` on ``data_loader`` with the soft-F1 loss.

    Args:
        data_loader: Yields dict batches with ``"text"`` (pair of tokenizer
            outputs), ``"images"`` (pair of tensors) and ``"labels"``.
        model: Module called as ``model(batch)`` and returning per-class scores.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean loss per batch, accuracy)``. Accuracy is correct predictions
        divided by the number of samples actually seen. Both values are 0 for
        an empty loader.
    """
    model.eval()
    criterion = F1_Loss()
    running_loss = 0.0
    num_correct = 0
    num_seen = 0
    num_batches = 0
    loader = tqdm(data_loader, total=len(data_loader))
    # No gradients are needed for evaluation; skipping them avoids building the
    # autograd graph for every batch.
    with torch.no_grad():
        for data in loader:
            for i in range(2):
                for k, v in data["text"][i].items():
                    data["text"][i][k] = v.to(device)
                data["images"][i] = data["images"][i].to(device)
            data["labels"] = data["labels"].to(device)

            op = model(data)
            loss = criterion(op, data["labels"])

            running_loss += loss.item()
            num_batches += 1
            num_seen += data["labels"].size(0)
            num_correct += (op.max(1)[1] == data["labels"]).sum().item()
            loader.set_description(f"loss - {running_loss/num_batches}, acc - {num_correct/num_seen}")
    return running_loss / max(num_batches, 1), num_correct / max(num_seen, 1)

In [None]:
# Two ensemble members with identical sizes: 256-wide text and image features, 5 classes.
model = BigModel(hidden_layer_size_text=256, hidden_layer_size_image=256, num_classes=5)
model1 = BigModel1(hidden_layer_size_text=256, hidden_layer_size_image=256, num_classes=5)

In [None]:
# Shared run-time settings for the cells below.
best_loss = np.inf
batch_size = 4
# Prefer the GPU, but fall back to the CPU so this cell does not fail on a machine without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_save_path = '.'
model.to(device)
model1.to(device)
print("Variables Defined")

In [None]:
# Split the parameters of `model` into two optimizer groups: parameters whose
# name contains any `no_decay` marker get weight_decay 0.0, all others 0.001.
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {
        "params": [
            param
            for param_name, param in param_optimizer
            if any(marker in param_name for marker in no_decay) == exempt_from_decay
        ],
        "weight_decay": decay,
    }
    # Group order: decayed parameters first, exempt parameters second.
    for exempt_from_decay, decay in ((False, 0.001), (True, 0.0))
]

In [None]:
# num_train_steps = int((len(train_df)*0.8) / 4 * 1)
# optimizer = AdamW(optimizer_parameters, lr=3e-2)
# scheduler = get_linear_schedule_with_warmup(
#         optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
#     )

# Adam over the two parameter groups defined above.
optimizer = optim.Adam(optimizer_parameters, lr=3e-3)

# Multiply the learning rate by 0.33 when the value passed to scheduler.step()
# stops decreasing (mode="min") for longer than `patience` steps.
scheduler = ReduceLROnPlateau(
    optimizer,
    mode="min",
    factor=0.33,
    patience=1,
    verbose=True,
)

In [None]:
# Train `model` for 10 epochs and checkpoint it whenever the validation loss improves.
# NOTE(review): `train_data_loader` and `val_data_loader` are not defined in the
# cells visible here — confirm they are built (e.g. via provider()) before this runs.
# NOTE(review): only `model` is trained here; `model1` is not — confirm this is intended.
best_loss = np.inf
model.to(device)
for epoch in range(10):
    # train_fn / eval_fn switch the model into train / eval mode themselves.
    train_loss, train_acc = train_fn(train_data_loader, model, optimizer, device, scheduler)
    test_loss, test_acc = eval_fn(val_data_loader, model, device)
    # ReduceLROnPlateau is driven by the validation loss, once per epoch.
    scheduler.step(test_loss)
    if test_loss < best_loss:
        best_loss = test_loss
        torch.save(model, model_save_path + "/val_loss:" + str(test_loss) + ",epoch:" + str(epoch) + ".pth")
    print(f"Train Loss = {train_loss} Train Acc = {train_acc} Valid Loss = {test_loss} Valid Acc = {test_acc}")

In [None]:
def test_fn(data_loader, model, device):
    """Predict class indices by summing the outputs of ``model`` and ``model1``.

    Note that the second ensemble member is the notebook-level ``model1``; it is
    read from the enclosing namespace, not passed in.

    Args:
        data_loader: Yields dict batches with ``"text"`` (pair of tokenizer
            outputs) and ``"images"`` (pair of tensors).
        model: First ensemble member, called as ``model(batch)``.
        device: Device the batch tensors are moved to.

    Returns:
        A list with one CPU tensor of predicted class indices per batch.
    """
    model.eval()
    model1.eval()
    out = []
    # Inference only: skip autograd bookkeeping for every batch.
    with torch.no_grad():
        for data in tqdm(data_loader, total=len(data_loader)):
            for i in range(2):
                for k, v in data["text"][i].items():
                    data["text"][i][k] = v.to(device)
                data["images"][i] = data["images"][i].to(device)
            summed = model(data) + model1(data)
            # Tensor.to() returns a new tensor rather than moving in place, so the
            # result has to be kept; storing the indices on the CPU also stops
            # predictions from piling up in GPU memory.
            out.append(torch.max(summed, 1)[1].cpu())
    return out

In [None]:
# Run the two-model ensemble over the test loader on the notebook's configured device.
output = test_fn(test_data_loader, model, device)

In [None]:
# Flatten the per-batch prediction tensors into one flat list of class indices,
# then wrap them in a single-column frame.
predictions = [
    class_index
    for batch_predictions in output
    for class_index in batch_predictions.cpu().detach().numpy()
]
answer = pd.DataFrame(predictions, columns=['Category'])

In [None]:
# Replace integer class indices with their category names.
# Values that are not keys of ind_to_category map to NaN, so this cell should be
# run once per freshly built `answer` frame.
answer["Category"] = answer["Category"].map(ind_to_category)

In [None]:
# Write the predictions to disk; the frame's row index is written as the first column.
answer.to_csv("answer.csv", index=True)