In [1]:
# !pip install transformers

In [1]:
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torchvision.models as models
from transformers import BertTokenizer, BertModel

from sklearn.metrics import accuracy_score,f1_score, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer

import pandas as pd 
import numpy as np
import pickle
import re
import os
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from PIL import Image
from IPython.display import display
import matplotlib as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

# PREPROCESS

In [2]:
# def text_prepocessing(text):
  
#     lemmatizer = WordNetLemmatizer()
#     stop_words=set(stopwords.words('english'))
    
#     x = [ch for ch in text if ch not in string.punctuation]
#     text = ''.join(x)
#     clean = [word.lower() for word in text.split() if word.lower() not in stop_words]
#     text =  ' '.join(clean)
    
#     x = [lemmatizer.lemmatize(word) for word in text.split()]
#     text =  ' '.join(x)
    
#     return text 

In [3]:
# def create_processed_csv():
    
#     df = pd.read_csv('./clean1.csv')
    
#     for i in range(len(df)):
#         text = df.iloc[i]['raw']
#         text = text_prepocessing(text)
#         df.loc[i,'clean'] = text
        
#     df.to_csv('pre-processed.csv')

In [4]:
# create_processed_csv()

In [5]:
# pretrained_resnet = models.resnet152(pretrained=True)

# # Remove the last layer of the model
# modules = list(pretrained_resnet.children())[:-1]
# pretrained_resnet = nn.Sequential(*modules)

# # Set the model to evaluation mode
# pretrained_resnet.eval()

# print("Resnet Loaded !")

In [6]:
# pretrained_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_basic_tokenize=True)
# pretrained_bert = BertModel.from_pretrained('bert-base-uncased',output_hidden_states=True)
# pretrained_bert.eval()
# print("Bert Loaded !")

In [7]:
# def get_image_features(img_path):
    
#     preprocess_image =   transforms.Compose([
#                                   transforms.Resize(256),
#                                   transforms.CenterCrop(224),
#                                   transforms.ToTensor(),
#                                   transforms.Normalize(
#                                     mean=[0.485, 0.456, 0.406],
#                                     std=[0.229, 0.224, 0.225])
#                               ])
        
#     img = Image.open(img_path).convert('RGB')
#     img_tensor = preprocess_image(img)
#     img_tensor = img_tensor.unsqueeze(0)

#     with torch.no_grad():
#         features = pretrained_resnet(img_tensor)

#     feature_tensor = features.squeeze()

#     return feature_tensor

In [8]:
# def get_word_embeddings(text,max_length=49):
    
#     rem_len = max_length - len(text.split())
    
#     for i in range(0,rem_len):
#         text = text + " PAD"

#     marked_text = "CLS " + text + " SEP"
        
#     tokenized_text = pretrained_tokenizer.basic_tokenizer.tokenize(marked_text)
#     indexed_tokens = pretrained_tokenizer.convert_tokens_to_ids(tokenized_text)
#     tokens_tensor = torch.tensor([indexed_tokens])
    
#     with torch.no_grad():
#         outputs = pretrained_bert(tokens_tensor)
#         # can use last hidden state as word embeddings
#         last_hidden_state = outputs[0]
#         word_embed_1 = last_hidden_state
        
#     return word_embed_1

In [9]:
# def save_embeddings():
   
#     df = pd.read_csv('./processed.csv')
#     embeddings_dict = {}

#     for index,text in enumerate(df['clean']):

#         embeddings = get_word_embeddings(text)
#         embeddings_dict[index] = embeddings

    # for key in embeddings.keys():
#     embeddings[key] = embeddings[key].flatten()

#     with open('text_embeddings.pickle', 'wb') as handle:
#         pickle.dump(embeddings_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [10]:
# def save_image_features():

#     BASE_PATH = './sentiment/sentiment_images/'
    
#     img_feature_dict = {}
#     for file_name in os.listdir(BASE_PATH):
#         img_path = BASE_PATH + file_name
#         img_tensor = get_image_features(img_path)
#         img_feature_dict[file_name] = img_tensor

#     with open('img_features.pickle', 'wb') as handle:
#         pickle.dump(img_feature_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
# def get_max_length():
    
#     df = pd.read_csv("./processed.csv")
#     df = df[['clean']]
    
#     max_len = 0
    
#     for i,text in enumerate(df['clean']):
#         max_len = max(max_len,len(text.split()))
        
#     return max_len

In [12]:
# get_max_length()

In [38]:
with open('text_embeddings.pickle', 'rb') as handle:
    embeddings = pickle.load(handle)

with open('img_features.pickle', 'rb') as handle:
    img_features = pickle.load(handle)
    
# with open('normalized_embeddings.pkl', 'rb') as f:
#     normalized_embeddings = pickle.load(f)

In [14]:
# def get_n():
    
#     with open('text_embeddings.pickle', 'rb') as handle:
#         embeddings = pickle.load(handle)
    
#     e = torch.stack(list(embeddings.values()))
#     print(e[0])
#     mean = torch.mean(e,dim=0)
#     std = torch.std(e,dim=0)
#     normalized = (e-mean)/std
    
#     with open('normalized_embeddings.pkl', 'wb') as f:
#         pickle.dump(n, f)

In [15]:
#get_n()

In [16]:
class Custom_Dataset(Dataset):

    def __init__(self, csv_file, train = True, val = False):
        
        self.df = pd.read_csv(csv_file) #index_col = 0
        
        if(val):
            self.df = self.df[self.df['split']=='val']
            
        elif(train):
            self.df = self.df[self.df['split']=='train']
       
        else:
            self.df = self.df[self.df['split']=='test']
            
        cond = self.df[self.df['successful'] == 0].index
        self.df.drop(cond,inplace=True)
        
    def __len__(self):
        return len(self.df)

    def get_image_tensor(self,img_name):
            
        return img_features[img_name]
    
    def get_text_tensor(self,index):
            
        return embeddings[index]

    def __getitem__(self, index):

        img_name = self.df.iloc[index]['filename']
        img_tensor = self.get_image_tensor(img_name)
        
        sentiment = torch.tensor([self.df.iloc[index]['sentiment']],dtype=torch.float32)

        text_tensor = self.get_text_tensor(index)

        return img_tensor, text_tensor, sentiment

In [16]:
class NeuralNetwork(nn.Module):

    def __init__(self,img_input, text_input, output_num, device="cpu"):
        super().__init__()
        
        self.device = device

        self.img_network = nn.Sequential(
            
            nn.Linear(in_features = img_input, out_features = 1300,device=device),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(in_features = 1300, out_features = 850,device=device),
            nn.ReLU(),
            nn.Linear(in_features = 850, out_features = 300,device=device),
            nn.ReLU(),
            nn.Linear(in_features = 300, out_features = 100,device=device)
        )

        self.text_network = nn.Sequential(
            nn.Linear(in_features = text_input, out_features = 25000,device=device),
            nn.ReLU(),
            nn.Linear(in_features = 25000, out_features = 10000,device=device),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(in_features = 10000, out_features = 4000,device=device),
            nn.ReLU(),
            nn.Linear(in_features = 4000, out_features = 1000,device=device),
            nn.ReLU(),
            nn.Linear(in_features = 1000, out_features = 400,device=device)
        )

        self.final_network = nn.Sequential(
            nn.Linear(in_features = 500, out_features = 350,device=device),
            nn.ReLU(),
            nn.Linear(in_features = 350, out_features = 100,device=device),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(in_features = 100, out_features = 50,device=device),
            nn.ReLU(),
            nn.Linear(in_features = 50, out_features = 1,device=device),
            nn.Sigmoid()
        )
        

    def forward(self, img,text):
        x_img = self.img_network(img)
        x_text = self.text_network(text)

        combined_tensor = torch.concat((x_img,x_text), dim = 0) 
        output = torch.tensor(self.final_network(combined_tensor))

        return output

In [21]:
training_data = Custom_Dataset(csv_file='./processed.csv', train = True)
# testing_data = Custom_Dataset(csv_file='./processed.csv',train = False)

In [22]:
train_dataloader = DataLoader(training_data, batch_size=32, shuffle=True)
# test_dataloader = DataLoader(testing_data, batch_size=32, shuffle=False)

In [17]:
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

network = NeuralNetwork(2048,39168,1,device)

# network.load_state_dict(torch.load('./sentence_level_model.pt'))
# network.eval()

In [18]:
print(network)

NeuralNetwork(
  (img_network): Sequential(
    (0): Linear(in_features=2048, out_features=1300, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=1300, out_features=850, bias=True)
    (4): ReLU()
    (5): Linear(in_features=850, out_features=300, bias=True)
    (6): ReLU()
    (7): Linear(in_features=300, out_features=100, bias=True)
  )
  (text_network): Sequential(
    (0): Linear(in_features=39168, out_features=25000, bias=True)
    (1): ReLU()
    (2): Linear(in_features=25000, out_features=10000, bias=True)
    (3): ReLU()
    (4): Dropout(p=0.5, inplace=False)
    (5): Linear(in_features=10000, out_features=4000, bias=True)
    (6): ReLU()
    (7): Linear(in_features=4000, out_features=1000, bias=True)
    (8): ReLU()
    (9): Linear(in_features=1000, out_features=400, bias=True)
  )
  (final_network): Sequential(
    (0): Linear(in_features=500, out_features=350, bias=True)
    (1): ReLU()
    (2): Linear(in_features=350, out_feature

In [19]:
for name, param in network.named_parameters():
    print(f"Parameter {name}, shape {param.shape}")

Parameter img_network.0.weight, shape torch.Size([1300, 2048])
Parameter img_network.0.bias, shape torch.Size([1300])
Parameter img_network.3.weight, shape torch.Size([850, 1300])
Parameter img_network.3.bias, shape torch.Size([850])
Parameter img_network.5.weight, shape torch.Size([300, 850])
Parameter img_network.5.bias, shape torch.Size([300])
Parameter img_network.7.weight, shape torch.Size([100, 300])
Parameter img_network.7.bias, shape torch.Size([100])
Parameter text_network.0.weight, shape torch.Size([25000, 39168])
Parameter text_network.0.bias, shape torch.Size([25000])
Parameter text_network.2.weight, shape torch.Size([10000, 25000])
Parameter text_network.2.bias, shape torch.Size([10000])
Parameter text_network.5.weight, shape torch.Size([4000, 10000])
Parameter text_network.5.bias, shape torch.Size([4000])
Parameter text_network.7.weight, shape torch.Size([1000, 4000])
Parameter text_network.7.bias, shape torch.Size([1000])
Parameter text_network.9.weight, shape torch.Size

In [21]:
optimizer = torch.optim.SGD(network.parameters(), lr=0.001)
# optimizer = torch.optim.Adam(network.parameters(),lr=0.001)
loss_fn = nn.BCELoss()

In [23]:
epoch_loss = 0

for epoch in range(0,30):
    image_batch , text_batch , target_batch = next(iter(train_dataloader))
    
    for image,text,target in zip(image_batch,text_batch,target_batch):
        
        out = network(image.to(device),text.to(device))
        
        out.requires_grad_(True)

        loss = loss_fn(out, target)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        
    print(f'Epoch No: {epoch+1}, running loss: {epoch_loss}')
    print(f'Epoch No: {epoch+1}, average loss: {epoch_loss/32}')
        
    epoch_loss = 0  

Epoch No: 1, running loss: 32.29554855823517
Epoch No: 1, average loss: 0.696735892444849
Epoch No: 2, running loss: 32.650769114494324
Epoch No: 2, average loss: 0.7078365348279476
Epoch No: 3, running loss: 32.237040996551514
Epoch No: 3, average loss: 0.6949075311422348
Epoch No: 4, running loss: 31.715420126914978
Epoch No: 4, average loss: 0.7098568789660931
Epoch No: 5, running loss: 31.86950546503067
Epoch No: 5, average loss: 0.7146720457822084
Epoch No: 6, running loss: 32.148109197616577
Epoch No: 6, average loss: 0.692128412425518
Epoch No: 7, running loss: 31.476314902305603
Epoch No: 7, average loss: 0.7023848406970501
Epoch No: 8, running loss: 29.367132782936096
Epoch No: 8, average loss: 0.698972899466753
Epoch No: 9, running loss: 29.113429069519043
Epoch No: 9, average loss: 0.6910446584224701
Epoch No: 10, running loss: 28.37119024991989
Epoch No: 10, average loss: 0.6990996953099966
Epoch No: 11, running loss: 28.28608077764511
Epoch No: 11, average loss: 0.69644002

In [24]:
# torch.save(network.state_dict(), "sentence_level_model.pt")

In [26]:
gt = []
pred= []

for epoch in range(0,50):
    image_batch , text_batch , target_batch = next(iter(test_dataloader))
    
    for image,text,target in zip(image_batch,text_batch,target_batch):
        
        out = network(image.to(device),text.to(device))
        
        if(out.item() >=0.5):
            pred.append(1.0)
        else: 
            pred.append(0.0)
            
        gt.append(target.item())
        
    print(f'Epoch No: {epoch+1}')

Epoch No: 1
Epoch No: 2
Epoch No: 3
Epoch No: 4
Epoch No: 5
Epoch No: 6
Epoch No: 7
Epoch No: 8
Epoch No: 9
Epoch No: 10



In [27]:
print(f'ACCURACY: {accuracy_score(gt,pred) * 100} ')
print(f'PRECISION: {precision_score(gt,pred) * 100}')
print(f'RECALL: {recall_score(gt,pred) * 100}')
print(f'F1 SCORE: {f1_score(gt,pred) * 100}')

ACCURACY: 74.125 
PRECISION: 73.125
RECALL: 100.0
F1 SCORE: 85.38775510204081


# TESTING

In [28]:
gt=[]
pred=[]

for i in range(len(testing_data)):
    
    img, text, label = testing_data[i]
    out = network(img,text)
    
    if(out.item() >= 0.5):
        pred.append(1.0)
    else:
        pred.append(0.0)
    
    gt.append(label.item())

In [29]:
print(f'ACCURACY: {accuracy_score(gt,pred) * 100} ')
print(f'PRECISION: {precision_score(gt,pred) * 100}')
print(f'RECALL: {recall_score(gt,pred) * 100}')
print(f'F1 SCORE: {f1_score(gt,pred) * 100}')

ACCURACY: 75.125 
PRECISION: 73.789
RECALL: 100.0
F1 SCORE: 86.38775510204081


In [None]:
# out= network(torch.stack(test_img_feat),torch.stack(test_txt_feat))