In [None]:
# Mount Google Drive inside the Colab VM so the project files stored on Drive
# are reachable under /content/drive. force_remount=True re-mounts even when a
# mount from an earlier run of this cell is still active.
from google.colab import drive

drive.mount(mountpoint='/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Switch the working directory to the project folder on the mounted Drive.
# Every relative path used further down (requirements.txt, ../emogen/...) is
# resolved against this directory, so this cell must run before them.
%cd '/content/drive/MyDrive/685/catr/'

/content/drive/.shortcut-targets-by-id/12c1zkm0_oa8VcOfsUa8Tn_YcYdyPpSlP/685/catr


In [None]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.1.0.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 5.0 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 16.9 MB/s 
[?25hCollecting tokenizers>=0.10.3
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 92.5 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 65.4 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 608 kB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |█████████████████████

In [None]:
# Install the project's dependencies. requirements.txt is a relative path, so
# this relies on the working directory having been set by the %cd cell above.
!pip install -r requirements.txt



In [None]:
import argparse
import glob
import json
import os

import numpy as np
import pandas as pd
import torch
import xgboost as xgb
from PIL import Image
from sentence_transformers import SentenceTransformer
from sklearn.metrics import auc, precision_score, recall_score,roc_auc_score
from sklearn.model_selection import KFold
from tqdm import tqdm
from transformers import ViTModel, ViTConfig, ViTFeatureExtractor,BertTokenizer,BertForMaskedLM
from transformers import AutoFeatureExtractor, DeiTModel
from xgboost import XGBClassifier

from configuration import Config
from datasets import coco, utils
from models import caption

In [None]:
def extract_vision_transformer_feats(image_path,n_select=None,batch_size=32):
  """
  Extract one [CLS] embedding per image with a pretrained DeiT encoder.

  Parameters
  ----------
  image_path : str
      Path prefix; every file matching ``image_path + "*"`` is treated as an
      image (pass a directory path ending in "/").
  n_select : int, optional
      Keep only the first ``n_select`` files of the sorted listing.
      ``None`` (or 0) keeps every file.
  batch_size : int, optional
      Number of images sent through the encoder per forward pass.

  Returns
  -------
  pandas.DataFrame
      One row per image with columns ``imdim_0 .. imdim_{d-1}`` (``d`` is the
      width of the encoder output) plus ``img_hash``, the file name without
      its extension. When no file matches, an empty frame holding only the
      ``img_hash`` column is returned and no model is loaded.

  Raises
  ------
  ValueError
      If ``batch_size`` is smaller than 1.
  """
  if batch_size < 1:
    raise ValueError("batch_size must be a positive integer")

  # glob returns files in arbitrary order; sort so the n_select subset is the
  # same on every run.
  img_files_list = sorted(glob.glob(image_path+"*"))
  if n_select:
    img_files_list = img_files_list[:n_select]

  # Nothing to embed: return early instead of handing an empty batch to the model.
  if not img_files_list:
    return pd.DataFrame(columns=['img_hash'])

  # Create image batch array for Vision Transformer
  img_batch = []
  for file in tqdm(img_files_list):
    # The context manager closes the file handle; convert("RGB") gives every
    # image three channels regardless of its on-disk mode.
    with Image.open(file) as img:
      img_batch.append(np.asarray(img.convert("RGB")))
  print("Creation of image batches to be used for Vision Transformer complete")

  # Load the bare encoder (DeiTModel): unlike the classification-head variant,
  # its output exposes last_hidden_state.
  checkpoint        = 'facebook/deit-small-distilled-patch16-224'
  feature_extractor = AutoFeatureExtractor.from_pretrained(checkpoint)
  model             = DeiTModel.from_pretrained(checkpoint)
  model.eval()

  # Run inference in chunks and without autograd so memory use stays bounded.
  cls_chunks = []
  with torch.no_grad():
    for start in range(0, len(img_batch), batch_size):
      inputs  = feature_extractor(images=img_batch[start:start+batch_size], return_tensors="pt")
      outputs = model(**inputs)
      # Token 0 of the sequence is the [CLS] representation of each image.
      cls_chunks.append(outputs.last_hidden_state[:,0,:])
  img_representations = torch.cat(cls_chunks, dim=0).numpy()

  # File name without directory and extension (works for any extension length).
  img_hash_li = [os.path.splitext(os.path.basename(file))[0] for file in img_files_list]

  # Column count follows the actual embedding width rather than a fixed 768.
  col_img = ["imdim_"+str(i) for i in range(img_representations.shape[1])]

  # Create a dataframe of image features
  img_data = pd.DataFrame(img_representations,columns = col_img)
  img_data['img_hash'] = img_hash_li

  return img_data

def extract_sentence_transformer_feats(reference_file_pth = '../emogen/Classifier/train/caption/',csv_file = 'train_caption.csv'):
  """
  Embed the caption text of every row of a CSV with a sentence transformer.

  Parameters
  ----------
  reference_file_pth : str
      Directory that contains the caption CSV.
  csv_file : str
      Name of the CSV file. It must provide the columns ``comment`` (text to
      embed), ``image_hash`` and ``label``.

  Returns
  -------
  pandas.DataFrame
      One row per CSV row with columns ``tdim_0 .. tdim_{d-1}`` (``d`` is the
      embedding width returned by the model), ``img_hash`` (copied from
      ``image_hash``) and ``label``. For a CSV without rows, an empty frame
      holding only ``img_hash`` and ``label`` is returned.

  Raises
  ------
  KeyError
      If one of the required columns is missing from the CSV.
  """
  # Get reference text data
  combined_path = os.path.join(reference_file_pth, csv_file)
  data = pd.read_csv(combined_path)

  # Drop the stray 'Unnamed: 0' column when present (presumably an index that
  # was written out with the CSV); errors='ignore' replaces the bare except.
  data = data.drop(columns=['Unnamed: 0'], errors='ignore')

  missing = [col for col in ('comment', 'image_hash', 'label') if col not in data.columns]
  if missing:
    raise KeyError("Missing required column(s) in " + combined_path + ": " + ", ".join(missing))

  # Nothing to embed: skip loading the model.
  if data.empty:
    return pd.DataFrame(columns=['img_hash', 'label'])

  # Use Sentence Transformer to extract features; pass a plain list so the
  # encoder never depends on the pandas index.
  model = SentenceTransformer('all-mpnet-base-v2')
  sentence_embeddings = np.asarray(model.encode(data['comment'].tolist()))

  # Column count follows the actual embedding width rather than a fixed 768.
  col_text = ["tdim_"+str(i) for i in range(sentence_embeddings.shape[1])]

  # Create text feature dataframe; to_numpy() copies values positionally so
  # rows stay aligned with the embeddings.
  text_data = pd.DataFrame(sentence_embeddings,columns=col_text)
  text_data['img_hash'] = data['image_hash'].to_numpy()
  text_data['label'] = data['label'].to_numpy()

  return text_data

def xgb_train_kfold(X_trn, y_trn,n_splits=5):
  """
  Train an XGBoost classifier under shuffled K-fold cross-validation and
  score each held-out fold.

  Parameters
  ----------
  X_trn : pandas.DataFrame or array-like
      Feature matrix, one row per sample.
  y_trn : pandas.Series or array-like
      Binary targets (0/1) aligned with the rows of ``X_trn``.
  n_splits : int, optional
      Number of folds.

  Returns
  -------
  tuple of four lists
      ``(errors, precision, recall, auc)`` with one entry per fold, each
      measured on that fold's held-out part: error rate (1 - accuracy),
      precision, recall and ROC-AUC.
  """
  # np.asarray accepts DataFrames/Series as well as plain arrays.
  features = np.asarray(X_trn)
  targets  = np.asarray(y_trn)

  errors     = []
  precision  = []
  recall     = []
  # Not named `auc`, to avoid shadowing sklearn.metrics.auc inside the function.
  auc_scores = []
  kf = KFold(n_splits=n_splits, shuffle=True, random_state=3815)

  for train_index, test_index in kf.split(features):
    X_train_n, X_test_n = features[train_index], features[test_index]
    y_train_n, y_test_n = targets[train_index], targets[test_index]

    model = XGBClassifier(
        max_depth=4, n_estimators=300, random_state = 3815
    )
    model.fit(X_train_n, y_train_n)

    y_pred  = model.predict(X_test_n)
    # ROC-AUC ranks samples by score, so it needs the positive-class
    # probability, not the thresholded 0/1 prediction.
    y_score = model.predict_proba(X_test_n)[:, 1]

    errors.append(1 - np.mean(y_pred == y_test_n))
    precision.append(precision_score(y_test_n,y_pred))
    recall.append(recall_score(y_test_n,y_pred))
    auc_scores.append(roc_auc_score(y_test_n,y_score))

  return errors, precision, recall, auc_scores

In [None]:
img_data = extract_vision_transformer_feats(image_path= '../emogen/Classifier/train/images/',n_select=1250)
text_data = extract_sentence_transformer_feats()

# Build the multimodal dataset. An inner join keeps only rows that have both
# image and text features; an outer join would add rows whose label is NaN,
# which cannot be converted to int below.
# NOTE(review): if one image has several captions it appears in several rows
# and can land in both the train and test part of a fold - confirm whether a
# grouped split is needed.
mmodal_data = img_data.merge(text_data,how='inner',on='img_hash')
assert len(mmodal_data) > 0, "No image hash matched a caption row; check both input paths"

labels = mmodal_data['label'].astype(int).tolist()
mmodal_data['Target']  = labels

y_trn = mmodal_data['Target']
X_trn = mmodal_data.drop(columns=['label','Target','img_hash'])
errors, precision, recall, auc = xgb_train_kfold(X_trn,y_trn)

# Check average performances across K-Folds
print("The training errors on average is: ", np.round(np.mean(errors),4)*100)
print("The Precision on average is: ", np.round(np.mean(precision),4)*100)
print("The Recall on average is: ", np.round(np.mean(recall),4)*100)
print("The AUC Score on average is: ", np.round(np.mean(auc),4)*100)

  6%|▌         | 76/1250 [01:18<20:11,  1.03s/it]  
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 2882, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-7-a583aaaaac62>", line 1, in <module>
    img_data = extract_vision_transformer_feats(image_path= '../emogen/Classifier/train/images/',n_select=1250)
  File "<ipython-input-6-91504a7a5b2a>", line 13, in extract_vision_transformer_feats
    img = np.asarray(Image.open(file))
  File "/usr/local/lib/python3.7/dist-packages/PIL/Image.py", line 2852, in open
    prefix = fp.read(16)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 1823, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, another ex

KeyboardInterrupt: ignored

In [None]:
# def xgb_train_kfold(X_trn, y_trn,n_splits=5):
#   """
#   Perform training with XGBoost and evaluate in K-Fold cross-validation settings 
#   """
#   errors    = []
#   precision = []
#   recall    = []
#   auc       = []
#   kf = KFold(n_splits=n_splits, shuffle=True, random_state=3815)

#   for train_index, test_index in kf.split(X_trn):
#      X_train_n, X_test_n = X_trn.values[train_index], X_trn.values[test_index]
#      y_train_n, y_test_n = y_trn.values[train_index], y_trn.values[test_index]

#      model = XGBClassifier(
#          max_depth=4, n_estimators=100, random_state = 3815
#      )
#      model.fit(X_train_n, y_train_n)
#      y_pred = model.predict(X_test_n)
#      accuracy = (sum(y_pred == y_test_n))/len(y_test_n)
#      errors.append(1 - accuracy)
#      precision.append(precision_score(y_test_n,y_pred))
#      recall.append(recall_score(y_test_n,y_pred))
#      auc.append(roc_auc_score(y_test_n,y_pred))

#   return errors, precision, recall, auc


# Assign the image folder, image path, and model checkpoint path
# image_path = '../emogen/train/'
# #img_name   = '0c4ff62871a904274fd41cee695d85f.jpg'
# #img_final_loc = image_path+img_name
# # Load image and perform pre-processing 
# #image = Image.open(img_final_loc)

# img_files_list = glob.glob(image_path+"*")[:50] # For test run 

# img_batch = []
# for file in tqdm(img_files_list):
#   img = np.asarray(Image.open(file))
#   img_batch.append(img)

# # Extract features
# feature_extractor   = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
# model               = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
# inputs              = feature_extractor(images=img_batch, return_tensors="pt")
# outputs             = model(**inputs)

# # Get image representations and their corresponding hashes 
# img_representations = outputs.last_hidden_state[:,0,:]
# img_hash_li         = []
# for file in img_files_list:
#   img_hash_li.append(file.split("/")[-1][:-4])

# # Get reference text data
# reference_file_pth = '../emogen/annotations/'
# json_file = 'captions_train.json'
# combined_path = reference_file_pth + json_file

# file = open(combined_path)
# data_json = json.load(file)
# data = pd.DataFrame.from_dict(data_json['annotations']).reset_index(drop=True)
# try:
#   del data['id']
# except:
#   pass
# model = SentenceTransformer('all-mpnet-base-v2')
# sentence_embeddings = model.encode(data['caption'])

# # Create column names 
# col_text = ["tdim_"+str(i) for i in list(range(768))]
# col_img  = ["imdim_"+str(i) for i in list(range(768))]
# # Create DataFrame for text features  
# text_data = pd.DataFrame(sentence_embeddings,columns=col_text)
# text_data['img_hash'] = data['image_id']
# # Create a dataframe of image features
# img_data = pd.DataFrame(img_representations.detach().numpy(),columns = col_img)
# img_data['img_hash'] = img_hash_li

# # Concatenate data to form the final dataset with multimodal features
# mmodal_data = img_data.merge(text_data,how='left',on='img_hash')

# # Create pseudo labels for interim 
# pseudo_labels = [np.random.randint(0,2) for i in range(len(mmodal_data))]
# mmodal_data['Target']  = pseudo_labels

# y_trn = mmodal_data['Target']
# X_trn = mmodal_data.drop(columns=['Target','img_hash'],axis=1)

# errors, precision, recall, auc = xgb_train_kfold(X_trn,y_trn)

# # Check average performances across K-Folds
# print("The training errors on average is: ", np.round(np.mean(errors),4)*100)
# print("The Precision on average is: ", np.round(np.mean(precision),4)*100)
# print("The Recall on average is: ", np.round(np.mean(recall),4)*100)
# print("The AUC Score on average is: ", np.round(np.mean(auc),4)*100)