<a href="https://colab.research.google.com/github/Satwikram/NLP-Implementations/blob/main/Multimodal/Multimodal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Author: Satwik Ram K

### Setup

In [None]:
!pip install transformers

!pip install pytesseract
# !pip install pdf2image
!apt-get install poppler-utils 
!apt install tesseract-ocr
# !pip install keras_nlp

### Importing Dependencies

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import string
import re

import os
from pathlib import Path

import matplotlib.pyplot as plt
import cv2
import tensorflow as tf

from tensorflow.keras.layers import Dense, Input, Flatten, Conv2D, MaxPool2D, GlobalAveragePooling1D, BatchNormalization, Embedding, Bidirectional, LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, TensorBoard
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# import keras_nlp

from sklearn.model_selection import train_test_split

import spacy
from unicodedata import normalize

from tqdm import tqdm

import pytesseract
from PIL import Image

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

import plotly.express as px

### Gathering Dataset

In [None]:
!unzip /content/drive/MyDrive/Deceptive-Research/dataset.zip

### Global Variables

In [3]:
img_shape = (224, 224)
deceptive_path = Path("/content/deceptive")
normal_path = Path("/content/Ads/")
uniq_labels = ["Non Deceptive", "Deceptive"]

### Cleaning the Data

In [4]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [5]:
def clean_text(text):

    text = normalize("NFKD", text) #Normalization

    text = re.sub(r"[^\w\s]","", text) #Remove Punc

    text = " ".join([token.lemma_ for token in nlp(text) if not token.is_stop])

    text = re.sub("\s+", " ", text)

    text = text.strip()

    return text

In [6]:
def clean_img(fname):

  img = cv2.imread(fname)

  img = cv2.resize(img, img_shape) 

  # Normalization
  img = img/255.0

  return img

### OCR - Image to Text

In [7]:
def get_data(image):

  txt = pytesseract.image_to_string(Image.open(image), lang="eng")
  txt = re.sub("[\n]{2,}", "\t\t", txt)
  txt = re.sub("\n", "", txt)
  txt = re.sub("\t\t", "\n", txt)

  return txt

### Image Extraction

In [8]:
def extract_image(path, target):

  X1 = []
  X2 = []
  y = []

  for img in os.listdir(path):

    _, tail = os.path.splitext(img)

    if tail in [".jpg", ".jpeg", ".png"]:

      try:
        
        fname = f"{path}/{img}"

        # OCR - Image to Text        
        text = get_data(fname)

        # Cleaning the text
        text = clean_text(text)

        if not text:
          text = "No Information"
          print(fname)

        # Cleaning the Image
        img = clean_img(fname)

        X1.extend([img])
        X2.extend([text])
        y.extend([target])

      except Exception as e: print(f"Exception: {e}")

  return X1, X2, y

In [None]:
X1, X2, y = extract_image(deceptive_path, 1)

In [10]:
for i in os.listdir(normal_path):
  X1_t, X2_t, y_t = extract_image(f"{normal_path}/{i}", 0)
  X1.extend(X1_t)
  X2.extend(X2_t)
  y.extend(y_t)

/content/Ads/15/13.png
/content/Ads/3/11.png


In [11]:
X1 = np.array(X1)
# X2 = np.array(X2)

y = np.array(y, dtype="float32")

In [12]:
X1.shape, y.shape

((392, 224, 224, 3), (392,))

In [13]:
len_X2 = [len(x.split()) for x in X2]

In [14]:
px.box(len_X2)

### Tokenization

In [15]:
checkpoint = "bert-base-uncased"
# checkpoint = "gpt2"
sequence_length = 100

def tokenize(samples):

  tokenizer = AutoTokenizer.from_pretrained(checkpoint)

  if checkpoint == "gpt2" and tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

  tokens = tokenizer(
      samples,
      max_length=sequence_length,
      truncation=True,
      padding="max_length",
      add_special_tokens=True,
      return_tensors="np"
  )

  return {"input_ids": tokens["input_ids"].tolist(), "attention_mask": tokens["attention_mask"].tolist()}

In [16]:
X_tokenized = pd.DataFrame(tokenize(X2), columns=["input_ids", "attention_mask"])

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

### Splitting Data into Train/Test

In [17]:
X_tokenized_train, X_tokenized_test, X_img_train, X_img_test, y_train, y_test = train_test_split(X_tokenized, X1, y, test_size=0.2, random_state=42, shuffle=True, stratify=y)

In [18]:
X_tokenized_train

Unnamed: 0,input_ids,attention_mask
217,"[101, 4770, 2795, 5096, 7479, 10760, 16869, 10...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
308,"[101, 2489, 7829, 7479, 13278, 9692, 9006, 248...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ..."
103,"[101, 4497, 8913, 1053, 2669, 2080, 1052, 1034...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ..."
15,"[101, 2053, 2592, 102, 0, 0, 0, 0, 0, 0, 0, 0,...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
230,"[101, 2184, 12849, 10264, 2102, 2192, 5906, 37...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ..."
...,...,...
100,"[101, 9686, 17443, 1023, 2243, 2290, 24707, 12...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
273,"[101, 3046, 12882, 2382, 2154, 2489, 3711, 231...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
276,"[101, 3338, 2919, 3143, 2186, 11387, 16068, 23...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
277,"[101, 10036, 22477, 14154, 9938, 2860, 2860, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


### Unzipping the IDs and Masks 

In [19]:
unzip_x = lambda x: [np.vstack(x["input_ids"]), np.vstack(x["attention_mask"])]

X_tokenized_train, X_tokenized_test = unzip_x(X_tokenized_train), unzip_x(X_tokenized_test)

In [20]:
X_train = [X_img_train, X_tokenized_train]
X_test = [X_img_test, X_tokenized_test]

### Building the model

In [25]:
def build_model(img_shape, targets, checkpoint, sequence_length):

  # Loading the pre-trained Resnet model
  base_model_img = tf.keras.applications.ResNet50(input_shape=img_shape, include_top=False, weights="imagenet")
  
  # Loading the pre-trained BERT model
  base_model_txt = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)

  # Freezing the base model
  base_model_img.trainable = False

  input_ids = Input(shape=(sequence_length,), name="input_ids", dtype="int32")
  attenion_mask = Input(shape=(sequence_length,), name="attention_mask", dtype="int32")

  if checkpoint == "gpt2": x1 = base_model_txt.transformer(input_ids, attention_mask=attenion_mask)[0]
  else: x1 = base_model_txt.bert(input_ids, attention_mask=attenion_mask)[1]

  # Defining the custom head for our neural network
  global_average_layer = tf.keras.layers.GlobalAveragePooling2D()(base_model_img.output)

  x1 = Flatten()(x1)

  # Concat the results

  concat_vec = tf.keras.layers.concatenate([x1, global_average_layer], name="Concat")
  # x = tf.concat([x1, global_average_layer], axis=-1, name="Concat")

  # # x = tf.keras.layers.Reshape(target_shape=((1, x.shape) + (1,)), name="Reshaping")(x)
  # expanded_concat = tf.keras.layers.Reshape((concat_vec.shape[1], 1))(concat_vec)

  # print(expanded_concat)
  
  # # Encoder Layer
  # x = tf.keras.layers.LSTM(128, return_sequences=True, return_state=True)(expanded_concat)
  # x = tf.keras.layers.LSTM(128, return_sequences=True, return_state=True)(x)

  # # Squeeze
  # x = tf.squeeze(x, axis=-1)  

  # Output layer
  units = len(np.unique(targets))

  if units > 2:
      activation = "softmax"
      loss = "sparse_categorical_crossentropy"
  
  else:
      activation = "sigmoid"
      loss = "binary_crossentropy"
      units = units - 1
  
  outputs = Dense(units=units, activation=activation)(concat_vec)

  model = Model(inputs=[base_model_img.input, input_ids, attenion_mask], outputs=outputs)

  # Compiling the model
  model.compile(optimizer="adam", loss=loss, metrics=["accuracy"])

  # Model Architecture Export
  tf.keras.utils.plot_model(model, to_file='model.png', show_shapes=True, 
                            show_dtype=True, show_layer_names=True, rankdir='TB',
                            expand_nested=True, dpi=300, layer_range=None, 
                            show_layer_activations=True)

  
  return model

In [26]:
model = build_model((224, 224, 3), y, checkpoint, sequence_length)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.437086 to fit



### Callbacks

In [27]:
def callbacks() -> list:

    run_name = "run 1"
    save_path = Path("/models")
    os.makedirs(save_path/"logs", exist_ok=True)
    
    checkpoint = ModelCheckpoint(save_path, monitor="val_loss", save_best_only=True, 
                                                    verbose=1)

    earlystopping = EarlyStopping(monitor="val_loss", verbose=1, restore_best_weights = True,
                                                    patience=5)

    logger = TensorBoard(save_path/"logs"/run_name, histogram_freq=2, write_graph=True, write_images=True)

    lr = ReduceLROnPlateau(monitor="val_loss", factor=0.1, patience=2, verbose=1,
                                         min_delta=0.0001, cooldown=0, min_lr=0)
    
    return [checkpoint, earlystopping, lr, logger]

In [28]:
#  !sudo rm -rf /content/drive/MyDrive/Deceptive-Research/models

### Model Training

In [None]:
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=8, callbacks=callbacks())

Epoch 1/100
Epoch 1: val_loss improved from inf to 0.86203, saving model to /models




Epoch 2/100
Epoch 2: val_loss improved from 0.86203 to 0.52592, saving model to /models




Epoch 3/100
Epoch 3: val_loss did not improve from 0.52592
Epoch 4/100
Epoch 4: val_loss improved from 0.52592 to 0.49075, saving model to /models




Epoch 5/100
Epoch 5: val_loss improved from 0.49075 to 0.47710, saving model to /models




Epoch 6/100
Epoch 6: val_loss did not improve from 0.47710
Epoch 7/100
Epoch 7: val_loss did not improve from 0.47710

Epoch 7: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 8/100
Epoch 8: val_loss improved from 0.47710 to 0.45395, saving model to /models




Epoch 9/100
Epoch 9: val_loss improved from 0.45395 to 0.45102, saving model to /models




Epoch 10/100
Epoch 10: val_loss did not improve from 0.45102
Epoch 11/100
Epoch 11: val_loss improved from 0.45102 to 0.44909, saving model to /models




Epoch 12/100
Epoch 12: val_loss did not improve from 0.44909
Epoch 13/100
Epoch 13: val_loss improved from 0.44909 to 0.44678, saving model to /models




Epoch 14/100
Epoch 14: val_loss did not improve from 0.44678
Epoch 15/100
Epoch 15: val_loss improved from 0.44678 to 0.44475, saving model to /models




Epoch 16/100
Epoch 16: val_loss improved from 0.44475 to 0.44368, saving model to /models




Epoch 17/100
Epoch 17: val_loss improved from 0.44368 to 0.44351, saving model to /models




Epoch 18/100
Epoch 18: val_loss improved from 0.44351 to 0.44142, saving model to /models




Epoch 19/100
Epoch 19: val_loss improved from 0.44142 to 0.44019, saving model to /models




Epoch 20/100
Epoch 20: val_loss improved from 0.44019 to 0.43907, saving model to /models




Epoch 21/100
Epoch 21: val_loss improved from 0.43907 to 0.43809, saving model to /models




Epoch 22/100
Epoch 22: val_loss did not improve from 0.43809
Epoch 23/100
Epoch 23: val_loss improved from 0.43809 to 0.43746, saving model to /models




Epoch 24/100
Epoch 24: val_loss improved from 0.43746 to 0.43490, saving model to /models




Epoch 25/100
Epoch 25: val_loss did not improve from 0.43490
Epoch 26/100
Epoch 26: val_loss improved from 0.43490 to 0.43234, saving model to /models




Epoch 27/100
Epoch 27: val_loss improved from 0.43234 to 0.43189, saving model to /models




Epoch 28/100
Epoch 28: val_loss did not improve from 0.43189
Epoch 29/100
Epoch 29: val_loss improved from 0.43189 to 0.42927, saving model to /models




Epoch 30/100
Epoch 30: val_loss improved from 0.42927 to 0.42794, saving model to /models




Epoch 31/100
Epoch 31: val_loss improved from 0.42794 to 0.42672, saving model to /models




Epoch 32/100
Epoch 32: val_loss improved from 0.42672 to 0.42570, saving model to /models




Epoch 33/100
Epoch 33: val_loss improved from 0.42570 to 0.42487, saving model to /models




Epoch 34/100
Epoch 34: val_loss improved from 0.42487 to 0.42455, saving model to /models




Epoch 35/100
Epoch 35: val_loss did not improve from 0.42455
Epoch 36/100
Epoch 36: val_loss improved from 0.42455 to 0.42443, saving model to /models




Epoch 37/100
Epoch 37: val_loss improved from 0.42443 to 0.42060, saving model to /models




Epoch 38/100
Epoch 38: val_loss improved from 0.42060 to 0.41957, saving model to /models




Epoch 39/100
Epoch 39: val_loss did not improve from 0.41957
Epoch 40/100
Epoch 40: val_loss improved from 0.41957 to 0.41763, saving model to /models




Epoch 41/100
Epoch 41: val_loss improved from 0.41763 to 0.41700, saving model to /models




Epoch 42/100
Epoch 42: val_loss improved from 0.41700 to 0.41611, saving model to /models




Epoch 43/100
Epoch 43: val_loss improved from 0.41611 to 0.41472, saving model to /models




Epoch 44/100
Epoch 44: val_loss did not improve from 0.41472
Epoch 45/100
Epoch 45: val_loss improved from 0.41472 to 0.41316, saving model to /models




Epoch 46/100
Epoch 46: val_loss improved from 0.41316 to 0.41251, saving model to /models




Epoch 47/100
Epoch 47: val_loss improved from 0.41251 to 0.41122, saving model to /models




Epoch 48/100
Epoch 48: val_loss improved from 0.41122 to 0.41020, saving model to /models




Epoch 49/100
Epoch 49: val_loss improved from 0.41020 to 0.40973, saving model to /models




Epoch 50/100
Epoch 50: val_loss improved from 0.40973 to 0.40824, saving model to /models




Epoch 51/100
Epoch 51: val_loss improved from 0.40824 to 0.40770, saving model to /models




Epoch 52/100
Epoch 52: val_loss improved from 0.40770 to 0.40653, saving model to /models




Epoch 53/100
Epoch 53: val_loss improved from 0.40653 to 0.40611, saving model to /models




Epoch 54/100
Epoch 54: val_loss did not improve from 0.40611
Epoch 55/100
Epoch 55: val_loss did not improve from 0.40611

Epoch 55: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 56/100
Epoch 56: val_loss improved from 0.40611 to 0.40611, saving model to /models




Epoch 57/100
Epoch 57: val_loss improved from 0.40611 to 0.40455, saving model to /models




Epoch 58/100
Epoch 58: val_loss improved from 0.40455 to 0.40424, saving model to /models




Epoch 59/100
Epoch 59: val_loss improved from 0.40424 to 0.40388, saving model to /models




Epoch 60/100
Epoch 60: val_loss did not improve from 0.40388
Epoch 61/100
Epoch 61: val_loss improved from 0.40388 to 0.40382, saving model to /models





Epoch 61: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
Epoch 62/100
Epoch 62: val_loss improved from 0.40382 to 0.40380, saving model to /models




Epoch 63/100
Epoch 63: val_loss improved from 0.40380 to 0.40379, saving model to /models





Epoch 63: ReduceLROnPlateau reducing learning rate to 1.0000001111620805e-07.
Epoch 64/100
Epoch 64: val_loss improved from 0.40379 to 0.40378, saving model to /models




Epoch 65/100
Epoch 65: val_loss improved from 0.40378 to 0.40378, saving model to /models





Epoch 65: ReduceLROnPlateau reducing learning rate to 1.000000082740371e-08.
Epoch 66/100
Epoch 66: val_loss improved from 0.40378 to 0.40378, saving model to /models




Epoch 67/100
Epoch 67: val_loss improved from 0.40378 to 0.40378, saving model to /models





Epoch 67: ReduceLROnPlateau reducing learning rate to 1.000000082740371e-09.
Epoch 68/100
Epoch 68: val_loss did not improve from 0.40378
Epoch 69/100
Epoch 69: val_loss did not improve from 0.40378

Epoch 69: ReduceLROnPlateau reducing learning rate to 1.000000082740371e-10.
Epoch 70/100
Epoch 70: val_loss improved from 0.40378 to 0.40378, saving model to /models




Epoch 71/100
Epoch 71: val_loss did not improve from 0.40378

Epoch 71: ReduceLROnPlateau reducing learning rate to 1.000000082740371e-11.
Epoch 72/100
Epoch 72: val_loss did not improve from 0.40378
Epoch 73/100
Epoch 73: val_loss did not improve from 0.40378

Epoch 73: ReduceLROnPlateau reducing learning rate to 1.000000082740371e-12.
Epoch 74/100
Epoch 74: val_loss did not improve from 0.40378
Epoch 75/100
Epoch 75: val_loss did not improve from 0.40378
Restoring model weights from the end of the best epoch: 70.

Epoch 75: ReduceLROnPlateau reducing learning rate to 1.0000001044244145e-13.
