<a href="https://colab.research.google.com/github/Satwikram/NLP-Implementations/blob/main/Multimodal/Multimodal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Author: Satwik Ram K

### Setup

In [None]:
!pip install transformers

!pip install pytesseract
# !pip install pdf2image
!apt-get install poppler-utils 
!apt install tesseract-ocr

### Importing Dependencies

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import string
import re

import os
from pathlib import Path

import matplotlib.pyplot as plt
import cv2
import tensorflow as tf

from tensorflow.keras.layers import Dense, Input, Flatten, Conv2D, MaxPool2D, GlobalAveragePooling1D, BatchNormalization, Embedding, Bidirectional, LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, TensorBoard
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split

import spacy
from unicodedata import normalize

from tqdm import tqdm

import pytesseract
from PIL import Image

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

import plotly.express as px

### Gathering Dataset

In [2]:
!unzip /content/drive/MyDrive/Deceptive-Research/dataset.zip

### Global Variables

In [3]:
img_shape = (224, 224)
deceptive_path = Path("/content/deceptive")
normal_path = Path("/content/Ads/")
uniq_labels = ["Non Deceptive", "Deceptive"]

### Cleaning the Data

In [4]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [5]:
def clean_text(text):

    text = normalize("NFKD", text) #Normalization

    text = re.sub(r"[^\w\s]","", text) #Remove Punc

    text = " ".join([token.lemma_ for token in nlp(text) if not token.is_stop])

    text = re.sub("\s+", " ", text)

    text = text.strip()

    return text

In [6]:
def clean_img(fname):

  img = cv2.imread(fname)

  img = cv2.resize(img, img_shape) 

  # Normalization
  img = img/255.0

  return img

### OCR - Image to Text

In [7]:
def get_data(image):

  txt = pytesseract.image_to_string(Image.open(image), lang="eng")
  txt = re.sub("[\n]{2,}", "\t\t", txt)
  txt = re.sub("\n", "", txt)
  txt = re.sub("\t\t", "\n", txt)

  return txt

### Image Extraction

In [8]:
def extract_image(path, target):

  X1 = []
  X2 = []
  y = []

  for img in os.listdir(path):

    _, tail = os.path.splitext(img)

    if tail in [".jpg", ".jpeg", ".png"]:

      try:
        
        fname = f"{path}/{img}"

        # OCR - Image to Text        
        text = get_data(fname)

        # Cleaning the text
        text = clean_text(text)

        if not text:
          text = "No Information"
          print(fname)

        # Cleaning the Image
        img = clean_img(fname)

        X1.extend([img])
        X2.extend([text])
        y.extend([target])

      except Exception as e: print(f"Exception: {e}")

  return X1, X2, y

In [9]:
X1, X2, y = extract_image(deceptive_path, 1)

/content/deceptive/images (14).jpeg
/content/deceptive/images (15).jpeg
/content/deceptive/image (8).jpeg
/content/deceptive/image (9).jpeg
/content/deceptive/images.jpeg
/content/deceptive/image (12).jpeg
/content/deceptive/images.png
/content/deceptive/images (7).jpeg
/content/deceptive/images (2).jpeg
Exception: cannot identify image file '/content/deceptive/wss-notify-01.png'
/content/deceptive/image (6).jpeg
/content/deceptive/images (8).jpeg
/content/deceptive/images (1).png
/content/deceptive/image (11).jpeg
/content/deceptive/image.jpeg
/content/deceptive/images (1).jpeg
/content/deceptive/image (5).jpeg
/content/deceptive/images (4).png
/content/deceptive/image (7).jpeg
/content/deceptive/maxresdefault (2).jpg
/content/deceptive/images (12).jpeg
Exception: cannot identify image file '/content/deceptive/wss-exposed-04.png'
/content/deceptive/pop-up-ads-fake-zeus-virus_en.jpg
/content/deceptive/image (2).png
/content/deceptive/image (10).jpeg
/content/deceptive/image (3).png
/co

In [None]:
X2

In [11]:
for i in os.listdir(normal_path):
  X1_t, X2_t, y_t = extract_image(f"{normal_path}/{i}", 0)
  X1.extend(X1_t)
  X2.extend(X2_t)
  y.extend(y_t)

/content/Ads/7/13.png
/content/Ads/7/14.png
/content/Ads/19/15.png
/content/Ads/19/13.png
/content/Ads/19/11.png
/content/Ads/2/14.png
/content/Ads/4/15.png
/content/Ads/4/13.png
/content/Ads/5/13.png
/content/Ads/1/13.png
Exception: OpenCV(4.6.0) /io/opencv/modules/imgproc/src/resize.cpp:4052: error: (-215:Assertion failed) !ssize.empty() in function 'resize'

/content/Ads/6/13.png
/content/Ads/6/11.png
/content/Ads/6/12.png
Exception: OpenCV(4.6.0) /io/opencv/modules/imgproc/src/resize.cpp:4052: error: (-215:Assertion failed) !ssize.empty() in function 'resize'

/content/Ads/12/12.png
Exception: OpenCV(4.6.0) /io/opencv/modules/imgproc/src/resize.cpp:4052: error: (-215:Assertion failed) !ssize.empty() in function 'resize'

/content/Ads/18/15.png
Exception: OpenCV(4.6.0) /io/opencv/modules/imgproc/src/resize.cpp:4052: error: (-215:Assertion failed) !ssize.empty() in function 'resize'

/content/Ads/18/13.png
/content/Ads/18/12.png
/content/Ads/18/14.png
/content/Ads/15/13.png
/content/

In [12]:
X1 = np.array(X1)
# X2 = np.array(X2)

y = np.array(y, dtype="float32")

In [13]:
X1.shape, y.shape

((392, 224, 224, 3), (392,))

In [14]:
len_X2 = [len(x.split()) for x in X2]

In [15]:
px.box(len_X2)

### Tokenization

In [16]:
checkpoint = "bert-base-uncased"
# checkpoint = "gpt2"
sequence_length = 100

def tokenize(samples):

  tokenizer = AutoTokenizer.from_pretrained(checkpoint)

  if checkpoint == "gpt2" and tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

  tokens = tokenizer(
      samples,
      max_length=sequence_length,
      truncation=True,
      padding="max_length",
      add_special_tokens=True,
      return_tensors="np"
  )

  return {"input_ids": tokens["input_ids"].tolist(), "attention_mask": tokens["attention_mask"].tolist()}

In [17]:
X_tokenized = pd.DataFrame(tokenize(X2), columns=["input_ids", "attention_mask"])

### Splitting Data into Train/Test

In [18]:
X_tokenized_train, X_tokenized_test, X_img_train, X_img_test, y_train, y_test = train_test_split(X_tokenized, X1, y, test_size=0.2, random_state=42, shuffle=True, stratify=y)

In [19]:
X_tokenized_train

Unnamed: 0,input_ids,attention_mask
217,"[101, 25430, 10464, 15088, 3211, 12101, 7163, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
308,"[101, 4166, 2098, 3784, 5306, 1015, 2591, 1661...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
103,"[101, 4965, 4611, 11277, 12202, 9475, 2399, 10...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
15,"[101, 2053, 2592, 102, 0, 0, 0, 0, 0, 0, 0, 0,...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
230,"[101, 2053, 2592, 102, 0, 0, 0, 0, 0, 0, 0, 0,...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
100,"[101, 8827, 2549, 10122, 8942, 17007, 3511, 50...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
273,"[101, 2147, 6739, 2813, 5527, 11142, 3131, 247...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
276,"[101, 2139, 16200, 4215, 21456, 2310, 2412, 85...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
277,"[101, 1062, 2423, 1060, 2871, 3461, 6701, 5649...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


### Unzipping the IDs and Masks 

In [20]:
unzip_x = lambda x: [np.vstack(x["input_ids"]), np.vstack(x["attention_mask"])]

X_tokenized_train, X_tokenized_test = unzip_x(X_tokenized_train), unzip_x(X_tokenized_test)

In [21]:
X_train = [X_img_train, X_tokenized_train]
X_test = [X_img_test, X_tokenized_test]

### Building the model

In [22]:
def build_model(img_shape, targets, checkpoint, sequence_length):

  # Loading the pre-trained Resnet model
  base_model_img = tf.keras.applications.ResNet50(input_shape=img_shape, include_top=False, weights="imagenet")
  
  # Loading the pre-trained BERT model
  base_model_txt = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)

  # Freezing the base model
  base_model_img.trainable = False

  input_ids = Input(shape=(sequence_length,), name="input_ids", dtype="int32")
  attenion_mask = Input(shape=(sequence_length,), name="attention_mask", dtype="int32")

  if checkpoint == "gpt2": x1 = base_model_txt.transformer(input_ids, attention_mask=attenion_mask)[0]
  else: x1 = base_model_txt.bert(input_ids, attention_mask=attenion_mask)[1]

  # Defining the custom head for our neural network
  global_average_layer = tf.keras.layers.GlobalAveragePooling2D()(base_model_img.output)

  x1 = Flatten()(x1)

  # Concat the results
  x = tf.concat([x1, global_average_layer], axis=-1, name="Concat")

  # Output layer
  units = len(np.unique(targets))

  if units > 2:
      activation = "softmax"
      loss = "sparse_categorical_crossentropy"
  
  else:
      activation = "sigmoid"
      loss = "binary_crossentropy"
      units = units - 1
  
  outputs = Dense(units=units, activation=activation)(x)

  model = Model(inputs=[base_model_img.input, input_ids, attenion_mask], outputs=outputs)

  # Compiling the model
  model.compile(optimizer="adam", loss=loss, metrics=["accuracy"])

  # Model Architecture Export
  tf.keras.utils.plot_model(model, to_file='model.png', show_shapes=True, 
                            show_dtype=True, show_layer_names=True, rankdir='TB',
                            expand_nested=True, dpi=300, layer_range=None, 
                            show_layer_activations=True)

  
  return model

In [23]:
model = build_model((224, 224, 3), y, checkpoint, sequence_length)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.437086 to fit



### Callbacks

In [24]:
def callbacks() -> list:

    run_name = "run 1"
    save_path = Path("/content/drive/MyDrive/Deceptive-Research/models")
    os.makedirs(save_path/"logs", exist_ok=True)
    
    checkpoint = ModelCheckpoint(save_path, monitor="val_loss", save_best_only=True, 
                                                    verbose=1)

    earlystopping = EarlyStopping(monitor="val_loss", verbose=1, restore_best_weights = True,
                                                    patience=5)

    logger = TensorBoard(save_path/"logs"/run_name, histogram_freq=2, write_graph=True, write_images=True)

    lr = ReduceLROnPlateau(monitor="val_loss", factor=0.1, patience=2, verbose=1,
                                         min_delta=0.0001, cooldown=0, min_lr=0)
    
    return [checkpoint, earlystopping, lr, logger]

In [25]:
#  !sudo rm -rf /content/drive/MyDrive/Deceptive-Research/models

### Model Training

In [None]:
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=8, callbacks=callbacks())

Epoch 1/100
Epoch 1: val_loss improved from inf to 0.67898, saving model to /content/drive/MyDrive/Deceptive-Research/models


