<a href="https://colab.research.google.com/github/Satwikram/NLP-Implementations/blob/main/Multimodal/Multimodal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Author: Satwik Ram K

### Setup

In [None]:
# Python packages. %pip (rather than !pip) installs into the environment of the
# running kernel.
%pip install -q transformers pytesseract
# !pip install pdf2image

# System packages needed by the OCR step. apt-get is used for both packages because
# `apt` warns that its CLI is not stable for scripted use; -y answers the
# confirmation prompt so the cell cannot stall waiting for input.
!apt-get install -y -qq poppler-utils tesseract-ocr

### Importing Dependencies

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import string
import re

import os
from pathlib import Path

import matplotlib.pyplot as plt
import cv2
import tensorflow as tf

from tensorflow.keras.layers import Dense, Input, Flatten, Conv2D, MaxPool2D, GlobalAveragePooling1D, BatchNormalization, Embedding, Bidirectional, LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split

import spacy
from unicodedata import normalize

from tqdm import tqdm

import pytesseract
from PIL import Image

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

### Gathering Dataset

In [None]:
# The archive is read from Google Drive, so Drive must already be mounted at /content/drive.
# -o overwrites existing files without asking, so re-running the notebook does not stall on
# unzip's interactive "replace?" prompt; -q suppresses the per-file listing.
!unzip -q -o /content/drive/MyDrive/Deceptive-Research/dataset.zip

### Global Variables

In [None]:
# Class names, ordered so that the list index equals the numeric target used when
# loading the data (0 for the normal ads, 1 for the deceptive ones).
uniq_labels = [
    "Non Deceptive",
    "Deceptive",
]

# Size passed to cv2.resize for every image.
img_shape = (300, 300)

# Dataset locations: normal ads are grouped in sub-folders, deceptive ads sit in one folder.
normal_path = Path("/content/Ads/")
deceptive_path = Path("/content/deceptive")

### Cleaning the Data

In [None]:
# Shared spaCy pipeline used by clean_text, which only reads token.lemma_ and
# token.is_stop; the parser and NER components are switched off.
nlp = spacy.load(
    "en_core_web_sm",
    disable=["parser", "ner"],
)

In [None]:
def clean_text(text):
    """Normalise raw text into a single-spaced string of lemmas.

    Steps, in order:
      1. NFKD Unicode normalisation.
      2. Removal of every character that is neither a word character nor whitespace.
      3. Lemmatisation with the module-level spaCy pipeline ``nlp``; stop words
         are dropped.
      4. Whitespace runs collapsed to one space, then leading/trailing space stripped.

    Args:
        text (str): Raw text, e.g. the OCR output of an image.

    Returns:
        str: The cleaned text. It is empty when nothing survives the filtering.
    """
    text = normalize("NFKD", text)

    # Punctuation is deleted, not replaced by a space, so "a,b" becomes "ab".
    text = re.sub(r"[^\w\s]", "", text)

    text = " ".join(token.lemma_ for token in nlp(text) if not token.is_stop)

    # Raw string: "\s" inside a plain literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    text = re.sub(r"\s+", " ", text)

    return text.strip()

In [None]:
def clean_img(fname):
  """Load an image with OpenCV, resize it to ``img_shape`` and divide it by 255.

  Args:
    fname: Path of the image file, handed to ``cv2.imread``.

  Returns:
    The resized image divided by 255.0.

  Raises:
    ValueError: If ``cv2.imread`` could not read the file.
  """
  img = cv2.imread(fname)

  # cv2.imread reports a missing or undecodable file by returning None rather than
  # raising; fail here with the file name instead of deep inside cv2.resize.
  if img is None:
    raise ValueError(f"Could not read image: {fname}")

  # NOTE(review): OpenCV decodes colour images in BGR channel order; confirm that
  # this is what the downstream image model expects.
  img = cv2.resize(img, img_shape)

  # Normalization
  return img / 255.0

### OCR - Image to Text

In [None]:
def get_data(image):
  """Run Tesseract OCR (English) on an image file and tidy the line breaks.

  Runs of two or more newlines are kept as a single newline; every remaining
  single newline is deleted.

  Args:
    image: Path of the image file, handed to ``PIL.Image.open``.

  Returns:
    str: The recognised text.
  """
  # Context manager so the underlying file handle is released after OCR.
  with Image.open(image) as page:
    txt = pytesseract.image_to_string(page, lang="eng")

  # Mark paragraph breaks with a placeholder, drop the remaining line breaks,
  # then turn the placeholder back into a single newline.
  # NOTE(review): single line breaks are removed without inserting a space, so
  # the last word of a line is glued to the first word of the next - confirm
  # this is intended.
  txt = re.sub("[\n]{2,}", "\t\t", txt)
  txt = re.sub("\n", "", txt)
  txt = re.sub("\t\t", "\n", txt)

  return txt

### Image Extraction

In [None]:
def extract_image(path, target):
  """Build the image, text and label lists for every image file in a folder.

  Each ``.jpg`` / ``.jpeg`` / ``.png`` file (extension matched case-insensitively)
  is OCR'd with ``get_data``, the text is cleaned with ``clean_text`` and the
  pixels are loaded with ``clean_img``. A file that fails at any of these steps
  is reported and skipped, so the three returned lists always stay aligned.

  Args:
    path: Folder to scan (not recursive).
    target: Label appended once for every image that is loaded.

  Returns:
    tuple: ``(X1, X2, y)`` - image arrays, cleaned texts and labels, in the
    sorted order of the file names.
  """
  X1, X2, y = [], [], []
  image_exts = {".jpg", ".jpeg", ".png"}

  # os.listdir returns entries in arbitrary order; sorting makes the sample order,
  # and therefore any seeded split performed on it, reproducible.
  for name in sorted(os.listdir(path)):

    _, ext = os.path.splitext(name)

    # Lower-case the extension so files such as "photo.JPG" are not silently dropped.
    if ext.lower() not in image_exts:
      continue

    fname = f"{path}/{name}"

    try:
      # OCR - Image to Text, then cleaning the text
      text = clean_text(get_data(fname))

      if not text:
        # Nothing readable on the image: use a placeholder and report the file.
        text = "No Information"
        print(fname)

      # Cleaning the Image
      pixels = clean_img(fname)

    except Exception as e:
      # Best effort: skip the file, but say which one failed and why.
      print(f"{fname}: {e}")
      continue

    X1.append(pixels)
    X2.append(text)
    y.append(target)

  return X1, X2, y

In [None]:
# Deceptive ads sit directly in one folder and are labelled 1.
X1, X2, y = extract_image(path=deceptive_path, target=1)

In [None]:
# Preview a few OCR results instead of dumping every extracted text into the output.
X2[:5]

In [None]:
# Normal ads are grouped in sub-folders of normal_path and are labelled 0.
# Entries are visited in sorted order so the sample order is reproducible, and
# anything that is not a directory is skipped, because extract_image lists its
# argument with os.listdir and would raise on a plain file.
# NOTE: this cell appends to X1 / X2 / y; re-running it without re-running the
# cell that created them duplicates the normal ads.
for i in sorted(os.listdir(normal_path)):
  subdir = f"{normal_path}/{i}"
  if not os.path.isdir(subdir):
    continue
  X1_t, X2_t, y_t = extract_image(subdir, 0)
  X1.extend(X1_t)
  X2.extend(X2_t)
  y.extend(y_t)

/content/Ads/19/11.png
/content/Ads/19/13.png
/content/Ads/19/15.png
/content/Ads/11/11.png
/content/Ads/10/15.png
/content/Ads/10/12.png
/content/Ads/4/13.png
/content/Ads/4/15.png
/content/Ads/15/13.png
/content/Ads/3/11.png
/content/Ads/7/14.png
/content/Ads/7/13.png


In [None]:
# Stack the images into one float32 array: dividing by 255.0 produced float64
# pixels, and keeping that precision would double the memory of the image tensor.
X1 = np.array(X1, dtype="float32")

# X2 intentionally stays a plain list of strings; it is passed to the tokenizer as-is.

y = np.array(y, dtype="float32")

In [None]:
# Sanity check: the image array and the label vector must have the same first dimension.
X1.shape, y.shape

((392, 300, 300, 3), (392,))

### Tokenization

In [None]:
# Hugging Face checkpoint used for both the tokenizer and the text model.
# checkpoint = "bert-base-uncased"
checkpoint = "gpt2"
# Every sample is truncated or padded to exactly this many tokens.
sequence_length = 256

def tokenize(samples):
  """Tokenize texts with the tokenizer of ``checkpoint``, padded to ``sequence_length``.

  Args:
    samples: List of strings to encode.

  Returns:
    dict: ``{"input_ids": [...], "attention_mask": [...]}`` where each value is a
    list holding one list of ``sequence_length`` integers per sample.
  """
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)

  if tokenizer.pad_token is None:
    if tokenizer.eos_token is not None:
      # Reuse the end-of-sequence token for padding. Registering a brand-new
      # '[PAD]' token gives it an id one past the pretrained vocabulary (50257
      # for gpt2), which the model's embedding table does not contain unless the
      # model is resized. Padded positions are masked out by attention_mask.
      tokenizer.pad_token = tokenizer.eos_token
    else:
      # No token to reuse: add one. The model then needs its token embeddings
      # resized to cover the new id.
      tokenizer.add_special_tokens({'pad_token': '[PAD]'})

  tokens = tokenizer(
      samples,
      max_length=sequence_length,
      truncation=True,
      padding="max_length",
      add_special_tokens=True,
      return_tensors="np"
  )

  # Plain nested lists so each sample's ids / mask can live in one DataFrame cell.
  return {"input_ids": tokens["input_ids"].tolist(), "attention_mask": tokens["attention_mask"].tolist()}

In [None]:
# One row per sample, so the token ids and masks can be split together with the images.
X_tokenized = pd.DataFrame(
    tokenize(X2),
    columns=["input_ids", "attention_mask"],
)

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


### Splitting Data into Train/Test

In [None]:
# Split tokens, images and labels together so the three stay aligned; the split is
# stratified on the label and seeded.
(
    X_tokenized_train,
    X_tokenized_test,
    X_img_train,
    X_img_test,
    y_train,
    y_test,
) = train_test_split(
    X_tokenized,
    X1,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y,
)

In [None]:
# Show only the first rows of the training tokens.
X_tokenized_train.head()

Unnamed: 0,input_ids,attention_mask
217,"[40093, 1416, 1118, 911, 79, 300, 77, 50257, 5...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
308,"[23850, 785, 43528, 14413, 7324, 15699, 785, 4...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
103,"[3856, 1572, 452, 264, 4105, 403, 50257, 50257...","[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
15,"[44, 14208, 3961, 33482, 9742, 44314, 3163, 32...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ..."
230,"[9032, 7772, 679, 13254, 3900, 31337, 856, 284...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...
100,"[2949, 6188, 50257, 50257, 50257, 50257, 50257...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
273,"[74, 49412, 2584, 471, 27734, 3955, 6158, 3644...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ..."
276,"[19006, 9712, 1041, 1395, 412, 3727, 304, 1249...","[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ..."
277,"[22260, 573, 346, 50257, 50257, 50257, 50257, ...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


### Unzipping the IDs and Masks 

In [None]:
def unzip_x(x):
  """Turn the tokenized frame into ``[input_ids, attention_mask]`` 2-D arrays.

  Args:
    x: A frame with ``"input_ids"`` and ``"attention_mask"`` columns whose cells
      hold one equal-length sequence per row, or the list/tuple this function
      already returned.

  Returns:
    list: ``[ids, mask]`` with one row per sample in each array.
  """
  # The calling cell stores the result under the name of its input, so a re-run
  # passes the already-unzipped list back in; hand it through unchanged instead
  # of failing on x["input_ids"].
  if isinstance(x, (list, tuple)):
    return list(x)

  return [np.vstack(x["input_ids"]), np.vstack(x["attention_mask"])]

# Replace each tokenized split by its [input_ids, attention_mask] pair of arrays.
X_tokenized_train = unzip_x(X_tokenized_train)
X_tokenized_test = unzip_x(X_tokenized_test)

In [None]:
# Model inputs per split: the image array first, then the [input_ids, attention_mask] pair.
X_train, X_test = (
    [X_img_train, X_tokenized_train],
    [X_img_test, X_tokenized_test],
)

### Building the model

In [None]:
def build_model(img_shape, targets, checkpoint, sequence_length):
  """Build and compile the two-branch (image + text) classifier.

  The image branch is an ImageNet-pretrained ResNet50 without its top, frozen and
  followed by global average pooling. The text branch is the transformer body of
  the Hugging Face ``checkpoint`` whose output is flattened. Both feature vectors
  are concatenated and fed to a single Dense output layer. Only the image
  backbone is frozen; the text model stays trainable.

  Args:
    img_shape: Input shape of the image branch, e.g. ``(300, 300, 3)``.
    targets: Label array; the number of distinct values selects the output head
      (one sigmoid unit for up to two classes, softmax otherwise).
    checkpoint: Hugging Face checkpoint name. ``"gpt2"`` uses the model's
      ``transformer`` attribute; anything else is assumed to expose ``bert``.
    sequence_length: Length of the ``input_ids`` / ``attention_mask`` inputs.

  Returns:
    The compiled Keras model with inputs ``[image, input_ids, attention_mask]``.
    As a side effect the architecture diagram is written to ``model.png``.
  """

  # Loading the pre-trained Resnet model and freezing it
  base_model_img = tf.keras.applications.ResNet50(input_shape=img_shape, include_top=False, weights="imagenet")
  base_model_img.trainable = False

  # Loading the pre-trained text model for `checkpoint`. Only its transformer body
  # is used below; the classification head it comes with is not part of the graph.
  base_model_txt = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)

  input_ids = Input(shape=(sequence_length,), name="input_ids", dtype="int32")
  attention_mask = Input(shape=(sequence_length,), name="attention_mask", dtype="int32")

  # NOTE(review): element [0] of the GPT-2 body is presumably the per-token hidden
  # states and element [1] of the BERT body the pooled output - confirm against
  # the Hugging Face model documentation.
  if checkpoint == "gpt2":
    x1 = base_model_txt.transformer(input_ids, attention_mask=attention_mask)[0]
  else:
    x1 = base_model_txt.bert(input_ids, attention_mask=attention_mask)[1]

  x1 = Flatten()(x1)

  # Defining the custom head for our neural network
  global_average_layer = tf.keras.layers.GlobalAveragePooling2D()(base_model_img.output)

  # Concat the results (a Keras layer rather than a raw tf op on symbolic tensors)
  x = tf.keras.layers.Concatenate(axis=-1, name="Concat")([x1, global_average_layer])

  # Output layer
  n_classes = len(np.unique(targets))

  if n_classes > 2:
    units = n_classes
    activation = "softmax"
    loss = "sparse_categorical_crossentropy"
  else:
    # Binary head: always exactly one sigmoid unit. Deriving it as
    # `n_classes - 1` would give zero units when `targets` holds a single class.
    units = 1
    activation = "sigmoid"
    loss = "binary_crossentropy"

  outputs = Dense(units=units, activation=activation)(x)

  model = Model(inputs=[base_model_img.input, input_ids, attention_mask], outputs=outputs)

  # Compiling the model
  model.compile(optimizer="adam", loss=loss, metrics=["accuracy"])

  # Model Architecture Export
  tf.keras.utils.plot_model(model, to_file='model.png', show_shapes=True,
                            show_dtype=True, show_layer_names=True, rankdir='TB',
                            expand_nested=True, dpi=300, layer_range=None,
                            show_layer_activations=True)

  return model

In [None]:
# The image input shape is derived from img_shape (plus 3 colour channels) so it
# cannot drift from the size the images were resized to.
model = build_model((*img_shape, 3), y, checkpoint, sequence_length)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


Downloading:   0%|          | 0.00/498M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFGPT2ForSequenceClassification.

Some layers of TFGPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.437086 to fit

