<a href="https://colab.research.google.com/github/Satwikram/NLP-Implementations/blob/main/Multimodal/Multimodal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Author: Satwik Ram K

### Setup

In [None]:
!pip install transformers

!pip install pytesseract
# !pip install pdf2image
!apt-get install poppler-utils 
!apt install tesseract-ocr

### Importing Dependencies

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import string
import re

import os
from pathlib import Path

import matplotlib.pyplot as plt
import cv2
import tensorflow as tf

from tensorflow.keras.layers import Dense, Input, Flatten, Conv2D, MaxPool2D, GlobalAveragePooling1D, BatchNormalization, Embedding, Bidirectional, LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split

import spacy
from unicodedata import normalize

from tqdm import tqdm

import pytesseract
from PIL import Image

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

### Gathering Dataset

In [None]:
!unzip /content/drive/MyDrive/Deceptive-Research/dataset.zip

### Global Variables

In [2]:
img_shape = (300, 300)
deceptive_path = Path("/content/deceptive")
normal_path = Path("/content/Ads/")
uniq_labels = ["Non Deceptive", "Deceptive"]

### Cleaning the Data

In [3]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [4]:
def clean_text(text):

    text = normalize("NFKD", text) #Normalization

    text = re.sub(r"[^\w\s]","", text) #Remove Punc

    text = " ".join([token.lemma_ for token in nlp(text) if not token.is_stop])

    text = re.sub("\s+", " ", text)

    text = text.strip()

    return text

In [5]:
def clean_img(fname):

  img = cv2.imread(fname)

  img = cv2.resize(img, img_shape) 

  # Normalization
  img = img/255.0

  return img

### OCR - Image to Text

In [6]:
def get_data(image):

  txt = pytesseract.image_to_string(Image.open(image), lang="eng")
  txt = re.sub("[\n]{2,}", "\t\t", txt)
  txt = re.sub("\n", "", txt)
  txt = re.sub("\t\t", "\n", txt)

  return txt

### Image Extraction

In [7]:
def extract_image(path, target):

  X1 = []
  X2 = []
  y = []

  for img in os.listdir(path):

    _, tail = os.path.splitext(img)

    if tail in [".jpg", ".jpeg", ".png"]:

      try:
        
        fname = f"{path}/{img}"

        # OCR - Image to Text        
        text = get_data(fname)

        # Cleaning the text
        text = clean_text(text)

        if not text:
          text = "No Information"
          print(fname)

        # Cleaning the Image
        img = clean_img(fname)

        X1.extend([img])
        X2.extend([text])
        y.extend([target])

      except Exception as e: print(e)

  return X1, X2, y

In [None]:
X1, X2, y = extract_image(deceptive_path, 1)

In [None]:
X2

In [10]:
for i in os.listdir(normal_path):
  X1_t, X2_t, y_t = extract_image(f"{normal_path}/{i}", 0)
  X1.extend(X1_t)
  X2.extend(X2_t)
  y.extend(y_t)

/content/Ads/19/11.png
/content/Ads/19/13.png
/content/Ads/19/15.png
/content/Ads/11/11.png
/content/Ads/10/15.png
/content/Ads/10/12.png
/content/Ads/4/13.png
/content/Ads/4/15.png
/content/Ads/15/13.png
/content/Ads/3/11.png
/content/Ads/7/14.png
/content/Ads/7/13.png


In [11]:
X1 = np.array(X1)
# X2 = np.array(X2)

y = np.array(y, dtype="float32")

In [12]:
X1.shape, X2.shape, y.shape

AttributeError: ignored

### Tokenization

In [None]:
# checkpoint = "bert-base-uncased"
checkpoint = "gpt2"
sequence_length = 256

def tokenize(samples):

  tokenizer = AutoTokenizer.from_pretrained(checkpoint)

  if checkpoint == "gpt2" and tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

  tokens = tokenizer(
      samples,
      max_length=sequence_length,
      truncation=True,
      padding="max_length",
      add_special_tokens=True,
      return_tensors="np"
  )

  return {"input_ids": tokens["input_ids"].tolist(), "attention_mask": tokens["attention_mask"].tolist()}

In [None]:
input