<a href="https://colab.research.google.com/github/Satwikram/AI-Tutoring/blob/main/NLP/FineTuning%20BERT%20for%20TEXT%20classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Setup

In [None]:
!pip install transformers
!pip install plotly
!pip install livelossplot

In [None]:
from google.colab import files

files.upload()


! mkdir ~/.kaggle


! cp kaggle.json ~/.kaggle/

! chmod 600 ~/.kaggle/kaggle.json

### Downloading the Dataset

In [None]:
# Fetch the sarcasm-headlines archive with the Kaggle CLI, using the credentials copied to ~/.kaggle.
!kaggle datasets download -d rmisra/news-headlines-dataset-for-sarcasm-detection

In [None]:
!unzip /content/news-headlines-dataset-for-sarcasm-detection.zip

### Imports

In [None]:
import numpy as np
import pandas as pd

import os
import re
from pathlib import Path
import plotly.express as px

import tensorflow as tf

from tensorflow.keras.layers import Input, Flatten, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard, ReduceLROnPlateau, LearningRateScheduler

import spacy
from unicodedata import normalize

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from sklearn.model_selection import train_test_split

from livelossplot import PlotLossesKeras

### Loading Dataset

In [None]:
# The file holds one JSON record per line, hence lines=True.
df = pd.read_json(
    "/content/Sarcasm_Headlines_Dataset_v2.json",
    lines=True,
)

In [None]:
# Peek at the first rows of the loaded frame.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Class balance: number of headlines per is_sarcastic label, printed and charted.
bar = df["is_sarcastic"].value_counts()
print(bar)
bar.plot.bar()

### Cleaning the Texts

In [None]:
# Load spaCy's small English pipeline once at notebook scope; clean_data reads this global.
nlp = spacy.load("en_core_web_sm")

def clean_data(df, column, min_tokens=5):
  """Return a cleaned, lemmatized version of ``df[column]``.

  Each row is stringified and stripped, NFKD-normalized, purged of URLs and
  punctuation, lemmatized with the notebook-level spaCy pipeline ``nlp`` (stop
  words discarded), and has runs of whitespace collapsed. Rows left with fewer
  than ``min_tokens`` whitespace-separated tokens become ``None`` so the caller
  can drop them.

  Args:
    df: DataFrame holding the text column. It is not modified.
    column: Name of the text column to clean.
    min_tokens: Minimum number of tokens a cleaned row must keep.

  Returns:
    A Series aligned with ``df.index`` and named ``column``.
  """

  def clean(text):
    text = str(text).strip()

    if not text:
      return text

    # Normalize text (compatibility decomposition)
    text = normalize("NFKD", text)

    # Remove links: everything from the scheme up to the next whitespace.
    # "|" is turned into a space first so it also terminates a link.
    text = re.sub(r"https?://\S*", " ", text.replace("|", " "))

    # Strip punctuation (anything that is neither a word character nor whitespace)
    text = re.sub(r"[^\w\s]", "", text)

    return text.strip()

  # Work on a separate Series so the caller's original column stays intact.
  cleaned = df[column].apply(clean)

  # Parser and NER are not needed for lemmas / stop-word flags.
  docs = nlp.pipe(cleaned, disable=["parser", "ner"], batch_size=512)
  lemmatized = [
      " ".join(token.lemma_ for token in doc if not token.is_stop)
      for doc in docs
  ]

  result = pd.Series(lemmatized, index=df.index, name=column)
  result = result.apply(lambda x: re.sub(r"\s+", " ", x.strip()))

  # Too-short rows are marked None rather than removed, keeping index alignment.
  return result.apply(lambda x: x if len(x.split()) >= min_tokens else None)

In [None]:
df["Cleaned"] = clean_data(df, "headline")

# Drop only the rows whose cleaned text was rejected (clean_data marks them None);
# nulls in unrelated columns no longer remove rows. Reassigning instead of using
# inplace=True keeps the step a single, re-runnable expression.
df = df.dropna(subset=["Cleaned"]).reset_index(drop=True)

In [None]:
# Features are the cleaned headlines; targets are the sarcasm labels.
X, y = df["Cleaned"], df["is_sarcastic"]

### Calculating Sequence Length

In [None]:
# Whitespace-token count of every cleaned headline.
length = [len(tokens) for tokens in map(str.split, X)]

In [None]:
# Box plot of the per-headline token counts held in `length`.
px.box(length)

### Tokenization

In [None]:
# Pretrained checkpoint name; read as a global by tokenize() and passed to build_model().
checkpoint = "bert-base-uncased"
# checkpoint = "gpt2"
# Fixed length every sample is padded / truncated to during tokenization.
sequence_length = 64

def tokenize(samples):
  """Encode a list of strings into fixed-length ids and attention masks.

  The tokenizer is chosen by the notebook-level ``checkpoint``; every sample is
  padded or truncated to the notebook-level ``sequence_length``.

  Args:
    samples: List of strings to encode.

  Returns:
    Dict with keys "input_ids" and "attention_mask", each a list holding one
    list of ints per sample.
  """

  tokenizer = AutoTokenizer.from_pretrained(checkpoint)

  # Tokenizers that ship without a pad token (e.g. GPT-2) reuse their
  # end-of-sequence token. Registering a brand-new "[PAD]" token would assign
  # an id outside the pretrained embedding matrix, which is never resized here.
  if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

  tokens = tokenizer(
      samples,
      max_length=sequence_length,
      truncation=True,
      padding="max_length",
      add_special_tokens=True,
      return_tensors="np"
  )

  # Plain nested lists so each DataFrame cell can hold one full sequence.
  return {key: tokens[key].tolist() for key in ("input_ids", "attention_mask")}

In [None]:
# One row per headline; each cell holds that headline's full id / mask list.
X_tokenized = pd.DataFrame(
    tokenize(X.tolist()),
    columns=["input_ids", "attention_mask"],
)

In [None]:
# Display the tokenized frame.
X_tokenized

In [None]:
# Row labelled 0: the ids and attention mask of one tokenized headline.
X_tokenized.loc[0]

In [None]:
# Labels taken from the is_sarcastic column.
y

### Splitting Data into Train/Test

In [None]:
# 80/20 split with a fixed seed, stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X_tokenized,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y,
)

In [None]:
# Training portion of the tokenized frame (still a DataFrame at this point).
X_train

In [None]:
# Shapes of the four splits.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

### Unzipping the IDs and Masks

In [None]:
# Positional lookup: after the shuffled split, row label 0 is not guaranteed to
# be in the training portion, so a label-based [0] can raise KeyError.
X_train["input_ids"].iloc[0]

In [None]:
def unzip_x(x):
  """Split a tokenized frame into ``[input_ids, attention_mask]`` 2-D arrays.

  Args:
    x: DataFrame with "input_ids" and "attention_mask" columns whose cells are
      equal-length sequences, or an already-unzipped list/tuple of arrays.

  Returns:
    A two-element list of arrays, one row per sample. An already-unzipped
    input is returned as a list unchanged, so re-running the calling cell
    does not fail.
  """
  if isinstance(x, (list, tuple)):
    return list(x)

  return [np.vstack(x["input_ids"]), np.vstack(x["attention_mask"])]

# Rebind both splits from DataFrames to [input_ids, attention_mask] array pairs,
# matching the two-input layout of the model.
X_train, X_test = unzip_x(X_train), unzip_x(X_test)

In [None]:
# X_train is now a two-element list: [input_ids array, attention_mask array].
X_train

### Building the model

In [None]:
def build_model(df, targets, checkpoint, sequence_length, learning_rate=2e-5):
  """Build and compile a Keras classifier on top of a pretrained transformer.

  Args:
    df: DataFrame containing the label column; used only to count classes.
    targets: Name of the label column. Also used to name the output layer.
    checkpoint: Hugging Face checkpoint name. "gpt2" takes the GPT-2 branch,
      anything else is treated as a BERT checkpoint.
    sequence_length: Length of the padded token sequences fed to the model.
    learning_rate: Adam learning rate. Defaults to a small value suited to
      fine-tuning pretrained weights; Keras' own default (1e-3) is far larger.

  Returns:
    A compiled ``tf.keras`` Model taking ``[input_ids, attention_mask]``.
    As a side effect the architecture diagram is written to ``model.png``.
  """

  base_model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)

  input_ids = Input(shape=(sequence_length,), name="input_ids", dtype="int32")
  attention_mask = Input(shape=(sequence_length,), name="attention_mask", dtype="int32")

  if checkpoint == "gpt2":
    # Element 0 of the backbone output: per-token hidden states.
    x = base_model.transformer(input_ids, attention_mask=attention_mask)[0]
  else:
    # Element 1 of the BERT backbone output: the pooled sequence representation.
    x = base_model.bert(input_ids, attention_mask=attention_mask)[1]

  x = Flatten()(x)

  n_classes = df[targets].nunique()

  if n_classes > 2:
    units = n_classes
    activation = "softmax"
    loss = "sparse_categorical_crossentropy"
  else:
    # Binary case: a single sigmoid unit. Fixed at 1 so a label column with
    # only one distinct value does not yield a zero-width output layer.
    units = 1
    activation = "sigmoid"
    loss = "binary_crossentropy"

  outputs = Dense(units, activation = activation, name = f"{targets}_outputs")(x)

  model = Model(inputs=[input_ids, attention_mask], outputs=outputs)

  optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

  model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])

  # Model Architecture Export
  tf.keras.utils.plot_model(model, to_file='model.png', show_shapes=True, 
                          show_dtype=True, show_layer_names=True, rankdir='TB',
                          expand_nested=True, dpi=300, layer_range=None, 
                          show_layer_activations=True)

  return model

In [None]:
# Build and compile the classifier; the output layer is sized from df["is_sarcastic"].
model = build_model(df, "is_sarcastic", checkpoint, sequence_length)

In [None]:
# Layer-by-layer overview of the compiled model.
model.summary()

### Callbacks

In [None]:
def callbacks(run_name: str = "run 1", save_dir="models") -> list:
    """Create the Keras callbacks used during training.

    Args:
        run_name: Sub-directory name for this run's TensorBoard logs.
        save_dir: Directory that receives the best checkpoint and the
            ``logs`` folder.

    Returns:
        List of callbacks, in order: best-model checkpointing, early stopping,
        learning-rate reduction on plateau, TensorBoard logging, live loss
        plotting. The first three monitor ``val_loss``.
    """
    save_path = Path(save_dir)
    log_root = save_path / "logs"
    log_root.mkdir(parents=True, exist_ok=True)

    # Named `checkpointer` so it cannot be confused with the notebook-level
    # `checkpoint` string holding the pretrained model name.
    checkpointer = ModelCheckpoint(save_path, monitor="val_loss", save_best_only=True,
                                   verbose=1)

    early_stopping = EarlyStopping(monitor="val_loss", verbose=1, restore_best_weights=True,
                                   patience=5)

    tensorboard = TensorBoard(log_root / run_name, histogram_freq=2, write_graph=True,
                              write_images=True)

    # Patience is shorter than early stopping's, so the rate is lowered before training halts.
    reduce_lr = ReduceLROnPlateau(monitor="val_loss", factor=0.1, patience=2, verbose=1,
                                  min_delta=0.0001, cooldown=0, min_lr=0)

    return [checkpointer, early_stopping, reduce_lr, tensorboard, PlotLossesKeras()]

### Model Training

In [None]:
# NOTE(review): the held-out test split doubles as validation data here, so the
# val_loss-driven callbacks make their decisions on it.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=20,
    batch_size=8,
    callbacks=callbacks(),
)