<a href="https://colab.research.google.com/github/Satwikram/Transformers-Workshop/blob/main/Transformers%20-%20BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Author: Satwik Ram

### Setup

In [None]:
!pip install transformers

Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.1


In [None]:
import locale

# Force the reported preferred encoding to UTF-8 (presumably the usual Colab
# workaround for shell escapes failing on a non-UTF-8 locale — confirm).
# The replacement keeps the optional `do_setlocale` argument of the real
# function so callers using `locale.getpreferredencoding(False)` still work.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

### Connecting to Kaggle

In [None]:
from google.colab import files

files.upload()


! mkdir ~/.kaggle


! cp kaggle.json ~/.kaggle/

! chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


### Importing Dependencies

In [None]:
import numpy as np
import pandas as pd

import os
import re
from pathlib import Path

import tensorflow as tf

from tensorflow.keras.layers import Input, Flatten, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard, ReduceLROnPlateau

import spacy
from unicodedata import normalize

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from sklearn.model_selection import train_test_split

import plotly.express as px

from textblob import TextBlob

### Downloading the Dataset

[IMDB Dataset of 50K Movie Reviews (Kaggle)](https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews)

In [None]:
# Download the dataset archive with the Kaggle CLI; the recorded output shows it landing in /content.
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 78% 20.0M/25.7M [00:00<00:00, 103MB/s] 
100% 25.7M/25.7M [00:00<00:00, 98.3MB/s]


### Unzipping the dataset

In [None]:
!unzip /content/imdb-dataset-of-50k-movie-reviews.zip

Archive:  /content/imdb-dataset-of-50k-movie-reviews.zip
  inflating: IMDB Dataset.csv        


### Reading the Dataset

In [None]:
# Load the extracted CSV; the path matches where the unzip step wrote `IMDB Dataset.csv`.
df = pd.read_csv("/content/IMDB Dataset.csv")

In [None]:
# Peek at the first rows to confirm the `review` / `sentiment` columns.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### Basic Info

In [None]:
# Summarise column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


### Let's take a random sample of 500 reviews per class

In [None]:
# Fixed-seed draw of 500 reviews from each sentiment class.
positive = df.loc[df["sentiment"].eq("positive")].sample(n=500, random_state=42)
negative = df.loc[df["sentiment"].eq("negative")].sample(n=500, random_state=42)

### Check for duplicates

In [None]:
# Count rows in the positive sample that are exact duplicates of an earlier row.
positive.duplicated().sum()

0

In [None]:
# Count rows in the negative sample that are exact duplicates of an earlier row.
negative.duplicated().sum()

0

### Drop the duplicates, if there are any

In [None]:
# Rebind to the de-duplicated frame rather than mutating in place.
positive = positive.drop_duplicates()

In [None]:
# Rebind to the de-duplicated frame rather than mutating in place.
negative = negative.drop_duplicates()

In [None]:
# Re-check the positive sample after dropping duplicates; the recorded output is 0.
positive.duplicated().sum()

0

In [None]:
# Re-check the negative sample after dropping duplicates; the recorded output is 0.
negative.duplicated().sum()

0

### Concatenate the two dataframes into one dataframe

In [None]:
# Stack the two class samples row-wise. Note: this rebinds `df`, so the full
# frame loaded from the CSV is no longer reachable under that name.
df = pd.concat([positive, negative])

### Value Counts

In [None]:
# Bar chart of the number of rows per sentiment label.
px.bar(df["sentiment"].value_counts())

### Cleaning the dataset


In [None]:
# Load the spaCy pipeline once at module level; `clean_data` reads this global.
nlp = spacy.load("en_core_web_sm")

def clean_data(df, column):
  """Return a cleaned, lemmatised version of ``df[column]`` as a new Series.

  Each text has HTML tags, links and punctuation removed, is Unicode
  normalised (NFKD), is lemmatised with the module-level spaCy pipeline
  ``nlp`` with stop words dropped, and has runs of whitespace collapsed.
  Texts left with fewer than five words become ``None`` so the caller can
  drop them.

  Args:
    df: DataFrame holding the raw text.
    column: Name of the text column to clean.

  Returns:
    A Series that shares ``df``'s index. ``df`` itself is left untouched, so
    the raw text stays available to later cells.
  """

  # Patterns are compiled once per call instead of once per row.
  html_tag = re.compile('<.*?>')
  # `\S+` consumes the whole URL; the previous `[\s+]` class also stopped at a
  # literal '+' and left the tail of such URLs behind.
  link = re.compile(r'https?://\S+')
  punctuation = re.compile(r'[^\w\s]')
  whitespace = re.compile(r'\s+')

  def remove_html_tags(text):
    # Substitute a space rather than '' so "end.<br /><br />Next" does not
    # fuse into a single word once punctuation is stripped.
    return html_tag.sub(' ', text)

  def clean(text):

    text = str(text).strip()

    if text:

      # Remove HTML tags
      text = remove_html_tags(text)

      # Normalize text
      text = normalize("NFKD", text)

      # Remove links
      text = link.sub('', text.replace("|", " "))

      # Strip punctuation
      text = punctuation.sub('', text)

    return text.strip()

  def lem_stp(texts):
    # Parser and NER are disabled because only lemmas and stop-word flags are read.
    for doc in nlp.pipe(texts, disable=["parser", "ner"], batch_size=512):
      yield " ".join(d.lemma_ for d in doc if not d.is_stop)

  cleaned = df[column].apply(clean)
  cleaned = pd.Series(list(lem_stp(cleaned)), index=df.index, name=column, dtype=object)
  cleaned = cleaned.apply(lambda x: whitespace.sub(" ", x.strip()))
  # Too-short texts are flagged with None instead of being removed here, so
  # the result stays aligned with `df`.
  cleaned = cleaned.apply(lambda x: x if len(x.split()) >= 5 else None)

  return cleaned

In [None]:
# Store the cleaned text in a new `Cleaned` column.
df["Cleaned"] = clean_data(df, "review")

### Reset the index values

In [None]:
# clean_data marks reviews with fewer than five remaining words as None; drop
# those rows before re-indexing so the tokenizer only ever receives strings.
df = df.dropna(subset=["Cleaned"]).reset_index(drop=True)

In [None]:
# Display the frame; the rendered output elides the middle rows.
df

Unnamed: 0,review,sentiment,Cleaned
0,not know film meager rating IMDb film accompan...,positive,not know film meager rating IMDb film accompan...
1,long time like good canadian actor head south ...,positive,long time like good canadian actor head south ...
2,Terry Gilliams David Peoples team create intel...,positive,Terry Gilliams David Peoples team create intel...
3,antiestablishment film produce time colourless...,positive,antiestablishment film produce time colourless...
4,movie 48 year end Civil Warmost likely anticip...,positive,movie 48 year end Civil Warmost likely anticip...
...,...,...,...
995,waste time watch want study wrong thing not go...,negative,waste time watch want study wrong thing not go...
996,movie happen HBO yesterday watch mistake guess...,negative,movie happen HBO yesterday watch mistake guess...
997,John Travolta reprise role Chili Palmer Hollyw...,negative,John Travolta reprise role Chili Palmer Hollyw...
998,Puerto rican bad documentary ve see type peopl...,negative,Puerto rican bad documentary ve see type peopl...


### Calculate the Sequence length

In [None]:
# Word count of the text that is actually tokenized. The `.str` accessor is
# null-safe, so a row whose cleaned text is missing yields NaN instead of
# raising AttributeError on `None.split()`.
df["len"] = df["Cleaned"].str.split().str.len()

In [None]:
# Box plot of the per-review word counts stored in df["len"].
px.box(df["len"])

### Tokenization

In [None]:
checkpoint = "bert-base-uncased"
sequence_length = 512

def tokenize(samples):
    """Tokenize a list of texts into fixed-length model inputs.

    Uses the module-level ``checkpoint`` to pick the tokenizer and
    ``sequence_length`` as the length every sample is padded or truncated to.

    Args:
        samples: List of strings to encode.

    Returns:
        Dict with ``"input_ids"`` and ``"attention_mask"``, each a list with
        one ``sequence_length``-long list of ints per sample.
    """

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # Some tokenizers (e.g. GPT-2) ship without a padding token. Reuse the
    # end-of-sequence token rather than adding a brand-new '[PAD]' entry: a
    # new token gets an id beyond the pretrained embedding table, which the
    # model cannot look up unless its embeddings are resized.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    tokens = tokenizer(
      samples,
      max_length=sequence_length,
      truncation=True,
      padding="max_length",
      add_special_tokens=True,
      return_tensors="np"
    )

    # Convert the NumPy arrays to nested lists so each row fits in one DataFrame cell.
    return {key: tokens[key].tolist() for key in ("input_ids", "attention_mask")}

In [None]:
# One row per review, holding its token-id list and attention-mask list.
X = pd.DataFrame(tokenize(df["Cleaned"].tolist()), columns=["input_ids", "attention_mask"])

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Inspect the tokenized features; each cell holds a list of ints.
X

Unnamed: 0,input_ids,attention_mask
0,"[101, 2025, 2113, 2143, 2033, 17325, 5790, 100...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,"[101, 2146, 2051, 2066, 2204, 3010, 3364, 2132...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,"[101, 6609, 12267, 25107, 2015, 2585, 7243, 21...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,"[101, 3424, 4355, 7875, 13602, 3672, 2143, 396...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,"[101, 3185, 4466, 2095, 2203, 2942, 4010, 1412...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...
995,"[101, 5949, 2051, 3422, 2215, 2817, 3308, 2518...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
996,"[101, 3185, 4148, 14633, 7483, 3422, 6707, 398...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
997,"[101, 2198, 19817, 11431, 27914, 2050, 16851, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
998,"[101, 5984, 13641, 2919, 4516, 2310, 2156, 282...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


### Taking X and Y

In [None]:
def unzip_x(x):
    """Stack the per-row lists of ``x`` into ``[input_ids, attention_mask]`` 2-D arrays."""
    return [np.vstack(x[name]) for name in ("input_ids", "attention_mask")]

In [None]:
# Encode labels as integers. The identity entries (1 -> 1, 0 -> 0) make the
# cell idempotent: re-running it on already-encoded labels leaves them
# unchanged instead of collapsing every row to 0. Any other label becomes NaN
# rather than being silently treated as negative.
df["sentiment"] = df["sentiment"].map({"positive": 1, "negative": 0, 1: 1, 0: 0})

In [None]:
# Target vector as a NumPy array, in the same row order as `df`.
y = df["sentiment"].values

### Splitting Data into Train/Test

In [None]:
# Shuffled 80/20 split with a fixed seed, stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y)

# Turn each split from a DataFrame of per-row lists into [input_ids, attention_mask] 2-D arrays.
X_train, X_test = unzip_x(X_train), unzip_x(X_test)

### Building Model

In [None]:
def build_model(df, targets, checkpoint, sequence_length, learning_rate=2e-5):
  """Build and compile a Keras classifier on top of a pretrained transformer.

  Args:
    df: DataFrame containing the label column; only used to count classes.
    targets: Name of the label column. Also used to name the output layer
      (``f"{targets}_outputs"``).
    checkpoint: Hugging Face checkpoint name to load.
    sequence_length: Length of the ``input_ids`` / ``attention_mask`` inputs.
    learning_rate: Adam learning rate. Defaults to a small value suited to
      fine-tuning; Adam's own default is considerably larger.

  Returns:
    A compiled ``tf.keras`` Model taking ``[input_ids, attention_mask]``.

  Side effects:
    Writes a diagram of the model to ``model.png``.
  """

  base_model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)

  input_ids = Input(shape=(sequence_length,), name="input_ids", dtype="int32")
  attention_mask = Input(shape=(sequence_length,), name="attention_mask", dtype="int32")

  if checkpoint == "gpt2":
    # First element of the transformer outputs (presumably the per-token
    # hidden states — confirm); Flatten below collapses it to one vector.
    features = base_model.transformer(input_ids, attention_mask=attention_mask)[0]
  else:
    # Second element of the BERT main-layer outputs (the pooler output).
    features = base_model.bert(input_ids, attention_mask=attention_mask)[1]

  features = Flatten()(features)

  n_classes = int(df[targets].nunique())

  if n_classes > 2:
    units = n_classes
    activation = "softmax"
    loss = "sparse_categorical_crossentropy"
  else:
    # Binary case: a single sigmoid unit. Fixed at 1 (instead of
    # `n_classes - 1`) so a frame with only one label value present does not
    # produce a zero-width output layer.
    units = 1
    activation = "sigmoid"
    loss = "binary_crossentropy"

  outputs = Dense(units, activation=activation, name=f"{targets}_outputs")(features)

  model = Model(inputs=[input_ids, attention_mask], outputs=outputs)

  optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

  model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])

  tf.keras.utils.plot_model(model, to_file ="model.png", show_shapes = True , show_dtype = True,
                            show_layer_names = True, expand_nested = True, dpi = 300,
                            show_layer_activations = True, show_trainable = True)

  return model

In [None]:
# Build the classifier for the `sentiment` column using the checkpoint and sequence length set earlier.
model = build_model(df, "sentiment", checkpoint, sequence_length)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KerasTensor(type_spec=TensorSpec(shape=(None, 768), dtype=tf.float32, name=None), name='bert/pooler/dense/Tanh:0', description="created by layer 'bert'")


### Model Summary

In [None]:
# Print the layer-by-layer summary with output shapes and parameter counts.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 512)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 512)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  109482240   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 512,                                         

### Callbacks

In [None]:
def callbacks(run_name: str = "run 1", save_path="models") -> list:
    """Create the Keras callbacks used during training.

    Args:
        run_name: Sub-directory name for this run's TensorBoard logs, so
            separate runs do not write into the same log directory.
        save_path: Directory the best model is saved to; logs go under
            ``<save_path>/logs/<run_name>``.

    Returns:
        ``[ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, TensorBoard]``,
        all monitoring ``val_loss`` where applicable.
    """

    save_path = Path(save_path)
    logs_root = save_path / "logs"
    logs_root.mkdir(parents=True, exist_ok=True)

    # Named `best_model_saver` to avoid shadowing the module-level
    # `checkpoint` string that holds the model name.
    best_model_saver = ModelCheckpoint(save_path, monitor="val_loss", save_best_only=True,
                                       verbose=1)

    early_stopper = EarlyStopping(monitor="val_loss", verbose=1, restore_best_weights=True,
                                  patience=5)

    tensorboard_logger = TensorBoard(logs_root / run_name, histogram_freq=2, write_graph=True,
                                     write_images=True)

    # Cut the learning rate to a tenth after two epochs without a val_loss improvement.
    lr_scheduler = ReduceLROnPlateau(monitor="val_loss", factor=0.1, patience=2, verbose=1,
                                     min_delta=0.0001, cooldown=0, min_lr=0)

    return [best_model_saver, early_stopper, lr_scheduler, tensorboard_logger]

### Training the model

In [None]:
# Inspect the training labels.
y_train

In [None]:
# Train for a single epoch in batches of 8.
# NOTE(review): the test split is passed as validation data, so the
# `val_loss`-driven callbacks see the same data later used for evaluation.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=1, batch_size=8, callbacks=callbacks())

  2/100 [..............................] - ETA: 1:41:58 - loss: 1.0896 - accuracy: 0.6250