<a href="https://colab.research.google.com/github/Satwikram/Transformers-Workshop/blob/main/Transformers%20-%20BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Author: Satwik Ram

### Setup

In [None]:
!pip install transformers

Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.1


In [None]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

### Connecting to Kaggle

In [None]:
from google.colab import files

files.upload()


! mkdir ~/.kaggle


! cp kaggle.json ~/.kaggle/

! chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


### Importing Dependencies

In [None]:
import numpy as np
import pandas as pd

import os
import re
from pathlib import Path

import tensorflow as tf

from tensorflow.keras.layers import Input, Flatten, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard, ReduceLROnPlateau

import spacy
from unicodedata import normalize

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from sklearn.model_selection import train_test_split

import plotly.express as px

from textblob import TextBlob

### Downloading the Dataset

[link text](https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews)

In [None]:
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 66% 17.0M/25.7M [00:00<00:00, 41.2MB/s]
100% 25.7M/25.7M [00:00<00:00, 54.0MB/s]


In [None]:
!unzip /content/imdb-dataset-of-50k-movie-reviews.zip

Archive:  /content/imdb-dataset-of-50k-movie-reviews.zip
  inflating: IMDB Dataset.csv        


In [None]:
df = pd.read_csv("/content/IMDB Dataset.csv")

In [None]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### Basic Info

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [None]:
positive = df[df['sentiment'] == "positive"].sample(n=500, random_state=42)
negative = df[df['sentiment'] == "negative"].sample(n=500, random_state=42)

In [None]:
positive.duplicated().sum()

0

In [None]:
negative.duplicated().sum()

0

In [None]:
positive.drop_duplicates(inplace=True)

In [None]:
negative.drop_duplicates(inplace=True)

In [None]:
positive.duplicated().sum()

0

In [None]:
negative.duplicated().sum()

0

In [None]:
df = pd.concat([positive, negative])

In [None]:
px.bar(df["sentiment"].value_counts())

### Cleaning the dataset


In [None]:
nlp = spacy.load("en_core_web_sm")

def clean_data(df, column):

  def lem_stp():

    for doc in nlp.pipe(df[column], disable=["parser", "ner"], batch_size=512):
        yield " ".join(
            [d.lemma_ for d in doc if not d.is_stop]
        )
  
  def remove_html_tags(text):

    pattern = re.compile('<.*?>')
    return pattern.sub(r'',text)


  def clean(text):

    text = str(text).strip()

    if text:
      
      #Remove HTML tags
      text = remove_html_tags(text)

      #Normalize Text
      text = normalize("NFKD", text)

      #Remove links 
      text = re.sub(r'https?:\/\/.*?[\s+]', '', text.replace("|"," ") + " ")

      #Strip Punctation
      text = re.sub(r'[^\w\s]','', text)

    return text.strip()

  df[column] = df[column].apply(clean)
  df[column] = list(lem_stp())
  df[column] = df[column].apply(lambda x: re.sub("\s+", " ", x.strip()))
  df[column] = df[column].apply(lambda x: x if len(x.split()) >= 5 else None)

  return df[column]

In [None]:
df["Cleaned"] = clean_data(df, "review")

In [None]:
df.reset_index(drop=True, inplace=True)

In [None]:
df

Unnamed: 0,review,sentiment,Cleaned,len
0,film silent movie certainly feel extremely ext...,positive,film silent movie certainly feel extremely ext...,52
1,New Years Eve tuberculous sister Salvation Arm...,positive,New Years Eve tuberculous sister Salvation Arm...,127
2,br br perfect film throwback glitxysterle roma...,positive,br br perfect film throwback glitxysterle roma...,90
3,gosh learn pretty fast film youngster 20 year ...,positive,gosh learn pretty fast film youngster 20 year ...,137
4,brilliant film great John Waters character unf...,positive,brilliant film great John Waters character unf...,19
...,...,...,...,...
995,s underlie current positive review movie brain...,negative,s underlie current positive review movie brain...,142
996,actually pretty funny god hell movie kind way ...,negative,actually pretty funny god hell movie kind way ...,127
997,John Carpenters career sad excuse movie indica...,negative,John Carpenters career sad excuse movie indica...,215
998,newly release dvd stay far away itbr br usuall...,negative,newly release dvd stay far away itbr br usuall...,61


In [None]:
df["len"] = df["review"].apply(lambda x: len(x.split()))

In [None]:
px.box(df["len"])

In [None]:
checkpoint = "bert-base-uncased"
sequence_length = 512

def tokenize(samples):

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    if checkpoint == "gpt2" and tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    tokens = tokenizer(
      samples,
      max_length=sequence_length,
      truncation=True,
      padding="max_length",
      add_special_tokens=True,
      return_tensors="np"
    )

    return {"input_ids": tokens["input_ids"].tolist(), "attention_mask": tokens["attention_mask"].tolist()}

In [136]:
X = pd.DataFrame(tokenize(df["Cleaned"].tolist()), columns=["input_ids", "attention_mask"])

In [137]:
X

Unnamed: 0,input_ids,attention_mask
0,"[101, 2143, 4333, 3185, 5121, 2514, 5186, 5186...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,"[101, 2047, 2086, 6574, 7270, 11890, 16203, 29...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,"[101, 7987, 7987, 3819, 2143, 5466, 5963, 1043...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,"[101, 2175, 4095, 4553, 3492, 3435, 2143, 2402...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,"[101, 8235, 2143, 2307, 2198, 5380, 2839, 4895...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...
995,"[101, 1055, 2104, 8751, 2783, 3893, 3319, 3185...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
996,"[101, 2941, 3492, 6057, 2643, 3109, 3185, 2785...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
997,"[101, 2198, 10533, 2015, 2476, 6517, 8016, 318...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
998,"[101, 4397, 2713, 4966, 2994, 2521, 2185, 2009...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [138]:
unzip_x = lambda x: [np.vstack(x["input_ids"]), np.vstack(x["attention_mask"])]

In [139]:
y = df["sentiment"].values

### Splitting Data into Train/Test

In [140]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y)

X_train, X_test = unzip_x(X_train), unzip_x(X_test)