# Dependencies
List of dependencies and codes to run in order to get this notebook to work in a GoogleColab environment

In [1]:
# Mount Your Google drive
from google.colab import drive
drive.mount('/content/drive')

# Install library to be use
!pip install transformers

Mounted at /content/drive
Collecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m45.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m34.

# Blabber Cleaning
#### (By: Mark Ehab Aziz)
#### (Built Under: Python 3.11.4)
Filtering out and cleaning text data.
As tasked inside the 'to do.txt'.

Ensure the presence of nltk package using `pip install nltk`.

Following usage of nltk should not require further dependencies than the basic install and stopwords.

If anything; Ensure presence of `nltk`, the download for stopwords is within the cells and will download automatically should it not detect any instance of predownloaded stopwords for itself.

## Note:
You may encounter (Window Not Responding), in which case; kindly wait for it, as the notebook's size seems to increase by a lot after running the stemmer.

In [2]:
# Importing Libraries
import pandas as pd                         # Loading Data
import numpy as np
import nltk                                 # Required to download stopwords set
from nltk.corpus import stopwords           # Load Stopwords
from nltk.tokenize import regexp_tokenize   # To Tokenize words with Regex Expressions
from nltk.tokenize import word_tokenize     # Tokenizer too
from nltk.stem import PorterStemmer         # Stemming words
from nltk.stem import SnowballStemmer       # Improved Stemming
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Loading data into environment
# Using two methods (As stated in my previous projects)
# 1. Path working within my git repo
#blab = pd.read_csv("../dataset/train.csv")

# 2. Path when data is within the same folder
#blab = pd.read_csv("./train.csv")

# 3. Path within Google Colab
blab = pd.read_csv('./drive/MyDrive/train.csv')

# Data Exploration
Using `.head(n)` to show the first $n^{th}$ rows of the dataset.

In [4]:
# Defining n rows to see
n = 5

# Showing head
blab.head(n)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


As stated by our todo list, we are only tasked with cleaning of the text, so we'll be focusing on `comment_text`.

Referring to our todo list once again, we will be dropping `id`, `toxic`, `severe_toxic`, `obscene`, `threat`, `insult`, and `identity_hate`; as we are not concerned with classifying the sentiment or the meaning behind any of the comments.

Reminder for what to be done:
- Read Text
- Clean Text (Capitalisation, punctuation)
- Remove Stop Words
- Tokenization
- Stemming

Under no aforementioned task will we be using the columns I have mentioned to drop.

In [5]:
# Defining list of columns to be dropped
col_droppable = ["id"]

# Dropping
txt_blab = blab.drop(columns = col_droppable)

# Viewing
txt_blab.head(n)

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
# Removing '\n' '\r' '\t' from every line
txt_blab.replace(r'[\r\n\t]', ' ', regex = True, inplace=True)

# As noted, there are no escape characters for spaces, as new line or tab
txt_blab.head(n)

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation Why the edits made under my usern...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,""" More I can't make any real suggestions on ...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


# Cleaning Above Sentences
Using the NLTK library for Python; will be copy-pasting or creating patterns that are enough to extract words, starting with either upper or lower case letters.

This may violate the order of operations specified in the ToDo list, as cleaning data preceeds tokenization, but `regexp_tokenize()` takes care of both steps anyway, through just matching what is specified within the regex, as only 'Latin Alphabet' ranges are specified (`A-Za-z`), it will automatically unmatch any special character or non-alphabet character, ignores punctuation as well.

Will also be removing the URLs as specified.

In [7]:
# Defining Regex patterns
# Match words starting with Uppercase letters
upper_words = r"([A-Z])\w+"

# Match Words that start with either Upper/lowercase letters
upper_lower_words = r"[A-Za-z]\w+"

# Match URLs
url_pattern = r"(http|ftp|https):\/\/([\w+?\.\w+])+([a-zA-Z0-9\~\!\@\#\$\%\^\&\*\(\)_\-\=\+\\\/\?\.\:\;\'\,]*)?"

In [8]:
# Removing URLs (Standard URL Scheme, There still exist instances
# of just 'https' or 'http' randomly written, they will just be
# treated like normal words and tokenized as the rest)
txt_blab.replace(url_pattern, '', regex = True, inplace = True)

txt_blab.head(n)

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation Why the edits made under my usern...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,""" More I can't make any real suggestions on ...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


# Tokenization
Iterating over each row of the given textual data, accessing as a string instead of a usual row in order to yield the full entry.

Using regex to tokenize words by matching pattern.

In [9]:
# Instantiate a list of tokens, to hold tokens of each entry
# Probably better to use a dictionary if we care about count (?)
# Still have to access every token and change to lower (Taken care of in flat list)
token_per_row = []

# Start and finish indecies of iterator
# Bound to become the length of the file eventually
for i in range(txt_blab.shape[0]):
    # Grab string fully from dataframe
    line = txt_blab.iloc[i,0]

    # Append list of tokens
    # 2D list of lists; each containing tokens of each row
    token_per_row.append(regexp_tokenize(line, upper_lower_words))

In [10]:
# Flatten the 2D List of Lists
# n^2 operation but still gets the job done
# Would be better to flatten as soon as
# the tokens are fresh out the tokenizer
def flatten(list_o_lists):
    # init flat list
    flat = []

    # Loop over every list within the list
    for sublist in list_o_lists:
        # Loop over every token within the sublist
        # being iterated on
        for token in sublist:
            # Append token to flat list
            flat.append(token)

    return flat

In [11]:
# Call the List flatter
flat_tokens = flatten(token_per_row)

print(flat_tokens[0:20])

['Explanation', 'Why', 'the', 'edits', 'made', 'under', 'my', 'username', 'Hardcore', 'Metallica', 'Fan', 'were', 'reverted', 'They', 'weren', 'vandalisms', 'just', 'closure', 'on', 'some']


In [12]:
# Downloading the stopwords
# Already installed so will comment it out
nltk.download('stopwords')

# Need to remove stop words too
# changing the stopwords from a list to a set (Performance Upgrade)
ENGLISH_STOPWORDS = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Stopword Removal
Prior to removing stopwords, one has to change the case of the tokens (words) to be lowercase, which is also what is asked of us to do within the ToDo list.

In [13]:
# Changing all words into lowercase
# Using a list comprehension to change it more efficienty
# (They're better than for loops)
flat_tokens = [token.lower() for token in flat_tokens]

# Was 'Explanation', should be 'explanation'
print(flat_tokens[0])

explanation


In [14]:
# Actually removing stopwords
stop_free = [token for token in flat_tokens if token not in ENGLISH_STOPWORDS]

# Getting how many words remained after removal of stopwords
print(len(stop_free))

# Getting how many words were tokenized (Both stop and non-stop)
print(len(flat_tokens))

5425847
10130248


# Stemming
Changing the word back to its roots. Through using the 'Porter Algorithm'. (Fast and Effective, Not very accurate)

In [15]:
# Using porterstemmer
stemmer = PorterStemmer()

# Inserting the semmed words into a list
# Will be dividing into "batches" due to
# reaching maximum recursion depth if
# if all entries are sent at once
stemmed_words0 = [stemmer.stem(word) for word in stop_free[:1000000]]
print('Length of list 1: {}'.format(len(stemmed_words0)))
stemmed_words1 = [stemmer.stem(word) for word in stop_free[1000000:2000000]]
print('Length of list 2: {}'.format(len(stemmed_words1)))
stemmed_words2 = [stemmer.stem(word) for word in stop_free[2000000:3000000]]
print('Length of list 3: {}'.format(len(stemmed_words2)))

# Explaining the 500k word skip:
# Kernel would throw an error due to "Reaching Maximum Recursion Depth"
# on coming across a certain word which seems to have it lock up
# after changing the indecies a little, this configuration works best
# could probably be fine tuned to find the word that messes it up
stemmed_words3 = [stemmer.stem(word) for word in stop_free[3000000:3500000]]
print('Length of list 4: {}'.format(len(stemmed_words3)))

stemmed_words4 = [stemmer.stem(word) for word in stop_free[4000000:5000000]]
print('Length of list 5: {}'.format(len(stemmed_words4)))
stemmed_words5 = [stemmer.stem(word) for word in stop_free[5000000:]]
print('Length of list 6: {}'.format(len(stemmed_words5)))

Length of list 1: 1000000
Length of list 2: 1000000
Length of list 3: 1000000
Length of list 4: 500000
Length of list 5: 1000000
Length of list 6: 425847


In [16]:
# Joining lists
total_stemmed_words = stemmed_words0 + stemmed_words1 + stemmed_words2 + stemmed_words3 + stemmed_words4 + stemmed_words5

# Printing number of words
print('Total number of stemmed words: {}'.format(len(total_stemmed_words)))

# Displaying some words
print(total_stemmed_words[0:10])

Total number of stemmed words: 4925847
['explan', 'edit', 'made', 'usernam', 'hardcor', 'metallica', 'fan', 'revert', 'vandal', 'closur']


As we can see, a lot of words are either missing an e at the end, or not even english anymore, that is due to Stemmer using a crude old method, which is aimed for speed and efficiency, unlike lemmatizaton which morphologically analyses lexical changes in words to revert them back to their roots, unlike the chopping of "commonly found prefixes/suffixes" which stemming does.

After a bit more searching and digging around, within the NLTK package there exists another variant of the PorterStemmer called SnowballStemmer, which fixes the above issues regarding missing an e or plain out non-english words.

It will be used over the porterstemmer for the following tasks, whilst keeping the porter stemmer cells to highlight the differences.

# Assignment Continuation
Relevant tasks were assigned to be done on the same notebook, the are present within the `README.md`, but as a quick reminder I will list them here.

1. Bag of Words
2. Word Embeddings
3. Use BERT and Evaluate

# Variable Definition
We will be using the following variables carried over from the previous part of this notebook, namely:
- `ENGLISH_STOPWORDS`: Constant for holding the stopwords found in the English language.
- `flat_tokens`: Tokens from every row parsed into a single list of tokens.
- `stop_free`: List of tokens free of stopwords.
- `total_stemmed_words`: List of stemmed words from tokens.
- `txt_blab`: Dataframe with cleaned comments.

In [17]:
# Instantiating a stemmer object
snowball = SnowballStemmer("english")

# Iterate over the flattened list of words
total_stemmed_words = [snowball.stem(word) for word in stop_free]

In [18]:
# Printing number of words
print(len(total_stemmed_words))

# Print some words
print(total_stemmed_words[0:10])

5425847
['explan', 'edit', 'made', 'usernam', 'hardcor', 'metallica', 'fan', 'revert', 'vandal', 'closur']


In [19]:
# Manually implementing a bag of words
# Essentially just a dictionary with word count
bag_o_words = {}

for token in total_stemmed_words:
    if token in bag_o_words:
        bag_o_words[token] += 1
    else:
        bag_o_words[token] = 1

In [20]:
# Parsing into a dataframe
bow_df = pd.DataFrame.from_dict(bag_o_words,
                                orient = 'index',
                                columns = ['frequency']
                                ).sort_values(
                                    by = 'frequency',
                                    ascending = False)

display(bow_df.head(10), bow_df.shape)

Unnamed: 0,frequency
articl,74137
page,57083
wikipedia,46144
edit,41537
talk,39517
use,35162
one,30731
like,30476
pleas,29969
would,29322


(135213, 1)

# Working on Dataframe itself
Previous cells worked on extracted rows within the `txt_blab` dataframe, therefore just piling words on top of words aimlessly, though it can be considered some sort of analysis of word count if need be.

Following this cell onwards, similar work will be done on the rows *within* the dataframe, instead of completely extracting.

Task at hand, later down the line, is to classify the data if it's toxic or normal discussion.

Hence collapsing the columns beyond `toxic` onto it through sum, if it's not $0$ it will be considered toxic, with a variable degree.

In [21]:
# Collepsing the column values onto toxic
txt_blab['toxic'] = txt_blab['severe_toxic'] + txt_blab['obscene'] + txt_blab['threat'] + txt_blab['insult'] + txt_blab['identity_hate']

In [22]:
# Dropping collapsed columns
collapsed_cols = ['severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

txt_blab.drop(columns = collapsed_cols, inplace = True)

display(txt_blab.head(), txt_blab.describe())

Unnamed: 0,comment_text,toxic
0,Explanation Why the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,""" More I can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


Unnamed: 0,toxic
count,159571.0
mean,0.124108
std,0.513515
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,5.0


As we can see, from the collapse, values larger than 1 arose, therefore altering their value to 0 would be optimum.

In [23]:
txt_blab['toxic'] = np.where(txt_blab['toxic'] > 1, txt_blab['toxic'] - (txt_blab['toxic'] - 1), txt_blab['toxic'])

display(txt_blab.head(), txt_blab.describe())

Unnamed: 0,comment_text,toxic
0,Explanation Why the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,""" More I can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


Unnamed: 0,toxic
count,159571.0
mean,0.066171
std,0.248582
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [24]:
# Downloading the english words
# To avoid random words later
nltk.download('words')
ENGLISH_WORDS = set(nltk.corpus.words.words())

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [25]:
# Tokenizing words per row
txt_blab['tokens'] = [regexp_tokenize(row, upper_lower_words) for row in txt_blab['comment_text']]

txt_blab.head()

Unnamed: 0,comment_text,toxic,tokens
0,Explanation Why the edits made under my usern...,0,"[Explanation, Why, the, edits, made, under, my..."
1,D'aww! He matches this background colour I'm s...,0,"[aww, He, matches, this, background, colour, s..."
2,"Hey man, I'm really not trying to edit war. It...",0,"[Hey, man, really, not, trying, to, edit, war,..."
3,""" More I can't make any real suggestions on ...",0,"[More, can, make, any, real, suggestions, on, ..."
4,"You, sir, are my hero. Any chance you remember...",0,"[You, sir, are, my, hero, Any, chance, you, re..."


In [26]:
# Change words within a list to be lowercase
def list_lower(word_list):
    return [word.lower() for word in word_list]

# Remove stopwords from a list of tokens
def stopwordless(word_list):
    return [word for word in word_list if word not in ENGLISH_STOPWORDS]

# Stem tokens within a list
def list_stem(word_list):
    return [snowball.stem(token) for token in word_list]

# Removing words that do not belong to the english language
# Or completely random words..
def englishify(token_list):
    return [word for word in token_list if word in ENGLISH_WORDS]

In [27]:
# Lower casing each word
txt_blab['tokens'] = [list_lower(row) for row in txt_blab['tokens']]

txt_blab.head()

Unnamed: 0,comment_text,toxic,tokens
0,Explanation Why the edits made under my usern...,0,"[explanation, why, the, edits, made, under, my..."
1,D'aww! He matches this background colour I'm s...,0,"[aww, he, matches, this, background, colour, s..."
2,"Hey man, I'm really not trying to edit war. It...",0,"[hey, man, really, not, trying, to, edit, war,..."
3,""" More I can't make any real suggestions on ...",0,"[more, can, make, any, real, suggestions, on, ..."
4,"You, sir, are my hero. Any chance you remember...",0,"[you, sir, are, my, hero, any, chance, you, re..."


In [28]:
txt_blab['tokens'] = [englishify(row) for row in txt_blab['tokens']]

txt_blab.head()

Unnamed: 0,comment_text,toxic,tokens
0,Explanation Why the edits made under my usern...,0,"[explanation, why, the, made, under, my, fan, ..."
1,D'aww! He matches this background colour I'm s...,0,"[he, this, background, colour, seemingly, stuc..."
2,"Hey man, I'm really not trying to edit war. It...",0,"[hey, man, really, not, trying, to, edit, war,..."
3,""" More I can't make any real suggestions on ...",0,"[more, can, make, any, real, on, improvement, ..."
4,"You, sir, are my hero. Any chance you remember...",0,"[you, sir, are, my, hero, any, chance, you, re..."


In [29]:
txt_blab['stems'] = [list_stem(stopwordless(row)) for row in txt_blab['tokens']]

txt_blab.head()

Unnamed: 0,comment_text,toxic,tokens,stems
0,Explanation Why the edits made under my usern...,0,"[explanation, why, the, made, under, my, fan, ...","[explan, made, fan, closur, gas, new, york, pl..."
1,D'aww! He matches this background colour I'm s...,0,"[he, this, background, colour, seemingly, stuc...","[background, colour, seem, stuck, thank, talk]"
2,"Hey man, I'm really not trying to edit war. It...",0,"[hey, man, really, not, trying, to, edit, war,...","[hey, man, realli, tri, edit, war, guy, consta..."
3,""" More I can't make any real suggestions on ...",0,"[more, can, make, any, real, on, improvement, ...","[make, real, improv, section, statist, later, ..."
4,"You, sir, are my hero. Any chance you remember...",0,"[you, sir, are, my, hero, any, chance, you, re...","[sir, hero, chanc, rememb, page]"


In [30]:
# Making a dummy tokenizer in order to make the vectorizer work
# returns what it takes (We already have tokenized sentences)
def identity_tokenizer(text):
    return text

In [31]:
# Instantiate the vectorizer with custom params
tfidf_vec = TfidfVectorizer(tokenizer=identity_tokenizer, lowercase = False)

# Fit the vectorizer
# Fitting on Unstemmed words (idk not sure)
transformFit = tfidf_vec.fit_transform(txt_blab.iloc[:10000, 3])

# Storing the tfidf values
tfidf_vals = pd.DataFrame(transformFit.toarray().transpose(), tfidf_vec.get_feature_names_out()) # type: ignore



In [32]:
print(tfidf_vals)

          0     1     2     3     4     5     6     7     8     9     ...  \
aa         0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
aardvark   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
aba        0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
aback      0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
abandon    0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
...        ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
zodiac     0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
zone       0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
zoo        0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
zoolog     0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
zoom       0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   

          9990  9991  9992  9993  9994  9995  9996  9997  9998  9999  
aa  

# BERT
Loading, Training, Fitting, BERT onto the data.

Labels are under the column of `toxic`, where 1 stands for toxic, and 0 is for non-toxic.

We will be passing the whole comment text to our BERT model as we will be importing its own tokenizer.

In [33]:
import torch                                              # Torch
import torch.nn as nn                                     # Torch's NN
from sklearn.model_selection import train_test_split      # Splitting Data
from transformers import AutoModel, BertTokenizerFast     # BERT Related imports
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW                            # Optimizer from Huggingface
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report

In [34]:
# Tell Torch to use a cuda device
gpu = torch.device("cuda")

In [35]:
# split train dataset into train, validation and test sets
train_text, temp_text, train_labels, temp_labels = train_test_split(txt_blab['comment_text'], txt_blab['toxic'],
                                                                    random_state = 10,
                                                                    test_size = 0.3)


val_text, test_text, val_labels, test_labels = train_test_split(temp_text, temp_labels,
                                                                random_state = 1,
                                                                test_size = 0.5,
                                                                stratify = temp_labels)

In [36]:
# import BERT-base pretrained model
bert = AutoModel.from_pretrained('bert-base-uncased')

# Load the BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [37]:
# tokenize and encode sequences in the training set
tokens_train = tokenizer.batch_encode_plus(
    train_text.tolist(),
    max_length = 25,
    padding='max_length',
    truncation=True
)

# tokenize and encode sequences in the validation set
tokens_val = tokenizer.batch_encode_plus(
    val_text.tolist(),
    max_length = 25,
    padding='max_length',
    truncation=True
)

# tokenize and encode sequences in the test set
tokens_test = tokenizer.batch_encode_plus(
    test_text.tolist(),
    max_length = 25,
    padding='max_length',
    truncation=True
)

In [38]:
# convert lists to tensors

train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels.tolist())

val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(val_labels.tolist())

test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(test_labels.tolist())

In [39]:
#define a batch size
batch_size = 32

# wrap tensors
train_data = TensorDataset(train_seq, train_mask, train_y)

# sampler for sampling the data during training
train_sampler = RandomSampler(train_data)

# dataLoader for train set
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# wrap tensors
val_data = TensorDataset(val_seq, val_mask, val_y)

# sampler for sampling the data during training
val_sampler = SequentialSampler(val_data)

# dataLoader for validation set
val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size=batch_size)

In [40]:
# freeze all the parameters
for param in bert.parameters():
    param.requires_grad = False

In [41]:
class BERT_Arch(nn.Module):

    def __init__(self, bert):
        super(BERT_Arch, self).__init__()

        self.bert = bert

        # dropout layer
        self.dropout = nn.Dropout(0.1)

        # relu activation function
        self.relu =  nn.ReLU()

        # dense layer 1
        self.fc1 = nn.Linear(768,512)

        # dense layer 2 (Output layer)
        self.fc2 = nn.Linear(512,2)

        #softmax activation function
        self.softmax = nn.LogSoftmax(dim=1)

    #define the forward pass
    def forward(self, sent_id, mask):

        #pass the inputs to the model
        _, cls_hs = self.bert(sent_id, attention_mask=mask, return_dict=False)

        x = self.fc1(cls_hs)

        x = self.relu(x)

        x = self.dropout(x)

        # output layer
        x = self.fc2(x)

        # apply softmax activation
        x = self.softmax(x)

        return x

In [42]:
# pass the pre-trained BERT to our define architecture
model = BERT_Arch(bert)

# push the model to GPU
model = model.to(gpu)

In [43]:
optimizer = AdamW(model.parameters(),lr = 0.00001)



In [44]:
#compute the class weights
class_weights = compute_class_weight(class_weight = "balanced",
                                        classes = np.unique(train_labels),
                                        y = train_labels)

print("Class Weights:",class_weights)

Class Weights: [0.53567523 7.50766232]


In [45]:
# converting list of class weights to a tensor
weights= torch.tensor(class_weights,dtype=torch.float)

# push to GPU
weights = weights.to(gpu)

# define the loss function
cross_entropy  = nn.NLLLoss(weight=weights)

# number of training epochs
epochs = 10

In [46]:
# function to train the model
def train():

    model.train()
    total_loss, total_accuracy = 0, 0

    # empty list to save model predictions
    total_preds=[]

    # iterate over batches
    for step,batch in enumerate(train_dataloader):

        # progress update after every 50 batches.
        if step % 50 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

        # push the batch to gpu
        batch = [r.to(gpu) for r in batch]

        sent_id, mask, labels = batch

        # clear previously calculated gradients
        model.zero_grad()

        # get model predictions for the current batch
        preds = model(sent_id, mask)

        # compute the loss between actual and predicted values
        loss = cross_entropy(preds, labels)

        # add on to the total loss
        total_loss = total_loss + loss.item()

        # backward pass to calculate the gradients
        loss.backward()

        # clip the the gradients to 1.0. It helps in preventing the exploding gradient problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # update parameters
        optimizer.step()

        # model predictions are stored on GPU. So, push it to CPU
        preds=preds.detach().cpu().numpy()

    # append the model predictions
    total_preds.append(preds)

    # compute the training loss of the epoch
    avg_loss = total_loss / len(train_dataloader)

      # predictions are in the form of (no. of batches, size of batch, no. of classes).
      # reshape the predictions in form of (number of samples, no. of classes)
    total_preds  = np.concatenate(total_preds, axis=0)

    #returns the loss and predictions
    return avg_loss, total_preds

In [47]:
# function for evaluating the model
def evaluate():

    print("\nEvaluating...")

    # deactivate dropout layers
    model.eval()

    total_loss, total_accuracy = 0, 0

    # empty list to save the model predictions
    total_preds = []

    # iterate over batches
    for step,batch in enumerate(val_dataloader):
        # push the batch to gpu
        batch = [t.to(gpu) for t in batch]

        sent_id, mask, labels = batch

        # deactivate autograd
        with torch.no_grad():

            # model predictions
            preds = model(sent_id, mask)

            # compute the validation loss between actual and predicted values
            loss = cross_entropy(preds,labels)

            total_loss = total_loss + loss.item()

            preds = preds.detach().cpu().numpy()

            total_preds.append(preds)

    # compute the validation loss of the epoch
    avg_loss = total_loss / len(val_dataloader)

    # reshape the predictions in form of (number of samples, no. of classes)
    total_preds  = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

In [48]:
# set initial loss to infinite
best_valid_loss = float('inf')

# empty lists to store training and validation loss of each epoch
train_losses=[]
valid_losses=[]

#for each epoch
for epoch in range(epochs):

    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    #train model
    train_loss, _ = train()

    #evaluate model
    valid_loss, _ = evaluate()

    #save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    # append training and validation loss
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')


 Epoch 1 / 10
  Batch    50  of  3,491.
  Batch   100  of  3,491.
  Batch   150  of  3,491.
  Batch   200  of  3,491.
  Batch   250  of  3,491.
  Batch   300  of  3,491.
  Batch   350  of  3,491.
  Batch   400  of  3,491.
  Batch   450  of  3,491.
  Batch   500  of  3,491.
  Batch   550  of  3,491.
  Batch   600  of  3,491.
  Batch   650  of  3,491.
  Batch   700  of  3,491.
  Batch   750  of  3,491.
  Batch   800  of  3,491.
  Batch   850  of  3,491.
  Batch   900  of  3,491.
  Batch   950  of  3,491.
  Batch 1,000  of  3,491.
  Batch 1,050  of  3,491.
  Batch 1,100  of  3,491.
  Batch 1,150  of  3,491.
  Batch 1,200  of  3,491.
  Batch 1,250  of  3,491.
  Batch 1,300  of  3,491.
  Batch 1,350  of  3,491.
  Batch 1,400  of  3,491.
  Batch 1,450  of  3,491.
  Batch 1,500  of  3,491.
  Batch 1,550  of  3,491.
  Batch 1,600  of  3,491.
  Batch 1,650  of  3,491.
  Batch 1,700  of  3,491.
  Batch 1,750  of  3,491.
  Batch 1,800  of  3,491.
  Batch 1,850  of  3,491.
  Batch 1,900  of  3,49

In [49]:
#load weights of best model
path = 'saved_weights.pt'
model.load_state_dict(torch.load(path))

<All keys matched successfully>

In [50]:
# get predictions for test data
with torch.no_grad():
    preds = model(test_seq.to(gpu), test_mask.to(gpu))
    preds = preds.detach().cpu().numpy()

OutOfMemoryError: ignored

In [None]:
# model's performance
preds = np.argmax(preds, axis = 1)
print(classification_report(test_y, preds))