# Transformers


In [1]:
# IMPORTING LIBRARIES
import torch 
from torch import nn
from torch.nn import LayerNorm,Dropout,Linear,Softmax
# import activation function softmax
import torch.nn.functional as F
from torch.nn import Embedding
import torch.nn.functional as F
from sklearn.metrics import f1_score

In [2]:
# Self Attention Architecture class
class MultiheadAttention(nn.Module):
    
    #define instructor
    def __init__(self, dk ,N_heads=9):
        #call instructor of parent (nn.module)
        super().__init__()
        #intialize parameters dk(dimention of Q , K , V) , number of heads
        self.dk , self.N_heads = dk , N_heads
        # initialize weights of Q , K , V matrics
        self.Wk = nn.Linear(dk , dk*N_heads , bias=False)
        self.Wq = nn.Linear(dk , dk*N_heads , bias=False)
        self.Wv = nn.Linear(dk , dk*N_heads , bias=False)
        
        self.Wo = nn.Linear(dk*N_heads , dk , bias=False)
        
    def forward(self , Emb):  
        b , t , k = Emb.size()  # where b is number of batches, t is size of input sequence length and k is number of dimensions
        h = self.N_heads        # where h number of heads
        dk = torch.tensor(k , dtype = torch.float32)
        #reshape each matrics
        keys = self.Wk(Emb).view(b,t,h,k) 
        queries = self.Wq(Emb).view(b,t,h,k)
        values = self.Wv(Emb).view(b,t,h,k)
    
        # now we want head next to batch,so transform keys,values and queries
        keys = keys.transpose(1,2).contiguous().view(b*h,t,k)
        queries = queries.transpose(1,2).contiguous().view(b*h,t,k)
        values = values.transpose(1,2).contiguous().view(b*h,t,k)
        
        #Apply self attention formula soft(QV/sqrt(dk))V
        # now calculate dot product using matmul function in pytorch
        QV = torch.matmul(queries,keys.transpose(-2,-1))
        
        #scale matrics by divide by sqrt(dk) 
        Scaled_QV = QV / torch.sqrt(dk)
        
        #applying softmax over columns
        Scaled_QV = F.softmax(QV,dim=-1)
        
        #apply dotproduct with V  
        out = torch.matmul(Scaled_QV,values).view(b,h,t,k)
        
        # now finally we want output in k dimensions as we have initially, so to do this again transspose out
        out = out.transpose(1,2).contiguous().view(b,t,h*k)
        out = self.Wo(out)
        
        return out  

In [3]:
# now build architecture for Encoder block
class Encoder(nn.Module):
    #define instructor
    def __init__(self,dk,N_heads=9):
        #call instructor of parent (nn.module)
        super().__init__()
        
        #take an object from multiheadattention class
        self.attention = MultiheadAttention(dk)
        
        # now add normalization layer to normalize outputs of attention layer
        self.norm1 = nn.LayerNorm(dk)
        self.norm2 = nn.LayerNorm(dk)
        
        # now make a fully connected multi layer 
        self.ff = nn.Sequential(nn.Linear(dk , 5*dk), nn.ReLU() , nn.Linear(5*dk,dk)) # fully connected layer for hidden states with relu activiation
        self.drop = nn.Dropout(0.5)
    def forward(self,Emb):
        #apply multihead attention mechanism on embeddings
        attention = self.attention(Emb)
        #norm and add layer
        Emb = self.norm1(attention+Emb)
        # dropout layer after normalization(drop some neurons to prevent from overfitting)
        Emb = self.drop(Emb) 
        #feed forward to neral network connection 
        perceptron = self.ff(Emb)
        #second norm and add
        Emb = self.norm2(perceptron + Emb)
        # dropout layer after normalization(drop some neurons to prevent from overfitting)
        Emb = self.drop(Emb)
        return Emb

In our task we don't need to decoder because our goal to classify comments to toxic and non-toxic

In [4]:
# Define a custom classification model as a PyTorch Module
class classify(nn.Module):
    def __init__(self, k, seq_length, num_tokens, depth, num_classes, max_pool=True, heads=9):
        super().__init__()
        self.num_tokens = num_tokens
        self.maxpool = max_pool

        # Define an embedding layer for token indices
        self.tokenemb = Embedding(embedding_dim=k, num_embeddings=num_tokens)

        # Define a position embedding layer
        self.posemb = Embedding(embedding_dim=k, num_embeddings=seq_length)

        # Create a list to hold multiple transformer blocks
        tfblocks = []

        # Create 'depth' number of transformer blocks
        for i in range(depth):
            tfblocks.append(Encoder(k))

        # Create a sequential layer containing the transformer blocks
        self.transform = nn.Sequential(*tfblocks)

        # Add a linear layer to convert output to the desired number of classes (e.g., 2 for binary classification)
        self.prob = nn.Linear(k, num_classes)

        # Add a dropout layer for regularization
        self.drop = nn.Dropout(0.5)

    def forward(self, x, y):
        # Token embedding layer: map input token indices to continuous vector representations
        tokens = self.tokenemb(x)
        b, t, k = tokens.size()

        # Position embedding: add positional information to the token embeddings
        positions = self.posemb(torch.arange(t, device=torch.device('cpu')))[None, :, :].expand(b, t, k)
        x = tokens + positions  # Combine token embeddings and position embeddings

        # Apply dropout for regularization
        x = self.drop(x)

        # Pass the data through the sequential transformer blocks
        x = self.transform(x)

        # Apply max-pooling or mean-pooling based on 'max_pool' flag
        x = x.max(dim=1)[0] if self.maxpool else x.mean(dim=1)

        # Pass the pooled output through a linear layer for classification
        x = self.prob(x)

        # Apply softmax activation to get class probabilities
        x = F.softmax(x, dim=1)

        # Calculate the CrossEntropy loss for classification
        loss = torch.nn.CrossEntropyLoss()
        loss = loss(x, y)

        return loss, x


In [5]:
import pandas as pd
import numpy as np
#import string library to deal with text data
import string
#import regular library for replace package
import re
#import stopwords package from nltk lib to use in clean text
from nltk.corpus import stopwords
#import wordpunct_tokenize package from nltk lib to use in clean text
from nltk.tokenize import wordpunct_tokenize
#import package to split data to train and test
from sklearn.model_selection import train_test_split

In [6]:
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [7]:
#download stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tarek\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
data = pd.read_csv('train.csv')

In [9]:
data

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [10]:
#select reviews columns names of comments
columns = data.columns.tolist()[2:]

In [11]:
#add column toxic to classify the comment
data['toxic'] = np.where(data[columns].sum(axis=1) > 0, 1, 0)

In [12]:
data

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [13]:
#this function to clean data
def text_clean(txt) :
  #remove urls from text
    txt = re.sub(r'(http|www)\S+', ' ', txt)
  #alphanumeric character pattern
    txt = re.sub("[^\w]" ,' ' , txt)
    txt = txt.split()
  #lower word and remove punctuation and numbers
    Clean = [word.lower() for word in txt
           if word not in string.punctuation
           and not word.isdigit() ]
  #remove stop words
    stopwords_Removed = [word for word in Clean if word not in stopwords.words('english')]
    sentence = ' '.join(stopwords_Removed)
    return sentence

In [14]:
#add column containing data after cleaning 
data['cleaned_comments'] = data['comment_text'].apply(lambda x : text_clean(x))

In [15]:
#select train data and labels
train_data = data['cleaned_comments'].values
labels = data['toxic'].values

In [16]:
# Split the dataset into training and testing sets with a 80-20 ratio
x_train, x_test, y_train, y_test = train_test_split(train_data, labels, test_size=0.2)

# Convert the training data and labels into NumPy arrays
train_data = np.array(x_train)

# Convert the NumPy array back into a Python list
train_data = list(train_data)

# Convert the labels into a Python list
labels = list(y_train)

# Create a test_data list that is a reference to the train_data list
test_data = train_data

# Print the first element of the test_data list
print(test_data[0])

# Initialize an empty list called final_data
final_data = []

# Import the regular expression (re) module
import re

# Iterate through each data element in test_data
for data in test_data:
    # Remove all digits (0-9) from the data using regular expressions
    new = re.sub('[0-9]', '', str(data))
    # Append the modified data to the final_data list
    final_data.append(new)

# Print the first element of the final_data list
print(final_data[0])

# Import the text and sequence modules from TensorFlow's Keras preprocessing library
from tensorflow.keras.preprocessing import text, sequence

# Initialize a Tokenizer
tokenize = text.Tokenizer()

# Fit the tokenizer on the text data in final_data
tokenize.fit_on_texts(texts=final_data)

# Convert the text data into sequences of integers based on the tokenizer's vocabulary
index_data = tokenize.texts_to_sequences(final_data)

# Get the word index from the tokenizer
word_index = tokenize.word_index

# Print the number of unique words in the vocabulary
print(len(word_index))

# Pad the sequences to a maximum length of 50
pad_sequences = sequence.pad_sequences(index_data, maxlen=50)


steven colbert funy useing first amendment right freedom speech reality article ever going unlocked
steven colbert funy useing first amendment right freedom speech reality article ever going unlocked
146957


In [17]:
len(pad_sequences)

127656

In [18]:
# initializing classify model for binary classification
classifier = classify(2,200,len(word_index)+1,12,2)

In [19]:
from torch.utils.data import DataLoader,SequentialSampler,TensorDataset,RandomSampler

In [20]:
# Create a PyTorch TensorDataset from the padded sequences and labels
dataset = torch.utils.data.TensorDataset(torch.LongTensor(pad_sequences), torch.LongTensor(labels))

In [21]:
# Create a DataLoader for batching the data
batch_data = DataLoader(batch_size=128,dataset = dataset, sampler = SequentialSampler(dataset))

In [22]:
# declare optimizer 
from torch.autograd import no_grad,Variable
from torch.optim import Optimizer
from torch.optim import Adam
optimizer = Adam(classifier.parameters(),lr=0.003)

In [23]:
# Convert x_test to a NumPy array
x_test = np.array(x_test)

# Convert the NumPy array back into a Python list
x_test = list(x_test)

# Convert y_test to a NumPy array
y_test = np.array(y_test)

# Convert the NumPy array back into a Python list
y_test = list(y_test)

# Create a test_data list that is a reference to x_test
test_data = x_test

# Initialize an empty list called final_data_test to store modified text data
final_data_test = []

# Import the regular expression (re) module
import re

# Iterate through each data element in test_data
for data in test_data:
    # Remove all digits (0-9) from the data using regular expressions
    new = re.sub('[0-9]', '', str(data))
    # Append the modified data to the final_data_test list
    final_data_test.append(new)

# Print the first element of the final_data_test list
print(final_data_test[0])

# Import the text and sequence modules from TensorFlow's Keras preprocessing library
from tensorflow.keras.preprocessing import text, sequence

# Initialize a Tokenizer
tokenize = text.Tokenizer()

# Fit the tokenizer on the text data in final_data_test
tokenize.fit_on_texts(texts=final_data_test)

# Convert the text data into sequences of integers based on the tokenizer's vocabulary
index_data_test = tokenize.texts_to_sequences(final_data_test)

# Get the word index from the tokenizer
word_index_test = tokenize.word_index

# Print the number of unique words in the vocabulary
print(len(word_index_test))

# Pad the sequences to a maximum length of 50
pad_sequences_test = sequence.pad_sequences(index_data_test, maxlen=50)

# Create a PyTorch dataset from the padded sequences and labels
test_dataset = TensorDataset(torch.LongTensor(pad_sequences_test), torch.LongTensor(y_test))

# Create a data loader for the test dataset
test_loader = DataLoader(test_dataset, batch_size=128, sampler=RandomSampler(test_dataset))

# Iterate through the test loader and print the first batch of data
for test in test_loader:
    print(test)
    break


huh yeah garbage flying spaghetti monster thing cleveland plain dealer page hypocritical blowhard
66388
[tensor([[    0,     0,     0,  ...,   259, 29692, 12390],
        [    0,     0,     0,  ...,     1,   310,   334],
        [    0,     0,     0,  ...,   612,  2881,   395],
        ...,
        [    0,     0,     0,  ..., 13994,     9,   875],
        [15084, 14226,   160,  ...,   857,  3178,     4],
        [    0,     0,     0,  ...,    36,   262,    69]]), tensor([0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0])]


In [24]:
# set scheduler for learning rate, for that calculate total steps
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import RandomSampler
epochs = 40 # researchers suggest to take epochs should be in range of 4-7 for fine tuning pretrained model as we have concern 
# just for last layer which is untrained classification layer.
total_steps = len(batch_data) * epochs
# scheduler take care of linear schedule of learning rate 
scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=0,num_training_steps=total_steps)

i trained model over 200 epoch but it's so slow in training 
because i use cpu not gpu so i tried in 12 epoch and notice the accuracy
and it's about 41.48%

In [None]:
# Training
epochs = 200
final_loss = []       # List to store the training loss for each epoch
output = []           # List to store the model's outputs during testing
testing_accuracy = [] # List to store the testing accuracy for each epoch

# Loop over a specified number of training epochs
for epoc in range(epochs):
    print('epoch-', epoc)
    total_loss = 0
    classifier.train()  # Set the model to training mode

    # Loop over batches in the training data
    for step, batch in enumerate(batch_data):
        classifier.zero_grad()  # Zero out gradients
        loss, outputs = classifier.forward(x=(batch[0].to(device)), y=(batch[1].to(device)))  # Forward pass
        
        loss.backward()  # Backpropagate gradients
        total_loss += loss.item()
        torch.nn.utils.clip_grad_norm_(classifier.parameters(), 1.0)  # Clip gradients to prevent exploding gradients
        optimizer.step()  # Update model parameters
        scheduler.step()  # Adjust learning rate using scheduler

    avg_loss = total_loss / len(batch_data)  # Calculate average loss for this epoch
    print(avg_loss)
    final_loss.append(avg_loss)  # Store the training loss for this epoch

    # Validation
    classifier.eval()  # Set the model to evaluation mode
    test_accuracy = 0

    # Loop over batches in the test data
    for step, batch_t in enumerate(test_loader):
        with torch.no_grad():
            outputs = classifier.forward((batch_t[0].to(device)), y=(batch_t[1].to(device)))  # Forward pass
            predictions = outputs[1]
            # Calculate F1-score as a measure of testing accuracy
            test_accuracy += f1_score(y_pred=np.argmax(predictions.cpu().detach().numpy(), axis=1),
                                      y_true=batch_t[1].cpu().detach().numpy())

            output.append(predictions)  # Store the model's outputs

    avg_accuracy = test_accuracy / len(test_loader)  # Calculate average testing accuracy for this epoch
    testing_accuracy.append(avg_accuracy)  # Store the testing accuracy for this epoch


epoch- 0
0.4164026794966333
epoch- 1
0.4148671244213242
epoch- 2
0.4148362150830114
epoch- 3
0.4148278334038052
epoch- 4
0.4148245065329309
epoch- 5
0.41482293157873745
epoch- 6
0.41482211948038344
epoch- 7
0.41482169311008377
epoch- 8
0.4148214282038694
epoch- 9
0.41482126733702507
epoch- 10
0.4148212471801437
epoch- 11
0.4148211875754989
epoch- 12


please give me feedback about my faults and how to optimise my code 