In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torch.nn.functional as F
from transformers import AdamW
import pandas as pd
from transformers import BertTokenizer, BertModel, BertConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import csv
import re
import validators
import emoji
import unidecode
import nltk
import pickle
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Set the seed for reproducibility
SEED = 1235
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# BERT Hyperparameters (ADDITION)
n_bert_layers = 16  # Assuming the base model has 12 layers
bert_lr = 0.001
pooling_strategy = 'cls'  # Options: 'cls', 'mean', 'max'
bert_hidden_size = 768  # Adjust based on your BERT model
max_seq_length = 128
fine_tune_strategy = 'full'  # Options: 'full', 'last_layer'
bert_dropout = 0.9  # Adjust based on BERT model specifications

max_seq_length = 128  # This should match the max_seq_length used in BERT model
padding_strategy = 'max_length'  # Options: 'max_length', 'do_not_pad', 'longest'
truncation_strategy = 'longest_first'  # Options: 'longest_first', 'only_first', 'only_second'
do_lower_case = True  # Set to False if using a cased model

config = BertConfig(
    num_hidden_layers=n_bert_layers,
    hidden_size=bert_hidden_size,
    num_attention_heads=24,  # Assuming 12 attention heads
    intermediate_size=4 * bert_hidden_size,  # Default value in BERT
    hidden_dropout_prob=bert_dropout,
    attention_probs_dropout_prob=bert_dropout,
)

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          max_length=max_seq_length,
                                          padding=padding_strategy,
                                          truncation=truncation_strategy,
                                          do_lower_case=do_lower_case)
# Load the BERT model with the custom configuration
bert_model = BertModel(config=config)


In [3]:
data_path = 'C:/Users/Mai/thesis/ch 4/dataset.csv'
data_df = pd.read_csv(data_path)
data_df = data_df.rename(columns={'Tweet Content': 'text', 'Label': 'label'})

In [4]:
data_df.head()

Unnamed: 0,text,Sentiment,label
0,Worst Bong ever. https://t.co/QA7R8VYppC,Negative,Hate
1,what i dont like about leni robredo's platform...,Negative,Hate
2,Ito ang tunay na survey ni VP Leni Robredo #1 ...,Negative,Hate
3,(3) BBM sued for Pork Barrel Scam\n\nhttps://t...,Negative,Hate
4,Sabog din sumagot tong si Norberto Gonzales no...,Negative,Hate


In [5]:
groupedby_sentiment = data_df.groupby(data_df.Sentiment)
data_df_positive = groupedby_sentiment.get_group("Positive")
data_df_positive

Unnamed: 0,text,Sentiment,label
2560,I took The Blind Test and my top candidates ar...,Positive,Non-hate
2561,"""True leader show up and man up."" - VP Leni Ro...",Positive,Non-hate
2562,Leni Robredo for president cutie 🤞🌸,Positive,Non-hate
2563,"Ako si Christian Tan, kabataan at kaisa ni Bon...",Positive,Non-hate
2564,Ate @xlykable Let’s support VP Leni and Sen. K...,Positive,Non-hate
...,...,...,...
3835,Just because Aiai did not supported Leni Robre...,Positive,Non-hate
3836,"“Mga kababayan, summon the warrior in you and ...",Positive,Non-hate
3837,@thekiarasworld Now I know that not all of the...,Positive,Non-hate
3838,Ping Lacson Ang may Plano sa bansa\n\n#KayPing...,Positive,Non-hate


In [6]:
data_df_negative = groupedby_sentiment.get_group("Negative")
data_df_negative

Unnamed: 0,text,Sentiment,label
0,Worst Bong ever. https://t.co/QA7R8VYppC,Negative,Hate
1,what i dont like about leni robredo's platform...,Negative,Hate
2,Ito ang tunay na survey ni VP Leni Robredo #1 ...,Negative,Hate
3,(3) BBM sued for Pork Barrel Scam\n\nhttps://t...,Negative,Hate
4,Sabog din sumagot tong si Norberto Gonzales no...,Negative,Hate
...,...,...,...
2555,Headline: The ambitious presidential candidate...,Negative,Hate
2556,"Norberto Gonzales is right, its a missed oppor...",Negative,Hate
2557,"The audacity to call Leni Robredo ""bobo"", "" ta...",Negative,Hate
2558,Bongbong Marcos is a Nazi. https://t.co/gY3xHb...,Negative,Hate


In [7]:
data_df_neutral = groupedby_sentiment.get_group("Neutral")
data_df_neutral

Unnamed: 0,text,Sentiment,label
3840,bongbong marcos dot com,Neutral,Non-hate
3841,Grabe pala talaga yung actions ni Leni Robredo...,Neutral,Non-hate
3842,"“Ngayong darating na halalan, ang tatanglaw sa...",Neutral,Non-hate
3843,For this COMELEC debate:\n\nValedictorian: Len...,Neutral,Non-hate
3844,Focus on the ball kakampinks\n\nPresident Leni...,Neutral,Non-hate
...,...,...,...
5115,President Leni Robredo and Vice President Kiko...,Neutral,Non-hate
5116,@jillrobredo 🌺🌺🌺\nthank you din kay @maraceped...,Neutral,Non-hate
5117,LOOK: Presidential candidate Bongbong Marcos m...,Neutral,Non-hate
5118,@itsmaxandcheese Leni Robredo for President 2022,Neutral,Non-hate


In [8]:
#binary hate non-hate
data_df_hate = data_df_negative.sample(n = 2560, replace=True)

data_df_positive = data_df_positive.sample(n = 1280, replace=True)
data_df_neutral = data_df_neutral.sample(n = 1280, replace=True)

data_df_nonhate = data_df_positive.append(data_df_neutral)

data_df = data_df_hate.append(data_df_nonhate)

  data_df_nonhate = data_df_positive.append(data_df_neutral)
  data_df = data_df_hate.append(data_df_nonhate)


In [9]:
data_df = data_df.drop(['Sentiment'], axis = 1)

In [10]:
#data_df.to_csv('dataset.csv', index=False)
data_df

Unnamed: 0,text,label
2520,@mmagnifyinglens @KlharkCao04 @iamguidodavid T...,Hate
2522,Why did the acting presidential spokesman Mart...,Hate
1491,Jusko Isko Moreno Domagoso what the fuck are y...,Hate
399,Isko Moreno calls Robredo 'fake leader with fa...,Hate
83,daughters of LENI ROBREDO SUPPORTS BBM/SARAH a...,Hate
...,...,...
4234,"Meanwhile, during a local caucus/ Unitea(m) ev...",Non-hate
4487,"Bongbong Marcos on corruption, publicizing SAL...",Non-hate
4394,Bongbong Marcos and Leni Robredo represent two...,Non-hate
4084,Ernie Abella is way better than Manny and Leni...,Non-hate


In [11]:
data_df = data_df.reset_index()

In [12]:
data_df

Unnamed: 0,index,text,label
0,2520,@mmagnifyinglens @KlharkCao04 @iamguidodavid T...,Hate
1,2522,Why did the acting presidential spokesman Mart...,Hate
2,1491,Jusko Isko Moreno Domagoso what the fuck are y...,Hate
3,399,Isko Moreno calls Robredo 'fake leader with fa...,Hate
4,83,daughters of LENI ROBREDO SUPPORTS BBM/SARAH a...,Hate
...,...,...,...
5115,4234,"Meanwhile, during a local caucus/ Unitea(m) ev...",Non-hate
5116,4487,"Bongbong Marcos on corruption, publicizing SAL...",Non-hate
5117,4394,Bongbong Marcos and Leni Robredo represent two...,Non-hate
5118,4084,Ernie Abella is way better than Manny and Leni...,Non-hate


In [13]:
# data_df = "dataset.csv"

In [14]:
# Preprocessing function
def preprocess_text(text):
    tokens = tokenizer.tokenize(text)
    tokens = tokens[:tokenizer.model_max_length - 2]  # Account for [CLS] and [SEP] tokens
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
    return indexed_tokens

filipino_stopwords = set(
    """
akin
aking
ako
alin
am
amin
aming
ang
ano
anumang
apat
at
atin
ating
ay
bababa
bago
bakit
bawat
bilang
dahil
dalawa
dapat
din
dito
doon
gagawin
gayunman
ginagawa
ginawa
ginawang
gumawa
gusto
habang
hanggang
hindi
huwag
iba
ibaba
ibabaw
ibig
ikaw
ilagay
ilalim
ilan
inyong
isa
isang
itaas
ito
iyo
iyon
iyong
ka
kahit
kailangan
kailanman
kami
kanila
kanilang
kanino
kanya
kanyang
kapag
kapwa
karamihan
katiyakan
katulad
kaya
kaysa
ko
kong
kulang
kumuha
kung
laban
lahat
lamang
likod
lima
maaari
maaaring
maging
mahusay
makita
marami
marapat
masyado
may
mayroon
mga
minsan
mismo
mula
muli
na
nabanggit
naging
nagkaroon
nais
nakita
namin
napaka
narito
nasaan
ng
ngayon
ni
nila
nilang
nito
niya
niyang
noon
o
pa
paano
pababa
paggawa
pagitan
pagkakaroon
pagkatapos
palabas
pamamagitan
panahon
pangalawa
para
paraan
pareho
pataas
pero
pumunta
pumupunta
sa
saan
sabi
sabihin
sarili
sila
sino
siya
tatlo
tayo
tulad
tungkol
una
walang
""".split()
)

# Date De-Identification
def remove_mentions(text):
    mention_pattern = re.compile(r'@\w+')
    
    # Use re.sub to remove mentions
    cleaned_text = mention_pattern.sub('', text)
    
    # Remove extra spaces and strip leading/trailing spaces
    cleaned_text = ' '.join(cleaned_text.split())

    return cleaned_text

# URL Removal
def remove_url(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    
    # Use re.sub to remove URLs
    cleaned_text = url_pattern.sub('', text)
    
    # Remove extra spaces and strip leading/trailing spaces
    cleaned_text = ' '.join(cleaned_text.split())

    return cleaned_text

# Special Characters Removal
def remove_special_characters(text):
    text = emoji.replace_emoji(text, replace="[emoji]")
    
    # Split the text into words
    words = text.split(" ")
    
    # Initialize an empty string to store the cleaned text
    cleaned_text = ""
    
    # Iterate through each word
    for word in words:
        # Check if the word contains only special characters or "[emoji]"
        if not (re.match(r"^[_\W]+$", word) or "[emoji]" in word):
            if len(cleaned_text) == 0:
                cleaned_text = f"{word}"
            else:
                cleaned_text = f"{cleaned_text} {word}"
                
    # Remove diacritics
    text_no_diacritics = unidecode.unidecode(cleaned_text)

    # Split the text into words
    sentence = text_no_diacritics.split(" ")
    output = ""

    # Remove special characters and numerics
    for part in sentence:
        part = re.sub("[^A-Za-z ]+$", "", part)
        part = re.sub("^[^A-Za-z #]+", "", part)
        if not (len(part) <= 1 or re.match(r"[^a-zA-Z#]", part)):
            if len(output) == 0:
                output = f"{part}"
            else:
                output = f"{output} {part}"

    # Remove extra spaces and strip leading/trailing spaces
    cleaned_text = ' '.join(output.split())

    return cleaned_text

# Remove English Stop Words
from nltk.corpus import stopwords
english_stopwords = stopwords.words('english')

# Remove English Stop Words
def remove_english_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    
    # Remove extra spaces and strip leading/trailing spaces
    cleaned_text = ' '.join(filtered_words)
    
    return cleaned_text

# Remove Filipino Stop Words
def remove_filipino_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in filipino_stopwords]
    
    # Remove extra spaces and strip leading/trailing spaces
    cleaned_text = ' '.join(filtered_words)
  
    return cleaned_text

# Candidate Name Removal
def remove_candidate_names(text):
    candidatelist = "leni robredo bongbong marcos isko moreno domagoso manny pacman pacquiao ping lacson ernie abella leody de guzman norberto gonzales jose montemayor jr faisal mangondato"
    candidatelist = candidatelist.split()
    candidate_pattern = re.compile(r'\b(?:' + '|'.join(map(re.escape, candidatelist)) + r')\b', re.IGNORECASE)
    
    # Use re.sub to remove candidate names
    cleaned_text = candidate_pattern.sub('', text)
    
    # Remove extra spaces and strip leading/trailing spaces
    cleaned_text = ' '.join(cleaned_text.split())
        
    return cleaned_text

# Hashtag Removal
def remove_hashtags(text):
    # Split the text into words
    words = text.split()
    
    # Initialize an empty list to store cleaned words
    cleaned_words = []
    
    for word in words:
        # Check if the word is a hashtag (starts with #)
        if not word.startswith('#'):
            cleaned_words.append(word)
    
    # Join the cleaned words into a single string
    cleaned_text = ' '.join(cleaned_words)
    
    return cleaned_text

In [15]:
#PreProcessing

In [16]:
# Data De-Identification
data_df['text'] = data_df['text'].apply(remove_mentions)

for i in range(10):
    text = data_df["text"][i]
    label = data_df["label"][i]

    print('Text: ', text, "\n\nLabel: ", label, "\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")

Text:  This is such BS to attribute this to Marcos when a lot of claims that Bobo Marcos didn't actually work. He was called the absentee governor because he barely showed up. https://t.co/02v8EHUBH1 https://t.co/4VbpH8DjTZ 

Label:  Hate 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Text:  Why did the acting presidential spokesman Martin Andanar interview Bongbong Marcos, a presidential aspirant, using people's tax money and resources? This is not the job of the office of the president. So blatant. https://t.co/oqPgxvEHpT 

Label:  Hate 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Text:  Jusko Isko Moreno Domagoso what the fuck are you saying? 

Label:  Hate 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Text:  Isko Moreno calls Robredo 'fake leader with fake color' for running as independent bet in #Halalan2022 presidential race https://t.co/TVZ5velRy0 

Label:  Hate 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Text:  daughters of LENI ROBREDO SUPPORTS BB

In [17]:
# URL Removal
data_df['text'] = data_df['text'].apply(remove_url)

for i in range(10):
    text = data_df["text"][i]
    label = data_df["label"][i]

    print('Text: ', text, "\n\nLabel: ", label, "\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")

Text:  This is such BS to attribute this to Marcos when a lot of claims that Bobo Marcos didn't actually work. He was called the absentee governor because he barely showed up. 

Label:  Hate 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Text:  Why did the acting presidential spokesman Martin Andanar interview Bongbong Marcos, a presidential aspirant, using people's tax money and resources? This is not the job of the office of the president. So blatant. 

Label:  Hate 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Text:  Jusko Isko Moreno Domagoso what the fuck are you saying? 

Label:  Hate 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Text:  Isko Moreno calls Robredo 'fake leader with fake color' for running as independent bet in #Halalan2022 presidential race 

Label:  Hate 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Text:  daughters of LENI ROBREDO SUPPORTS BBM/SARAH after realizing their MOM is really STUPID! 

Label:  Hate 

~~~~~~~~~~~~~~~~~~~~~~~~~~~

In [18]:
# Special Characters Removal
data_df['text'] = data_df['text'].apply(remove_special_characters)

for i in range(10):
    text = data_df["text"][i]
    label = data_df["label"][i]

    print('Text: ', text, "\n\nLabel: ", label, "\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")

Text:  This is such BS to attribute this to Marcos when lot of claims that Bobo Marcos didn't actually work He was called the absentee governor because he barely showed up 

Label:  Hate 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Text:  Why did the acting presidential spokesman Martin Andanar interview Bongbong Marcos presidential aspirant using people's tax money and resources This is not the job of the office of the president So blatant 

Label:  Hate 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Text:  Jusko Isko Moreno Domagoso what the fuck are you saying 

Label:  Hate 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Text:  Isko Moreno calls Robredo fake leader with fake color for running as independent bet in #Halalan presidential race 

Label:  Hate 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Text:  daughters of LENI ROBREDO SUPPORTS BBM/SARAH after realizing their MOM is really STUPID 

Label:  Hate 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In [19]:
# Lowercase
data_df['text'] = data_df['text'].str.lower()

for i in range(10):
    text = data_df["text"][i]
    label = data_df["label"][i]

    print('Text: ', text, "\n\nLabel: ", label, "\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")

Text:  this is such bs to attribute this to marcos when lot of claims that bobo marcos didn't actually work he was called the absentee governor because he barely showed up 

Label:  Hate 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Text:  why did the acting presidential spokesman martin andanar interview bongbong marcos presidential aspirant using people's tax money and resources this is not the job of the office of the president so blatant 

Label:  Hate 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Text:  jusko isko moreno domagoso what the fuck are you saying 

Label:  Hate 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Text:  isko moreno calls robredo fake leader with fake color for running as independent bet in #halalan presidential race 

Label:  Hate 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Text:  daughters of leni robredo supports bbm/sarah after realizing their mom is really stupid 

Label:  Hate 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In [20]:
# Remove English Stop Words
# data_df['text'] = data_df['text'].apply(remove_english_stopwords)

# for i in range(10):
#    text = data_df["text"][i]
#    label = data_df["label"][i]

#    print('Text: ', text, "\n\nLabel: ", label, "\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")

In [21]:
# Remove Filipino Stop Words
# data_df['text'] = data_df['text'].apply(remove_filipino_stopwords)

# for i in range(10):
#    text = data_df["text"][i]
#    label = data_df["label"][i]

#    print('Text: ', text, "\n\nLabel: ", label, "\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")

In [22]:
# Remove Candidate Names
# data_df['text'] = data_df['text'].apply(remove_candidate_names)

# for i in range(10):
    # text = data_df["text"][i]
    # label = data_df["label"][i]

    # print('Text: ', text, "\n\nLabel: ", label, "\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")

In [23]:
# Remove Hashtags
data_df['text'] = data_df['text'].apply(remove_hashtags)

for i in range(10):
    text = data_df["text"][i]
    label = data_df["label"][i]

    print('Text: ', text, "\n\nLabel: ", label, "\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")

Text:  this is such bs to attribute this to marcos when lot of claims that bobo marcos didn't actually work he was called the absentee governor because he barely showed up 

Label:  Hate 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Text:  why did the acting presidential spokesman martin andanar interview bongbong marcos presidential aspirant using people's tax money and resources this is not the job of the office of the president so blatant 

Label:  Hate 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Text:  jusko isko moreno domagoso what the fuck are you saying 

Label:  Hate 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Text:  isko moreno calls robredo fake leader with fake color for running as independent bet in presidential race 

Label:  Hate 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Text:  daughters of leni robredo supports bbm/sarah after realizing their mom is really stupid 

Label:  Hate 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Text

In [24]:
data_df.to_csv('binary10.csv', index=False)

In [25]:
data_df

Unnamed: 0,index,text,label
0,2520,this is such bs to attribute this to marcos wh...,Hate
1,2522,why did the acting presidential spokesman mart...,Hate
2,1491,jusko isko moreno domagoso what the fuck are y...,Hate
3,399,isko moreno calls robredo fake leader with fak...,Hate
4,83,daughters of leni robredo supports bbm/sarah a...,Hate
...,...,...,...
5115,4234,meanwhile during local caucus unitea(m event s...,Non-hate
5116,4487,bongbong marcos on corruption publicizing saln...,Non-hate
5117,4394,bongbong marcos and leni robredo represent two...,Non-hate
5118,4084,ernie abella is way better than manny and leni,Non-hate


In [26]:
data_df['text'] = data_df['text'].apply(preprocess_text)

In [27]:
data_df

Unnamed: 0,index,text,label
0,2520,"[2023, 2003, 2107, 18667, 2000, 17961, 2023, 2...",Hate
1,2522,"[2339, 2106, 1996, 3772, 4883, 14056, 3235, 19...",Hate
2,1491,"[18414, 21590, 2003, 3683, 17921, 14383, 23692...",Hate
3,399,"[2003, 3683, 17921, 4455, 6487, 23417, 8275, 3...",Hate
4,83,"[5727, 1997, 18798, 2072, 6487, 23417, 6753, 2...",Hate
...,...,...,...
5115,4234,"[5564, 2076, 2334, 13965, 15908, 2050, 1006, 1...",Non-hate
5116,4487,"[14753, 18259, 5063, 14810, 2006, 7897, 2270, ...",Non-hate
5117,4394,"[14753, 18259, 5063, 14810, 1998, 18798, 2072,...",Non-hate
5118,4084,"[14637, 16768, 2721, 2003, 2126, 2488, 2084, 1...",Non-hate


In [28]:
train_df, test_df = train_test_split(data_df, test_size=0.1, random_state=SEED)

In [29]:
# Define the CNN model
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding_dropout = nn.Dropout(dropout) 
        self.conv1 = nn.Conv1d(in_channels=embedding_dim, out_channels=256, kernel_size=2, padding='same')
        self.pool1 = nn.MaxPool1d(2, 2)
        self.conv2 = nn.Conv1d(in_channels=256, out_channels=64, kernel_size=4, padding='same')
        self.global_pooling = nn.AdaptiveMaxPool1d(1)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(64, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        embedded = self.embedding_dropout(embedded)
        x = embedded.permute(0, 2, 1)  # Change the dimensions for convolution
        x = F.relu(self.conv1(x))
        x = self.pool1(x)
        x = F.relu(self.conv2(x))
        x = self.global_pooling(x).squeeze(2)
        x = self.dropout(x)
        x = self.fc(x)
        return x

In [30]:
# Set up iterators
BATCH_SIZE = 64

In [31]:
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, dataframe, max_seq_length):
        self.data = dataframe
        self.max_seq_length = max_seq_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx]['text']
        label = self.data.iloc[idx]['label']

        # Padding and conversion to tensor
        padded_text = torch.tensor(text[:self.max_seq_length] + [0] * (self.max_seq_length - len(text)))
        return padded_text, label

In [32]:
train_dataset = TextDataset(train_df, 1000)
test_dataset = TextDataset(test_df, 1000)

In [33]:
train_iterator = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_iterator = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [34]:
# Define model hyperparameters
VOCAB_SIZE = tokenizer.vocab_size
EMBEDDING_DIM = 768
N_FILTERS = 100
FILTER_SIZES = [3, 4, 5]
OUTPUT_DIM = 1
DROPOUT = 0.5

# CNN Hyperparameters
hidden_dim = 100
n_conv_layers = 1
kernel_sizes = [2, 3, 4]
activation = nn.ReLU()

In [35]:
# Initialize CNN model
model = CNN(VOCAB_SIZE, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)
#Initialize CNN model
# model = CNN(VOCAB_SIZE, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, hidden_dim, n_conv_layers, kernel_sizes, activation)

In [36]:
# Initialize BERT model (for embedding extraction)
bert_model.eval()  # Set to evaluation mode

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.9, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-15): 16 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.9, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.9, inplace=False)
  

In [37]:
with torch.no_grad():
    for i, token in enumerate(tokenizer.get_vocab()):
        token_id = tokenizer.convert_tokens_to_ids(token)
        token_embedding = bert_model.embeddings.word_embeddings.weight[token_id]
        model.embedding.weight[i].data.copy_(token_embedding)

bert_parameters = []
for layer in bert_model.encoder.layer:
    bert_parameters.extend(layer.parameters())

# Create AdamW optimizer with custom hyperparameters for BERT embeddings
bert_learning_rate = 2e-4  # Adjust as needed
bert_optimizer = optim.AdamW(bert_parameters, lr=bert_learning_rate)

In [38]:
data_df

Unnamed: 0,index,text,label
0,2520,"[2023, 2003, 2107, 18667, 2000, 17961, 2023, 2...",Hate
1,2522,"[2339, 2106, 1996, 3772, 4883, 14056, 3235, 19...",Hate
2,1491,"[18414, 21590, 2003, 3683, 17921, 14383, 23692...",Hate
3,399,"[2003, 3683, 17921, 4455, 6487, 23417, 8275, 3...",Hate
4,83,"[5727, 1997, 18798, 2072, 6487, 23417, 6753, 2...",Hate
...,...,...,...
5115,4234,"[5564, 2076, 2334, 13965, 15908, 2050, 1006, 1...",Non-hate
5116,4487,"[14753, 18259, 5063, 14810, 2006, 7897, 2270, ...",Non-hate
5117,4394,"[14753, 18259, 5063, 14810, 1998, 18798, 2072,...",Non-hate
5118,4084,"[14637, 16768, 2721, 2003, 2126, 2488, 2084, 1...",Non-hate


In [39]:
#optimizer = optim.Adam(model.parameters())
# Your custom hyperparameters
learning_rate = 0.001
beta_1 = 0.9
beta_2 = 0.999
epsilon = 1e-08
weight_decay = 0.0

# Create Adam optimizer with custom hyperparameters
optimizer = optim.Adam(model.parameters(), lr=learning_rate, betas=(beta_1, beta_2), eps=epsilon, weight_decay=weight_decay)
criterion = nn.BCEWithLogitsLoss()

In [40]:
# Train function
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def train(model, iterator):
    model.train()
    epoch_loss = 0
    for text_batch, label_batch in iterator:
        # Extract text sequences from the text_batch tensor
        texts = text_batch
        
        # Extract and process labels
        labels = [1 if label == 'Hate' else 0 for label in label_batch]  # Example conversion
        
        texts = texts.to(device)  # Move to device if needed
        labels = torch.tensor(labels, dtype=torch.float32).to(device)  # Convert to tensor
        
        optimizer.zero_grad()
        predictions = model(texts).squeeze(1)
        
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)


# Evaluate function
def evaluate(model, iterator):
    model.eval()
    epoch_loss = 0
    predicted_labels = []
    true_labels = []  # Declare the true_labels list
    
    with torch.no_grad():
        for text_batch, label_batch in iterator:
            texts = text_batch  # Extract text sequences
            labels = [1 if label == 'Hate' else 0 for label in label_batch]  # Example conversion
            
            texts = texts.to(device)  # Move to device
            labels = torch.tensor(labels, dtype=torch.float32).to(device)  # Convert to tensor
            
            predictions = model(texts).squeeze(1)
            loss = criterion(predictions, labels)
            
            epoch_loss += loss.item()
            predicted_labels.extend(torch.round(torch.sigmoid(predictions)).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    
    # Calculate accuracy, f1, precision, recall
    accuracy = accuracy_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels)
    recall = recall_score(true_labels, predicted_labels)
    
    return epoch_loss / len(iterator), accuracy, f1, precision, recall

In [41]:
N_EPOCHS = 10
for epoch in range(N_EPOCHS):
    train_loss = train(model, train_iterator)
    test_loss, accuracy, f1, precision, recall = evaluate(model, test_iterator)

    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tTest Loss: {test_loss:.3f}')
    print(f'\tAccuracy: {accuracy:.4f} | F1-Score: {f1:.4f}')
    print(f'\tPrecision: {precision:.4f} | Recall: {recall:.4f}')

  return F.conv1d(input, weight, bias, self.stride,


Epoch: 01
	Train Loss: 0.557
	Test Loss: 0.328
	Accuracy: 0.8691 | F1-Score: 0.8694
	Precision: 0.8544 | Recall: 0.8849
Epoch: 02
	Train Loss: 0.292
	Test Loss: 0.267
	Accuracy: 0.8926 | F1-Score: 0.8924
	Precision: 0.8803 | Recall: 0.9048
Epoch: 03
	Train Loss: 0.156
	Test Loss: 0.244
	Accuracy: 0.9160 | F1-Score: 0.9152
	Precision: 0.9098 | Recall: 0.9206
Epoch: 04
	Train Loss: 0.096
	Test Loss: 0.282
	Accuracy: 0.9199 | F1-Score: 0.9201
	Precision: 0.9042 | Recall: 0.9365
Epoch: 05
	Train Loss: 0.054
	Test Loss: 0.319
	Accuracy: 0.9277 | F1-Score: 0.9270
	Precision: 0.9216 | Recall: 0.9325
Epoch: 06
	Train Loss: 0.033
	Test Loss: 0.401
	Accuracy: 0.9180 | F1-Score: 0.9180
	Precision: 0.9038 | Recall: 0.9325
Epoch: 07
	Train Loss: 0.031
	Test Loss: 0.357
	Accuracy: 0.9258 | F1-Score: 0.9249
	Precision: 0.9213 | Recall: 0.9286
Epoch: 08
	Train Loss: 0.027
	Test Loss: 0.357
	Accuracy: 0.9258 | F1-Score: 0.9252
	Precision: 0.9180 | Recall: 0.9325
Epoch: 09
	Train Loss: 0.019
	Test Loss: