In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torch.nn.functional as F
from transformers import AdamW
import pandas as pd
from transformers import BertTokenizer, BertModel, BertConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import csv
import re
import validators
import emoji
import unidecode
import nltk
import pickle
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Mark Gabriel
[nltk_data]     Ortiz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Set the seed for reproducibility
SEED = 1235
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# BERT Hyperparameters (ADDITION)
n_bert_layers = 16  # Assuming the base model has 12 layers
bert_lr = 0.001
pooling_strategy = 'cls'  # Options: 'cls', 'mean', 'max'
bert_hidden_size = 768  # Adjust based on your BERT model
max_seq_length = 128
fine_tune_strategy = 'last_layer'  # Options: 'full', 'last_layer'
bert_dropout = 0.9  # Adjust based on BERT model specifications

max_seq_length = 128  # This should match the max_seq_length used in BERT model
padding_strategy = 'max_length'  # Options: 'max_length', 'do_not_pad', 'longest'
truncation_strategy = 'longest_first'  # Options: 'longest_first', 'only_first', 'only_second'
do_lower_case = True  # Set to False if using a cased model

config = BertConfig(
    num_hidden_layers=n_bert_layers,
    hidden_size=bert_hidden_size,
    num_attention_heads=12,  # Assuming 12 attention heads
    intermediate_size=4 * bert_hidden_size,  # Default value in BERT
    hidden_dropout_prob=bert_dropout,
    attention_probs_dropout_prob=bert_dropout,
)

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', 
                                          max_length=max_seq_length,
                                          padding=padding_strategy,
                                          truncation=truncation_strategy,
                                          do_lower_case=do_lower_case)
# Load the BERT model with the custom configuration
bert_model = BertModel(config=config)


In [3]:
data_path = 'dataset.csv'
data_df = pd.read_csv(data_path)
data_df = data_df.rename(columns={'Tweet Content': 'text', 'Label': 'label'})

In [4]:
data_df.head()

Unnamed: 0,text,Sentiment,label
0,KULANG ATA SA TULOG SI NORBERTO GONZALES HAHAH...,Negative,Hate
1,Hirosii's argument screams misogyny not valid ...,Negative,Hate
2,"Who is the better vote? Bongbong #marcos, who ...",Negative,Hate
3,@skzoowifey hindi siya ang tatay niya bc from ...,Negative,Hate
4,"@Pontifex @LaityFamilyLife Bongbong Marcos,Sar...",Negative,Hate


In [5]:
groupedby_sentiment = data_df.groupby(data_df.Sentiment)
data_df_positive = groupedby_sentiment.get_group("Positive")
data_df_positive

Unnamed: 0,text,Sentiment,label
2400,"It's just so fitting for Leni Robredo, as an h...",Positive,Non-hate
2401,The Solution.\n\n* Leni Robredo - Philippines ...,Positive,Non-hate
2402,@kikopangilinan @donny @bellemariano02 So prou...,Positive,Non-hate
2403,"Ako si Bernard isang guro,nakiki-isa sa pagsup...",Positive,Non-hate
2404,"@DonKissPlatum Para sa Bayan, Para sa Pagbabag...",Positive,Non-hate
...,...,...,...
3595,@GelSantosRelos @lenirobredo @cnnphilippines i...,Positive,Non-hate
3596,Wala akong duda na sa ating sama-samang pagkil...,Positive,Non-hate
3597,The most qualified president\nPing Lacson lang...,Positive,Non-hate
3598,"""Sa Gobyernong Tapat, Angat Buhay Lahat! At si...",Positive,Non-hate


In [6]:
data_df_negative = groupedby_sentiment.get_group("Negative")
data_df_negative

Unnamed: 0,text,Sentiment,label
0,KULANG ATA SA TULOG SI NORBERTO GONZALES HAHAH...,Negative,Hate
1,Hirosii's argument screams misogyny not valid ...,Negative,Hate
2,"Who is the better vote? Bongbong #marcos, who ...",Negative,Hate
3,@skzoowifey hindi siya ang tatay niya bc from ...,Negative,Hate
4,"@Pontifex @LaityFamilyLife Bongbong Marcos,Sar...",Negative,Hate
...,...,...,...
2395,Absent for today’s videyow:\n1. Bongbong Marco...,Negative,Hate
2396,@DisguisedPost @_Nathalieperona Mawalang galan...,Negative,Hate
2397,"Get ready for 'BongBong' Marcos!\n\nHa, what a...",Negative,Hate
2398,Sabi ng iba bobo daw si Leni. Bobo pa siya sa ...,Negative,Hate


In [7]:
data_df_neutral = groupedby_sentiment.get_group("Neutral")
data_df_neutral

Unnamed: 0,text,Sentiment,label
3600,"In front of a huge crowd, Cebu Governor Gwen G...",Neutral,Non-hate
3601,ANSABE NG 6 MILLION VIEWS SA KATATAPOS LANG NA...,Neutral,Non-hate
3602,Mga di nagdidiet for Leni Robredo CHAROTTT,Neutral,Non-hate
3603,Madam President ackkkk keleg iz meh\nPresident...,Neutral,Non-hate
3604,👀 Bongbong yo-yo 🇵🇭 \n\n#Marcos 💸 https://t.co...,Neutral,Non-hate
...,...,...,...
4795,President Leni Robredo supporters 💚✅☘️🌲🌴🥑🌿📚🥦🥒🍏...,Neutral,Non-hate
4796,"gonna start muting vp leni pics for now, my dr...",Neutral,Non-hate
4797,"Sa Gobyernong Tapat, Angat Buhay Lahat! Ang Pr...",Neutral,Non-hate
4798,Ang presidente... Leni Robredo\nBise President...,Neutral,Non-hate


In [8]:
#binary hate non-hate
data_df_hate = data_df_negative.sample(n = 2560, replace=True)

data_df_positive = data_df_positive.sample(n = 1280, replace=True)
data_df_neutral = data_df_neutral.sample(n = 1280, replace=True)

data_df_nonhate = data_df_positive.append(data_df_neutral)

data_df = data_df_hate.append(data_df_nonhate)

  data_df_nonhate = data_df_positive.append(data_df_neutral)
  data_df = data_df_hate.append(data_df_nonhate)


In [9]:
data_df = data_df.drop(['Sentiment'], axis = 1)

In [10]:
#data_df.to_csv('dataset.csv', index=False)
data_df

Unnamed: 0,text,label
464,"Kung ako man ang anak ni Loren Legarda, ikahih...",Hate
1155,"@frfielpareja Fr. Fiel Pareja, Katoliko po ako...",Hate
1345,We were just talking about your leni robredo a...,Hate
324,"Ganyan dapat ang debate, brain challenging. Ka...",Hate
2038,Kawawa talaga mga #Kakampikon Popular Celebrit...,Hate
...,...,...
3808,“robredos” HAHSHSHA SI LENI ROBREDO NA NGA LAN...,Non-hate
4475,"Earlier in the election cycle, it's only Isko ...",Non-hate
3861,"""Sa Gobyernong Tapat, Angat Buhay Lahat!""\n\nA...",Non-hate
3985,The way #VoguePhilippines will have President ...,Non-hate


In [11]:
data_df = data_df.reset_index()

In [12]:
data_df

Unnamed: 0,index,text,label
0,464,"Kung ako man ang anak ni Loren Legarda, ikahih...",Hate
1,1155,"@frfielpareja Fr. Fiel Pareja, Katoliko po ako...",Hate
2,1345,We were just talking about your leni robredo a...,Hate
3,324,"Ganyan dapat ang debate, brain challenging. Ka...",Hate
4,2038,Kawawa talaga mga #Kakampikon Popular Celebrit...,Hate
...,...,...,...
5115,3808,“robredos” HAHSHSHA SI LENI ROBREDO NA NGA LAN...,Non-hate
5116,4475,"Earlier in the election cycle, it's only Isko ...",Non-hate
5117,3861,"""Sa Gobyernong Tapat, Angat Buhay Lahat!""\n\nA...",Non-hate
5118,3985,The way #VoguePhilippines will have President ...,Non-hate


In [13]:
# data_df = "dataset.csv"

In [14]:
# Preprocessing function
def preprocess_text(text):
    tokens = tokenizer.tokenize(text)
    tokens = tokens[:tokenizer.model_max_length - 2]  # Account for [CLS] and [SEP] tokens
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
    return indexed_tokens

filipino_stopwords = set(
    """
akin
aking
ako
alin
am
amin
aming
ang
ano
anumang
apat
at
atin
ating
ay
bababa
bago
bakit
bawat
bilang
dahil
dalawa
dapat
din
dito
doon
gagawin
gayunman
ginagawa
ginawa
ginawang
gumawa
gusto
habang
hanggang
hindi
huwag
iba
ibaba
ibabaw
ibig
ikaw
ilagay
ilalim
ilan
inyong
isa
isang
itaas
ito
iyo
iyon
iyong
ka
kahit
kailangan
kailanman
kami
kanila
kanilang
kanino
kanya
kanyang
kapag
kapwa
karamihan
katiyakan
katulad
kaya
kaysa
ko
kong
kulang
kumuha
kung
laban
lahat
lamang
likod
lima
maaari
maaaring
maging
mahusay
makita
marami
marapat
masyado
may
mayroon
mga
minsan
mismo
mula
muli
na
nabanggit
naging
nagkaroon
nais
nakita
namin
napaka
narito
nasaan
ng
ngayon
ni
nila
nilang
nito
niya
niyang
noon
o
pa
paano
pababa
paggawa
pagitan
pagkakaroon
pagkatapos
palabas
pamamagitan
panahon
pangalawa
para
paraan
pareho
pataas
pero
pumunta
pumupunta
sa
saan
sabi
sabihin
sarili
sila
sino
siya
tatlo
tayo
tulad
tungkol
una
walang
""".split()
)

# from nltk.corpus import stopwords
# english_stopwords = stopwords.words('english')

# search = "leni robredo bongbong marcos isko moreno domagoso manny pacman pacquiao ping lacson ernie abella leody de guzman norberto gonzales jose montemayor jr faisal mangondato"
# candidatelist = search.split(" ")

# URL Removal
def remove_url (text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    
    # Use re.sub to replace URLs with an empty string
    return url_pattern.sub('', text)

# Emoji Removal
def replace_emojis(text):
    return emoji.replace_emoji(text, "")

# Remove Diacritics
def remove_diacritics(text):
    return unidecode.unidecode(text)

# Remove English Stop Words
from nltk.corpus import stopwords
english_stopwords = stopwords.words('english')

def remove_english_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

# Remove Filipino Stop Words
def remove_filipino_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in filipino_stopwords]
    return ' '.join(filtered_words)

In [15]:
#PreProcessing

# URL Removal
data_df['text'] = data_df['text'].apply(remove_url)

# Emoji Removal
data_df['text'] = data_df['text'].apply(replace_emojis)

 # Lowercase
# data_df['text'] = data_df['text'].str.lower()

# Remove Diacritics
data_df['text'] = data_df['text'].apply(remove_diacritics)

# Remove symbols and numerics using regex
#data_df['text'] = data_df['text'].str.replace(r'[^A-Za-z\s#]', '', regex=True)
data_df['text'] = data_df['text'].str.replace(r'[^a-zA-Z0-9\s#!?]', '', regex=True)

# Remove English Stop Words
#data_df['text'] = data_df['text'].apply(remove_english_stopwords)

# Remove Filipino Stop Words
#data_df['text'] = data_df['text'].apply(remove_filipino_stopwords)

data_df.to_csv('sample1.csv', index=False)


data_df['text'] = data_df['text'].apply(preprocess_text)

In [16]:
data_df

Unnamed: 0,index,text,label
0,464,"[180, 4380, 170, 2718, 1299, 1126, 1403, 1126,...",Hate
1,1155,"[175, 11931, 10387, 17482, 21024, 1161, 175, 1...",Hate
2,1345,"[1195, 1127, 1198, 2520, 1164, 1240, 5837, 260...",Hate
3,324,"[176, 18266, 1389, 5358, 4163, 1204, 1126, 140...",Hate
4,2038,"[24181, 3624, 3624, 27629, 18974, 1161, 17713,...",Hate
...,...,...,...
5115,3808,"[187, 12809, 20792, 1116, 5871, 9524, 9524, 23...",Non-hate
5116,4475,"[2206, 1107, 1103, 1728, 5120, 1157, 1178, 111...",Non-hate
5117,3861,"[21718, 1301, 2665, 10449, 4553, 12999, 2980, ...",Non-hate
5118,3985,"[1103, 1236, 108, 191, 18597, 27008, 10913, 18...",Non-hate


In [17]:
train_df, test_df = train_test_split(data_df, test_size=0.2, random_state=SEED)

In [18]:
# Define the CNN model
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding_dropout = nn.Dropout(dropout)
        
        # Reduce the number of filters in convolutional layers for example
        self.conv1 = nn.Conv1d(in_channels=embedding_dim, out_channels=128, kernel_size=2, padding='same')
        self.bn1 = nn.BatchNorm1d(128)  # Add Batch Normalization
        self.pool1 = nn.MaxPool1d(2, 2)
        
        self.conv2 = nn.Conv1d(in_channels=128, out_channels=32, kernel_size=4, padding='same')
        self.bn2 = nn.BatchNorm1d(32)  # Add Batch Normalization
        self.global_pooling = nn.AdaptiveMaxPool1d(1)
        
        self.dropout = nn.Dropout(0.6)  # Increase dropout rate slightly
        self.fc = nn.Linear(32, output_dim)  # Change the number of input features to match changes in conv layers

    def forward(self, x):
        embedded = self.embedding(x)
        embedded = self.embedding_dropout(embedded)
        x = embedded.permute(0, 2, 1)
        
        x = F.relu(self.conv1(x))
        x = self.bn1(x)  # Add Batch Normalization after activation
        x = self.pool1(x)
        
        x = F.relu(self.conv2(x))
        x = self.bn2(x)  # Add Batch Normalization after activation
        x = self.global_pooling(x).squeeze(2)
        
        x = self.dropout(x)
        x = self.fc(x)
        return x


In [19]:
# Set up iterators
BATCH_SIZE = 64

In [20]:
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, dataframe, max_seq_length):
        self.data = dataframe
        self.max_seq_length = max_seq_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx]['text']
        label = self.data.iloc[idx]['label']

        # Padding and conversion to tensor
        padded_text = torch.tensor(text[:self.max_seq_length] + [0] * (self.max_seq_length - len(text)))
        return padded_text, label

In [21]:
train_dataset = TextDataset(train_df, 1000)
test_dataset = TextDataset(test_df, 1000)

In [22]:
train_iterator = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_iterator = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [23]:
# Define model hyperparameters
VOCAB_SIZE = tokenizer.vocab_size
EMBEDDING_DIM = 768
N_FILTERS = 100
FILTER_SIZES = [3, 4, 5]
OUTPUT_DIM = 1
DROPOUT = 0.5

# CNN Hyperparameters
hidden_dim = 100
n_conv_layers = 1
kernel_sizes = [2, 3, 4]
activation = nn.ReLU()

In [24]:
# Initialize CNN model
model = CNN(VOCAB_SIZE, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)
#Initialize CNN model
# model = CNN(VOCAB_SIZE, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, hidden_dim, n_conv_layers, kernel_sizes, activation)

In [25]:
# Initialize BERT model (for embedding extraction)
bert_model.eval()  # Set to evaluation mode

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.9, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-15): 16 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.9, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.9, inplace=False)
  

In [26]:
with torch.no_grad():
    for i, token in enumerate(tokenizer.get_vocab()):
        token_id = tokenizer.convert_tokens_to_ids(token)
        token_embedding = bert_model.embeddings.word_embeddings.weight[token_id]
        model.embedding.weight[i].data.copy_(token_embedding)

bert_parameters = []
for layer in bert_model.encoder.layer:
    bert_parameters.extend(layer.parameters())

# Create AdamW optimizer with custom hyperparameters for BERT embeddings
bert_learning_rate = 2e-5  # Adjust as needed
bert_optimizer = optim.AdamW(bert_parameters, lr=bert_learning_rate)

In [27]:
data_df

Unnamed: 0,index,text,label
0,464,"[180, 4380, 170, 2718, 1299, 1126, 1403, 1126,...",Hate
1,1155,"[175, 11931, 10387, 17482, 21024, 1161, 175, 1...",Hate
2,1345,"[1195, 1127, 1198, 2520, 1164, 1240, 5837, 260...",Hate
3,324,"[176, 18266, 1389, 5358, 4163, 1204, 1126, 140...",Hate
4,2038,"[24181, 3624, 3624, 27629, 18974, 1161, 17713,...",Hate
...,...,...,...
5115,3808,"[187, 12809, 20792, 1116, 5871, 9524, 9524, 23...",Non-hate
5116,4475,"[2206, 1107, 1103, 1728, 5120, 1157, 1178, 111...",Non-hate
5117,3861,"[21718, 1301, 2665, 10449, 4553, 12999, 2980, ...",Non-hate
5118,3985,"[1103, 1236, 108, 191, 18597, 27008, 10913, 18...",Non-hate


In [28]:
#optimizer = optim.Adam(model.parameters())
# Your custom hyperparameters
learning_rate = 0.001
beta_1 = 0.9
beta_2 = 0.999
epsilon = 1e-08
weight_decay = 0.0

# Create Adam optimizer with custom hyperparameters
optimizer = optim.Adam(model.parameters(), lr=learning_rate, betas=(beta_1, beta_2), eps=epsilon, weight_decay=weight_decay)
criterion = nn.BCEWithLogitsLoss()

In [29]:
# Train function
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def train(model, iterator):
    model.train()
    epoch_loss = 0
    for text_batch, label_batch in iterator:
        # Extract text sequences from the text_batch tensor
        texts = text_batch
        
        # Extract and process labels
        labels = [1 if label == 'Hate' else 0 for label in label_batch]  # Example conversion
        
        texts = texts.to(device)  # Move to device if needed
        labels = torch.tensor(labels, dtype=torch.float32).to(device)  # Convert to tensor
        
        optimizer.zero_grad()
        predictions = model(texts).squeeze(1)
        
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)


# Evaluate function
def evaluate(model, iterator):
    model.eval()
    epoch_loss = 0
    predicted_labels = []
    true_labels = []  # Declare the true_labels list
    
    with torch.no_grad():
        for text_batch, label_batch in iterator:
            texts = text_batch  # Extract text sequences
            labels = [1 if label == 'Hate' else 0 for label in label_batch]  # Example conversion
            
            texts = texts.to(device)  # Move to device
            labels = torch.tensor(labels, dtype=torch.float32).to(device)  # Convert to tensor
            
            predictions = model(texts).squeeze(1)
            loss = criterion(predictions, labels)
            
            epoch_loss += loss.item()
            predicted_labels.extend(torch.round(torch.sigmoid(predictions)).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    
    # Calculate accuracy, f1, precision, recall
    accuracy = accuracy_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels)
    recall = recall_score(true_labels, predicted_labels)
    
    return epoch_loss / len(iterator), accuracy, f1, precision, recall

In [30]:
N_EPOCHS = 10
for epoch in range(N_EPOCHS):
    train_loss = train(model, train_iterator)
    test_loss, accuracy, f1, precision, recall = evaluate(model, test_iterator)

    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tTest Loss: {test_loss:.3f}')
    print(f'\tAccuracy: {accuracy:.4f} | F1-Score: {f1:.4f}')
    print(f'\tPrecision: {precision:.4f} | Recall: {recall:.4f}')

  return F.conv1d(input, weight, bias, self.stride,


Epoch: 01
	Train Loss: 1.568
	Test Loss: 0.581
	Accuracy: 0.6846 | F1-Score: 0.7498
	Precision: 0.6181 | Recall: 0.9528
Epoch: 02
	Train Loss: 0.793
	Test Loss: 0.381
	Accuracy: 0.8428 | F1-Score: 0.8511
	Precision: 0.8028 | Recall: 0.9055
Epoch: 03
	Train Loss: 0.518
	Test Loss: 0.312
	Accuracy: 0.8818 | F1-Score: 0.8874
	Precision: 0.8413 | Recall: 0.9390
Epoch: 04
	Train Loss: 0.322
	Test Loss: 0.283
	Accuracy: 0.9033 | F1-Score: 0.9049
	Precision: 0.8837 | Recall: 0.9272
Epoch: 05
	Train Loss: 0.201
	Test Loss: 0.288
	Accuracy: 0.9043 | F1-Score: 0.9061
	Precision: 0.8825 | Recall: 0.9311


KeyboardInterrupt: 