# Load modules

In [1]:
import re
import emoji
import time
import string
import os
import pickle

import torch
from tqdm.notebook import tqdm
import nltk

from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader

from transformers import BertForSequenceClassification

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load dataset for subjective data

In [2]:
# Load reddit dataset
reddit_df = pd.read_csv('Reddit_Data.csv',
                   names=['review', 'sentiment']).drop(0).reset_index(drop=True)

twitter_df = pd.read_csv('Twitter_Data.csv',
                         names=['review', 'sentiment']).drop(0).reset_index(drop=True)

df = pd.concat([reddit_df, twitter_df]).reset_index(drop=True)

# Remove neutral sentiments
df = df[df['sentiment'] != '0']
df['sentiment'].unique()

array(['1', '-1', nan], dtype=object)

In [3]:
df.isnull().sum()

review       2
sentiment    7
dtype: int64

In [4]:
# Drop null
df = df.dropna()
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [10]:
# Rename review column
df = df.rename(columns={"review": "sentence", "sentiment": "type"})
df.head()

Unnamed: 0,sentence,type
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
4,for your own benefit you may want read living ...,1
5,you should all sit down together and watch the...,-1


In [12]:
df['type'] = df['type'].replace(['1', '-1'], 'subjective')
df.head()


Unnamed: 0,sentence,type
0,family mormon have never tried explain them t...,subjective
1,buddhism has very much lot compatible with chr...,subjective
2,seriously don say thing first all they won get...,subjective
4,for your own benefit you may want read living ...,subjective
5,you should all sit down together and watch the...,subjective


In [29]:
# Get N rows for subjective dataset
N = 1529
subjective_df = df.sample(N).reset_index(drop=True)
subjective_df.head()

Unnamed: 0,sentence,type
0,lets compare work done modi varansi years and ...,subjective
1,would love bungie office when they see this p...,subjective
2,according the saubhagya portal many 582 house...,subjective
3,indian here you won hard about much from the i...,subjective
4,just back from after days extensive coverage c...,subjective


# Get objective data from Wikipedia

In [13]:
import wikipedia

def get_wiki_data(page_name):
    wikipedia.set_lang('en')
    try:
        if type(page_name) == int:
            page = wikipedia.page(pageid=page_name)
        else:
            page = wikipedia.page(page_name)

    except:
        print(f'Page {page_name} not found.')
        return None
    
    sections = re.split(r'\n==+', page.content)
    filtered_sections = []

    for section in sections:
        if 'See also ' in section.strip():
            break

        filtered_sections.append(section)
    filtered_text = ''.join(filtered_sections)

    r_header = r'\n ([^=]+)\s*==+\n'

    # Remove section headers from the text
    text = re.sub(r_header, '', filtered_text)
    text = re.sub('\n', '', text)
    text = re.sub('e.g.', 'for example', text)

    sentences = nltk.sent_tokenize(text)
    return sentences

In [21]:
page_names = [
    'Artificial intelligence',
    73291755,                                             # Gen AI
    72417803,                                             # ChatGPT
    'Gemini (chatbot)',
    'Transformer (deep learning architecture)',
    'Deep neural networks',
    73291899,                                             # AI boom
    'Stable diffusion',
    'Microsoft Copilot',
    'LLaMA',
    72861474,                                             # GPT-4
    'Hallucination (artificial intelligence)',
    'Artificial intelligence and copyright'
]

In [22]:
all_sentences = []
for page_name in page_names:
    sentences = get_wiki_data(page_name)
    if sentences is not None:
        all_sentences.extend(sentences)

In [23]:
len(all_sentences)

1529

In [26]:
obj = ['objective'] * len(all_sentences)
objective_df = pd.DataFrame(list(zip(all_sentences, obj)), columns=['sentence', 'type'])
objective_df.head()

Unnamed: 0,sentence,type
0,"Artificial intelligence (AI), in its broadest ...",objective
1,It is a field of research in computer science ...,objective
2,Such machines may be called AIs.AI technology ...,objective
3,Some high-profile applications include advance...,objective
4,"However, many AI applications are not perceive...",objective


In [27]:
objective_df.describe()

Unnamed: 0,sentence,type
count,1529,1529
unique,1527,1
top,"Languages tested were Polish, French, Korean, ...",objective
freq,2,1529


# Combine subjective and objective data together

In [33]:
# Combine and shuffle data
final_df = pd.concat([subjective_df, objective_df]).reset_index(drop=True)
final_df = final_df.sample(frac=1).reset_index(drop=True)

final_df.head()

Unnamed: 0,sentence,type
0,hatred for modi loss power once again has made...,subjective
1,Wikipedia was also one of the sources of ChatG...,objective
2,Recursive auto-encoders built atop word embedd...,objective
3,the vision our narendra modi translated realit...,subjective
4,"Reactions to the ceremony were mixed.In 2023, ...",objective


# Process data

In [34]:
def remove_not_ASCII(text):
    text = str(text)
    text = ''.join([word for word in text if word in string.printable])
#     print(text)
    return text

def replace_emoticons(text):
    text = text.replace("<3", "heart ")
    text = re.sub('>:-?\'?"?\(+', 'angry ', text)
    text = re.sub('\)+:-?\'?"?:<', 'angry ', text)
    text = re.sub(':-?\'?"?(o+|O+|0+)', 'surprised ', text)
    text = re.sub(':-?\'?"?(\)+|>+|D+)', 'smile ', text)
    text = re.sub('(\(+|<+)-?\'?"?:', 'smile ', text)
    text = re.sub(':-?\'?"?\(+', 'sad ', text)
    text = re.sub('(\)+|>+|D+)-?\'?"?:', 'sad ', text)
    
    return text

def text_preprocessing(text):
    # Clean dataset
    text = replace_emoticons(text)                           # convert emoticon to text
    text = emoji.demojize(text, delimiters=("", " "))        # convert emoji to text
    text = remove_not_ASCII(text)                            # remove non-ASCII characters

    text = re.sub('<br />', '', text)                        # remove <br />
    text = re.sub('^https?:\/\/S+', '', text)                # remove URLs
    
    text = re.sub('u/\S+', 'user', text)                     # replace user mentions
    text = re.sub('@\S+', 'user', text)
    text = re.sub('r/\S+', 'subreddit', text)                # replace subreddit mentions
    return text


In [35]:
final_df['sentence'] = final_df['sentence'].apply(text_preprocessing)
final_df.head()

Unnamed: 0,sentence,type
0,hatred for modi loss power once again has made...,subjective
1,Wikipedia was also one of the sources of ChatG...,objective
2,Recursive auto-encoders built atop word embedd...,objective
3,the vision our narendra modi translated realit...,subjective
4,"Reactions to the ceremony were mixed.In 2023, ...",objective


In [36]:
final_df['type'].value_counts()

type
subjective    1529
objective     1529
Name: count, dtype: int64

In [41]:
possible_labels = final_df.type.unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
label_dict

{'subjective': 0, 'objective': 1}

In [43]:
final_df['label'] = final_df.type.replace(label_dict)
final_df.head()

  final_df['label'] = final_df.type.replace(label_dict)


Unnamed: 0,sentence,type,label
0,hatred for modi loss power once again has made...,subjective,0
1,Wikipedia was also one of the sources of ChatG...,objective,1
2,Recursive auto-encoders built atop word embedd...,objective,1
3,the vision our narendra modi translated realit...,subjective,0
4,"Reactions to the ceremony were mixed.In 2023, ...",objective,1


In [45]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(final_df.index.values, 
                                                  final_df.label.values, 
                                                  test_size=0.2, 
                                                  random_state=11, 
                                                  stratify=final_df.label.values)


In [52]:
final_df['data_type'] = ['not_set']*final_df.shape[0]

final_df.loc[X_train, 'data_type'] = 'train'
final_df.loc[X_val, 'data_type'] = 'val'


In [53]:
final_df.groupby(['type', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sentence
type,label,data_type,Unnamed: 3_level_1
objective,1,train,1223
objective,1,val,306
subjective,0,train,1223
subjective,0,val,306


# Encode data

In [54]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)

In [56]:
encoded_data_train = tokenizer.batch_encode_plus(
    final_df[final_df.data_type=='train'].sentence.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    final_df[final_df.data_type=='val'].sentence.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(final_df[final_df.data_type=='train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(final_df[final_df.data_type=='val'].label.values)


In [57]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)


In [58]:
len(dataset_train), len(dataset_val)

(2446, 612)

In [59]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [60]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 32

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)


In [61]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                  lr=1e-5, 
                  eps=1e-8)



In [77]:
# Using Early stopper to stop when the F1 Score prediction drops 
class EarlyStopper:
    def __init__(self, patience=3, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.min_f1_score = 0
        self.min_accuracy = 0

    def early_stop_f1(self, f1_score):
        if f1_score > self.min_f1_score:
            self.min_f1_score = f1_score
            self.counter = 0
        elif f1_score <= (self.min_f1_score + self.min_delta):
            self.counter += 1
            if self.counter >= self.patience:
                self.counter = 0
                return True
        return False

    def early_stop_accuracy(self, accuracy):
        if accuracy > self.min_accuracy:
            self.min_accuracy = accuracy
            self.counter = 0
        elif accuracy < (self.min_accuracy + self.min_delta):
            self.counter += 1
            if self.counter >= self.patience:
                self.counter = 0
                return True
        return False

# Train Model

In [99]:
epochs = 25

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

early_stopper = EarlyStopper(patience=3, min_delta=0)

In [100]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    
    total_preds = 0
    total_correct = 0

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        
        total_preds += len(y_preds)
        num_correct = len(y_preds[y_preds==label])
        total_correct += num_correct
        acc = num_correct/len(y_true)
        
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {acc}')
        print(f'Accuracy (in percentage): {acc*100:.3f}\n')
    
    print(f'Total accuracy: {(total_correct/total_preds)*100:.3f}')
    print('='*50, '\n')

In [101]:
import random

seed_val = 11
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [102]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)

cuda


In [103]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [104]:
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    
    for batch in progress_bar:
        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
              
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')
    
    # Check for early stop
    if early_stopper and early_stopper.early_stop_f1(val_f1):
        print(f"Early stopping at epoch {epoch} due to no improvement in f1 score.")
        torch.save(model.state_dict(), f'finetuned_BERT_obj_epoch_{epoch}.model')
        break

  0%|          | 0/25 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/77 [00:00<?, ?it/s]


Epoch 1
Training loss: 4.092097860139601e-05
Validation loss: 0.002468238110668608
F1 Score (Weighted): 0.9983660087092804


Epoch 2:   0%|          | 0/77 [00:00<?, ?it/s]


Epoch 2
Training loss: 3.7247116850196177e-05
Validation loss: 0.002556029464540188
F1 Score (Weighted): 0.9983660087092804


Epoch 3:   0%|          | 0/77 [00:00<?, ?it/s]


Epoch 3
Training loss: 3.4662628812449316e-05
Validation loss: 0.002327366713234369
F1 Score (Weighted): 0.9983660087092804


Epoch 4:   0%|          | 0/77 [00:00<?, ?it/s]


Epoch 4
Training loss: 3.1750364719123534e-05
Validation loss: 0.0021637316889609793
F1 Score (Weighted): 0.9983660087092804
Early stopping at epoch 4 due to no improvement in f1 score.


# Test model and get accuracy

In [105]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [106]:
epoch = 4
model.load_state_dict(torch.load(f'finetuned_BERT_obj_epoch_{epoch}.model', map_location=torch.device('cuda')))
print('Model loaded, getting predictions...')
_, predictions, true_vals = evaluate(dataloader_validation)
accuracy_per_class(predictions, true_vals)
f1_score = f1_score_func(predictions, true_vals)


Model loaded, getting predictions...
Class: subjective
Accuracy: 0.9967320261437909
Accuracy (in percentage): 99.673

Class: objective
Accuracy: 1.0
Accuracy (in percentage): 100.000

Total accuracy: 99.837



# Test with custom data

In [118]:
model.load_state_dict(torch.load(f'finetuned_BERT_obj_epoch_{epoch}.model', map_location=torch.device('cuda')))

text = input("Enter some text: ")
text = text_preprocessing(text)

encoded_text = tokenizer.encode_plus(
    text, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)
# print(encoded_text)

input_ids_text = encoded_text['input_ids']
attention_masks_text = encoded_text['attention_mask']

dataset = TensorDataset(input_ids_text, attention_masks_text)
dataloader = DataLoader(dataset, 
                        sampler=RandomSampler(dataset), 
                        batch_size=1)

for batch in dataloader:
    batch = tuple(b.to(device) for b in batch)
    inputs = {'input_ids':      batch[0],
              'attention_mask': batch[1]
             }

    with torch.no_grad():        
        outputs = model(**inputs)
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()

#     print(logits)
    pred = np.argmax(logits, axis=1).flatten()[0]
    print(f'Prediction: {list(label_dict.keys())[pred]}')


Enter some text: ChatGPT has copyright issues
Prediction: objective
