In [None]:
# Mount Google Drive so files under /content/drive are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Python Package Installation for NLP and Machine Learning

%%capture

!pip install transformers
!pip install accelerate -U
!pip install datasets
!pip install huggingface_hub

## Import Libraries

In [None]:
%%capture

import torch
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from transformers import pipeline
from datasets import load_dataset
import nltk
nltk.download('punkt')
from transformers import TrainingArguments
from huggingface_hub import notebook_login
from transformers import Trainer

## Other setup: warning suppression, environment flag and pandas display limits
import warnings
warnings.filterwarnings("ignore")  # silence all Python warnings for the rest of the session
import os
os.environ["WANDB_DISABLED"] = "true"  # environment flag meant to switch off Weights & Biases logging
pd.set_option("display.max_rows", 50)  # display at most 50 rows of a DataFrame
pd.set_option("display.max_columns", 50)  # display at most 50 columns of a DataFrame

In [None]:
# Disable W&B logging (repeats the assignment already made in the imports cell)
os.environ["WANDB_DISABLED"] = "true"

## Importing Dataset

In [None]:
# Location of the movie-review CSV file on the mounted Google Drive
data_path= '/content/drive/MyDrive/capstone/movie_review_dataset.csv'

In [None]:
# Read the CSV file into a pandas DataFrame
data= pd.read_csv(data_path)

In [None]:
# Preview the first rows of the DataFrame
data.head()

Unnamed: 0,content,label,clean_content
0,recently shown on cable tv the movie opens wit...,1,recently shown on cable tv the movie opens wit...
1,i was very surprised with this film. i was tou...,1,i was very surprised with this film i was tou...
2,"now, i'm one to watch movies that got poor rev...",0,now i m one to watch movies that got poor rev...
3,"this film came out 12 years years ago, and was...",1,this film came out 12 years years ago and was...
4,"when an orphanage manager goes on vacation, hi...",1,when an orphanage manager goes on vacation hi...


In [None]:
## Count the missing values in each column

data.isna().sum()

content          0
label            0
clean_content    0
dtype: int64

### Data Splitting

In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on `label` keeps the class
# proportions the same in both parts, and random_state=42 makes the split repeatable.
# NOTE(review): `eval` shadows the Python built-in of the same name; the name is kept
# because later cells reference it.
train, eval = train_test_split(data, test_size= 0.2, random_state= 42,stratify=data['label'])

In [None]:
# (rows, columns) of the training split
train.shape

(19923, 3)

In [None]:
# (rows, columns) of the evaluation split
eval.shape

(4981, 3)

# iii. Loading Datasets using `load_dataset`

In [None]:
# Persist both splits as CSV files so they can be reloaded with `load_dataset`.
# No index=False is passed, so the DataFrame index is written as an extra unnamed first
# column; it shows up as the 'Unnamed: 0' feature once the files are loaded.
train.to_csv("/content/drive/MyDrive/capstone/train_set.csv")
eval.to_csv("/content/drive/MyDrive/capstone/eval_set.csv")

In [None]:
# Load the two CSV splits into a DatasetDict with "train_set" and "eval_set" splits.
# The files were written by pandas `to_csv`, whose default encoding is UTF-8, so they are
# decoded as UTF-8 here; decoding them as ISO-8859-1 would garble any non-ASCII character.
dataset = load_dataset(
    "csv",
    data_files={
        "train_set": "/content/drive/MyDrive/capstone/train_set.csv",
        "eval_set": "/content/drive/MyDrive/capstone/eval_set.csv",
    },
    encoding="utf-8",
)

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train_set split: 0 examples [00:00, ? examples/s]

Generating eval_set split: 0 examples [00:00, ? examples/s]

In [None]:
# Show the splits with their features and row counts
dataset

DatasetDict({
    train_set: Dataset({
        features: ['Unnamed: 0', 'content', 'label', 'clean_content'],
        num_rows: 19923
    })
    eval_set: Dataset({
        features: ['Unnamed: 0', 'content', 'label', 'clean_content'],
        num_rows: 4981
    })
})

### Tokenization

In [None]:
# Initializing RoBERTa Tokenizer
# Loads the tokenizer of the "roberta-base" checkpoint, the same checkpoint the model is built from.

tokenizer= AutoTokenizer.from_pretrained("roberta-base")

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
## The "label" column appears to hold 0 (negative) and 1 (positive) already; this helper
## copies it into a "labels" field, mapping a value of 1 to 1 and any other value to 0.

def transform_labels(input):
  """Return {"labels": 1} when input["label"] equals 1, otherwise {"labels": 0}."""
  is_positive = input["label"] == 1  ## positive sentiment
  return {"labels": 1 if is_positive else 0}

def tokenize(example):
  """Tokenize the "clean_content" text of `example` with the notebook-level `tokenizer`.

  The call asks for truncation, padding="max_length" and PyTorch tensors, and the
  tokenizer's result is returned unchanged.
  """
  text = example["clean_content"]
  encoding = tokenizer(text, padding= "max_length", truncation=True, return_tensors= "pt")
  return encoding

In [None]:
# Data Preprocessing for NLP: Tokenization and Label Transformation
# First pass: tokenize in batches, adding the tokenizer outputs as new columns.
dataset= dataset.map(tokenize, batched= True)
# Second pass: build the "labels" column and drop the original columns so that only
# the model inputs and labels remain.
remove_columns=  ['Unnamed: 0', 'label', 'content', 'clean_content']
dataset = dataset.map(transform_labels, remove_columns=remove_columns)

Map:   0%|          | 0/19923 [00:00<?, ? examples/s]

Map:   0%|          | 0/4981 [00:00<?, ? examples/s]

Map:   0%|          | 0/19923 [00:00<?, ? examples/s]

Map:   0%|          | 0/4981 [00:00<?, ? examples/s]

In [None]:
# Inspect the features that remain after preprocessing
dataset

DatasetDict({
    train_set: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 19923
    })
    eval_set: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 4981
    })
})

## ii. Modelling

In [None]:
# Initializing a Binary Sequence Classification Model with RoBERTa-Base
# num_labels=2 matches the two sentiment classes (0 = negative, 1 = positive).
model= AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels= 2)

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [None]:
# Sentiment Analysis Metric Computation Function
def compute_metrics(pred):
  """Compute the accuracy of an evaluation pass.

  `pred` exposes `predictions` (scores per class) and `label_ids` (true labels).
  The predicted class is the index of the highest score along the last axis.
  Returns a dict with the single key "accuracy".
  """
  predicted_classes = pred.predictions.argmax(-1)
  return {"accuracy": accuracy_score(pred.label_ids, predicted_classes)}

In [None]:
# Setting the Batch Size to 16 (intended number of training examples per device per step)
batch_size= 16

In [None]:

# Configuring the training run (the Trainer itself is built in a later cell)
training_args = TrainingArguments(
    output_dir="Roberta-capstone_2",  # checkpoint folder; also used as the repository name on the Hub
    num_train_epochs=5,
    load_best_model_at_end=True,  # reload the best checkpoint once training finishes
    evaluation_strategy="epoch",  # evaluate at the end of every epoch
    save_strategy="epoch",  # save a checkpoint at the end of every epoch
    push_to_hub=True,
    per_device_train_batch_size=batch_size,  # reuse the batch size defined in the previous cell
    weight_decay=0.01,  # Adding weight decay to handle overfitting
    # Gradients are accumulated over 2 steps before each optimizer update, so the
    # effective training batch is batch_size * 2 examples (it is enlarged, not reduced).
    gradient_accumulation_steps=2,
)

In [None]:
# Shuffle both splits with a fixed seed so the example order is repeatable
train_dataset= dataset['train_set'].shuffle(seed=10)
eval_dataset= dataset['eval_set'].shuffle(seed=10)

In [None]:
# Log in to the Hugging Face Hub (training is configured with push_to_hub=True)

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Build the Trainer from the model, training arguments, datasets, tokenizer and metric function
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Training the model; the returned TrainOutput summarises steps, loss and runtime
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2682,0.225415,0.912668
2,0.1648,0.225408,0.935756
3,0.1074,0.31139,0.93696
4,0.0692,0.327811,0.93676
5,0.0202,0.379392,0.936157


TrainOutput(global_step=3115, training_loss=0.10782695834556322, metrics={'train_runtime': 9981.2965, 'train_samples_per_second': 9.98, 'train_steps_per_second': 0.312, 'total_flos': 2.62098077796864e+16, 'train_loss': 0.10782695834556322, 'epoch': 5.0})

In [None]:
# Push the trainer's output to the Hugging Face Hub; the call returns the repository URL
trainer.push_to_hub()

'https://huggingface.co/Sonny4Sonnix/Roberta-capstone_2/tree/main/'

In [None]:
# Calculating Accuracy Metrics for NLP Models
# NOTE: `datasets.load_metric` is deprecated and has been removed from recent `datasets`
# releases, so accuracy is computed directly with NumPy instead of loading a metric.
# NOTE: the `trainer` built earlier keeps the `compute_metrics` function it was constructed
# with; redefining the name here does not change what `trainer.evaluate()` reports.

import numpy as np

def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max predictions.

    Args:
        eval_pred: a pair ``(logits, labels)``. The class scores are reduced with
            arg-max over the last axis and compared element-wise with ``labels``.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is the fraction of predictions
        equal to their label, as a Python float.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = float(np.mean(predictions == np.asarray(labels)))
    return {"accuracy": accuracy}

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [None]:
# Launch the final evaluation
# The reported loss and accuracy equal the epoch-2 row of the training table, which is
# consistent with load_best_model_at_end=True having restored that checkpoint.
trainer.evaluate()

{'eval_loss': 0.2254079431295395,
 'eval_accuracy': 0.9357558723147962,
 'eval_runtime': 155.0814,
 'eval_samples_per_second': 32.119,
 'eval_steps_per_second': 4.017,
 'epoch': 5.0}

In [None]:
# Access the training logs: a list of dicts holding the logged training and evaluation entries
training_logs = trainer.state.log_history
training_logs

[{'loss': 0.2682,
  'learning_rate': 4.197431781701445e-05,
  'epoch': 0.8,
  'step': 500},
 {'eval_loss': 0.2254152148962021,
  'eval_accuracy': 0.9126681389279261,
  'eval_runtime': 154.3973,
  'eval_samples_per_second': 32.261,
  'eval_steps_per_second': 4.035,
  'epoch': 1.0,
  'step': 623},
 {'loss': 0.1648,
  'learning_rate': 3.394863563402889e-05,
  'epoch': 1.61,
  'step': 1000},
 {'eval_loss': 0.2254079431295395,
  'eval_accuracy': 0.9357558723147962,
  'eval_runtime': 154.1669,
  'eval_samples_per_second': 32.309,
  'eval_steps_per_second': 4.041,
  'epoch': 2.0,
  'step': 1246},
 {'loss': 0.1074,
  'learning_rate': 2.592295345104334e-05,
  'epoch': 2.41,
  'step': 1500},
 {'eval_loss': 0.3113899230957031,
  'eval_accuracy': 0.9369604497088938,
  'eval_runtime': 154.0472,
  'eval_samples_per_second': 32.334,
  'eval_steps_per_second': 4.044,
  'epoch': 3.0,
  'step': 1869},
 {'loss': 0.0692,
  'learning_rate': 1.7897271268057786e-05,
  'epoch': 3.21,
  'step': 2000},
 {'eval_