In [1]:
%%capture

!pip install transformers
!pip install accelerate -U
!pip install datasets
!pip install huggingface_hub

In [2]:
%%capture

import torch
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from transformers import DistilBertForSequenceClassification, DistilBertConfig
import torch.nn as nn
from transformers import pipeline
from datasets import load_dataset
import nltk
nltk.download('punkt')
from transformers import Trainer
##others
import warnings
warnings.filterwarnings("ignore")
import os
from huggingface_hub import notebook_login
from transformers import TrainingArguments
os.environ["WANDB_DISABLED"] = "true"
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 50)

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Importing Dataset

In [4]:
data_path= '/content/drive/MyDrive/deep-learning/capstone_data.csv'

In [5]:
data= pd.read_csv(data_path)

In [6]:
data.head()

Unnamed: 0.1,Unnamed: 0,sentiment,clean_review
0,0,1,recently shown cable tv movie opens disclaimer...
1,1,1,i surprised film i touched lives paulie touche...
2,2,-1,now im one watch movies got poor reviews say h...
3,3,1,this film came twelve years years ago revelati...
4,4,1,when orphanage manager goes vacation father ta...


In [7]:
##checking for missing values

data.isna().sum()

Unnamed: 0      0
sentiment       0
clean_review    0
dtype: int64

In [8]:
##dropping the unneccessary column

data= data.drop(labels= "Unnamed: 0", axis=1)

In [9]:
data.head()

Unnamed: 0,sentiment,clean_review
0,1,recently shown cable tv movie opens disclaimer...
1,1,i surprised film i touched lives paulie touche...
2,-1,now im one watch movies got poor reviews say h...
3,1,this film came twelve years years ago revelati...
4,1,when orphanage manager goes vacation father ta...


In [10]:
data= data.rename(columns={'sentiment': 'label'})

### Data Splitting

In [11]:
train, eval = train_test_split(data, test_size= 0.2, random_state= 50)

In [12]:
train.shape ##checking the number of rows and columns

(20000, 2)

In [13]:
eval.shape  ##checking the number of rows and columns

(5000, 2)

 # iii. Loading Datasets using Load_dataset

In [14]:
train.to_csv("/content/train_set.csv")
eval.to_csv("/content/eval_set.csv")

In [15]:
##loading data in appopiriate format
dataset= load_dataset("csv", data_files={"train_set":"train_set.csv", "eval_set":"eval_set.csv" }, encoding= "ISO-8859-1")

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train_set split: 0 examples [00:00, ? examples/s]

Generating eval_set split: 0 examples [00:00, ? examples/s]

In [16]:
##checking the dataset
dataset

DatasetDict({
    train_set: Dataset({
        features: ['Unnamed: 0', 'label', 'clean_review'],
        num_rows: 20000
    })
    eval_set: Dataset({
        features: ['Unnamed: 0', 'label', 'clean_review'],
        num_rows: 5000
    })
})

###. Tokenization

In [17]:
##downloading the tokenizer
tokenizer= AutoTokenizer.from_pretrained("distilbert-base-uncased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [18]:
## our labels are-1, and 1 and we will like to transform them into 0,1, respectively

def transform_labels(input):
  label= input["label"]
  num =0

  if label== -1:
    num= 0  ##for negative sentiment
  elif label== 1:
    num =1 ##for positive sentiment
  return {"labels": num}

def tokenize(example):
  return tokenizer(example["clean_review"], padding= "max_length", truncation=True, return_tensors= "pt")

In [19]:
##converting data into token
dataset= dataset.map(tokenize, batched= True)
remove_columns=  ['Unnamed: 0', 'label', 'clean_review']
dataset = dataset.map(transform_labels, remove_columns=remove_columns)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [20]:
dataset

DatasetDict({
    train_set: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 20000
    })
    eval_set: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5000
    })
})

## ii.Modelling

In [21]:
train_dataset= dataset['train_set'].shuffle(seed=10)
eval_dataset= dataset['eval_set'].shuffle(seed=10)

In [22]:
##model= DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels= 2)

In [23]:
class CustomDistilBertForSequenceClassification(DistilBertForSequenceClassification):
    def __init__(self, config):
        super(CustomDistilBertForSequenceClassification, self).__init__(config)

        self.distilbert.dropout = nn.Dropout(0.5)  # Change dropout rate for the DistilBert model
        self.pre_classifier = nn.Linear(config.dim, config.dim)
        self.pre_classifier.dropout = nn.Dropout(0.5)  # Change dropout rate for the pre-classifier
        self.classifier = nn.Linear(config.dim, config.num_labels)
        self.dropout = nn.Dropout(0.5)  # Change the final dropout layer

In [24]:
# Load the DistilBERT config
config = DistilBertConfig.from_pretrained("distilbert-base-uncased", num_labels=2)

# Initialize your custom model with the new config
model = CustomDistilBertForSequenceClassification(config)








In [25]:
def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  accuracy = accuracy_score(labels, preds)
  return {"accuracy": accuracy}

In [26]:
batch_size= 16

In [29]:


training_args = TrainingArguments(
    output_dir="Distilbert-For-Capstone",
    num_train_epochs=5,
    load_best_model_at_end=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=True,
    per_device_train_batch_size=16)
    #weight_decay=0.01,
    #gradient_accumulation_steps=2)

In [30]:


notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [31]:
trainer= Trainer(
    model= model,
      args= training_args,
      train_dataset= train_dataset,
      eval_dataset= eval_dataset,
      tokenizer= tokenizer,
      compute_metrics=compute_metrics

)

In [32]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.3487,0.304393,0.8794
2,0.2338,0.346133,0.8828
3,0.1482,0.497104,0.8778
4,0.0986,0.56412,0.8804
5,0.0607,0.628208,0.8804


TrainOutput(global_step=6250, training_loss=0.18486887512207031, metrics={'train_runtime': 5439.1704, 'train_samples_per_second': 18.385, 'train_steps_per_second': 1.149, 'total_flos': 1.32467398656e+16, 'train_loss': 0.18486887512207031, 'epoch': 5.0})

In [33]:
trainer.push_to_hub()

'https://huggingface.co/gArthur98/Distilbert-For-Capstone/tree/main/'