# Importing Necessary Libraries

In [None]:
!pip install -q datasets bitsandbytes einops

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.0/118.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m67.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m43.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m126.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m80.9 MB/s[0m eta [36

In [None]:
from datasets import load_dataset
from random import randrange

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, AutoPeftModelForCausalLM

from trl import SFTTrainer



In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.0 responses-0.18.0


In [None]:
!pip install nltk
import nltk
import re
import string
import numpy as np
from os import getcwd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer




In [None]:
# Fetch the NLTK stopword corpus that `stopwords.words('english')` reads during tweet cleaning.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

### Importing our CSV

In [None]:
import pandas as pd
import re
# Load the raw data; `hate.csv` is read from the current working directory.
# The cells below rely on its `comment` and `label` columns.
df = pd.read_csv('hate.csv')

Some rows carry the placeholder label 'O'. In those rows the real label ended up at the end of the `comment` column, so the following code recovers the label from the comment with a regex and drops any row in which no label can be found.


In [None]:
# Matches a trailing `?,P` or `?,N` (optionally followed by a closing double quote)
# at the very end of a comment; group 1 captures the label letter.
regex_pattern = r"\?,([PN])\"?$"


def extract_and_update_labels(row):
    """Return the label for `row`, recovering it from the comment text when needed.

    Rows whose `label` is the placeholder 'O' are searched with `regex_pattern`;
    when the comment ends with an embedded label, that letter ('P' or 'N') is
    returned. Every other row keeps its current label, including 'O' rows with
    no match or with a non-string comment (e.g. a missing value).

    Args:
        row: Mapping-like record (such as a DataFrame row) with 'label' and
            'comment' entries.

    Returns:
        The recovered label letter, or the row's existing label.
    """
    label = row['label']
    if label != 'O':
        return label

    comment = row['comment']
    # A missing comment is not a string (NaN/None); `re.search` would raise
    # TypeError on it, so such rows simply keep the placeholder label.
    if not isinstance(comment, str):
        return label

    match = re.search(regex_pattern, comment)
    return match.group(1) if match else label


# Remember which rows carry the 'O' placeholder before the labels are overwritten.
had_placeholder = df['label'] == 'O'

df['label'] = df.apply(extract_and_update_labels, axis=1)

# For rows whose label was recovered from the comment, cut the embedded `,P` / `,N`
# suffix out of the text (the '?' is kept). Otherwise the label letter would stay in
# the comment and reach the model as an input token.
recovered = had_placeholder & (df['label'] != 'O')
if recovered.any():
    df.loc[recovered, 'comment'] = (
        df.loc[recovered, 'comment'].str.replace(regex_pattern, '?', regex=True)
    )

# Drop rows whose label could not be recovered. `.copy()` makes the result an
# independent frame, so later column assignments do not raise SettingWithCopyWarning.
df = df[df['label'] != 'O'].copy()

# Print the labels that remain in the updated DataFrame
print(df.label.unique())

['N' 'P']


## Preprocessing the tweets

In [None]:
# Lazily-built resources shared by every `process_tweet` call.
_TWEET_RESOURCES = {}


def _tweet_resources():
    """Return the (stopword set, stemmer) pair, building it on first use."""
    if not _TWEET_RESOURCES:
        _TWEET_RESOURCES['stopwords'] = frozenset(stopwords.words('english'))
        _TWEET_RESOURCES['stemmer'] = PorterStemmer()
    return _TWEET_RESOURCES['stopwords'], _TWEET_RESOURCES['stemmer']


def process_tweet(tweet):
    """Clean one tweet and return it as a space-joined string of stemmed tokens.

    Steps: lowercase, strip `$TICKER` symbols, a leading "rt", hyperlinks and
    '#' characters, split on non-word characters, drop English stopwords and
    empty/punctuation tokens, then stem what is left.

    Args:
        tweet: Raw tweet text. Anything that is not a string (e.g. a missing
            value) is treated as an empty tweet.

    Returns:
        The processed tweet; an empty string when nothing survives cleaning.
    """
    # Missing values are not strings; without this guard `.lower()` would fail.
    if not isinstance(tweet, str):
        return ""

    # The stopword set and stemmer are built once and reused, instead of being
    # re-created for every tweet.
    stopwords_list, stemmer = _tweet_resources()

    # Convert to lowercase
    tweet = tweet.lower()

    # Removing stock market tickers like $GE
    tweet = re.sub(r'\$\w+', '', tweet)

    # Removing old style retweet text "RT" (already lowercased at this point)
    tweet = re.sub(r'^rt[\s]+', '', tweet)

    # Removing hyperlinks
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)

    # Removing only the '#' symbol; the hashtag word itself is kept
    tweet = re.sub(r'#', '', tweet)

    # Tokenize tweets using regex to split by non-word characters
    words = re.split(r'\W+', tweet)

    # Filter stopwords and punctuation, and apply stemming.
    # `word not in string.punctuation` is a substring test, so it also discards
    # the empty strings that `re.split` yields at the edges of the text.
    tweets_clean = [
        stemmer.stem(word)
        for word in words
        if word not in stopwords_list and word not in string.punctuation
    ]

    # Join the cleaned words to form the processed tweet
    return " ".join(tweets_clean)
# Replace every raw comment with its cleaned, stemmed form.
df['comment'] = df['comment'].map(process_tweet)

In [None]:
# Lowercase the comments with the vectorised string accessor; unlike a per-row
# lambda it does not fail on missing values.
df['comment'] = df['comment'].str.lower()

In [None]:
# Encode the string labels as integer class ids (N -> 0, P -> 1). The explicit cast
# keeps the column integer-typed without relying on `replace`'s deprecated implicit
# downcasting, and raises if any label other than N/P is still present.
df["label"] = df["label"].replace({"N": 0, "P": 1}).astype("int64")

## Converting the DataFrame to a Hugging Face dataset

In [None]:
from datasets import Dataset,load_dataset, load_from_disk, DatasetDict

# Wrap the cleaned frame as a Dataset (without the pandas index), then hold out
# 30 % of the shuffled rows as the test split using a fixed seed.
dataset = Dataset.from_pandas(df, preserve_index=False).train_test_split(
    test_size=0.3,
    shuffle=True,
    seed=200,
)

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def preprocess_function(examples):
    """Tokenize the `comment` field of a batch, truncating over-long inputs."""
    comments = examples["comment"]
    return tokenizer(comments, truncation=True)


# Tokenize the train split first, then the test split, in batched mode.
tokenized_train, tokenized_test = (
    dataset[split].map(preprocess_function, batched=True)
    for split in ("train", "test")
)

Map:   0%|          | 0/28798 [00:00<?, ? examples/s]

Map:   0%|          | 0/12342 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorWithPadding
# Batch collator built on the same tokenizer; it is handed to the Trainer as `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


## Loading the model - distilbert-base-uncased

In [None]:
from transformers import AutoModelForSequenceClassification
# DistilBERT checkpoint with a two-class sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Creating metric calculation function

In [None]:
import numpy as np

try:
    # `load_metric` is deprecated in favour of the `evaluate` library and is absent
    # from recent `datasets` releases. The import is kept only so the name stays
    # available where it still exists; `compute_metrics` no longer needs it.
    from datasets import load_metric
except ImportError:
    load_metric = None


def compute_metrics(eval_pred):
    """Compute accuracy and positive-class F1 for a batch of evaluation predictions.

    The metrics are computed directly with numpy, so nothing has to be loaded
    or downloaded each time the Trainer evaluates.

    Args:
        eval_pred: A `(logits, labels)` pair; the predicted class of each row
            is the argmax over the last axis of `logits`.

    Returns:
        dict with two floats: `"accuracy"` (fraction of correct predictions) and
        `"f1"` (binary F1 for class 1; 0.0 when there are no true positives,
        false positives or false negatives).
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    accuracy = float(np.mean(predictions == labels))

    predicted_pos = predictions == 1
    actual_pos = labels == 1
    true_pos = int(np.sum(predicted_pos & actual_pos))
    false_pos = int(np.sum(predicted_pos & ~actual_pos))
    false_neg = int(np.sum(~predicted_pos & actual_pos))

    # F1 = 2TP / (2TP + FP + FN); defined as 0.0 when the denominator is zero.
    denominator = 2 * true_pos + false_pos + false_neg
    f1 = 2 * true_pos / denominator if denominator else 0.0

    return {"accuracy": accuracy, "f1": f1}


In [None]:
from huggingface_hub import notebook_login
# Authenticate with the Hugging Face Hub so that the training run (`push_to_hub=True`)
# and the later `trainer.push_to_hub()` call can upload the model.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Defining training arguments

In [None]:
from transformers import TrainingArguments, Trainer

# Local output directory; it is also the name of the repository the model is pushed to.
repo_name = "finetuning-sentiment-model-40000-samples"

training_args = TrainingArguments(
    output_dir=repo_name,
    # Optimisation hyper-parameters
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Checkpointing and upload
    save_strategy="epoch",
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)


In [None]:
# Fine-tune the model with the arguments configured above; the cell output is the TrainOutput summary.
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.6291
1000,0.5581
1500,0.533
2000,0.4971
2500,0.4607


Step,Training Loss
500,0.6291
1000,0.5581
1500,0.533
2000,0.4971
2500,0.4607
3000,0.4462
3500,0.4423


TrainOutput(global_step=3600, training_loss=0.5078669463263618, metrics={'train_runtime': 14209.9907, 'train_samples_per_second': 4.053, 'train_steps_per_second': 0.253, 'total_flos': 942346684049208.0, 'train_loss': 0.5078669463263618, 'epoch': 2.0})

In [None]:
# Upload the trained model to the Hub; the cell output is the URL of the resulting repository.
trainer.push_to_hub()

'https://huggingface.co/SarthakBhatore/finetuning-sentiment-model-40000-samples/tree/main/'

## We got about 74 % accuracy (F1 ≈ 0.71) on the test set

In [None]:
# Evaluate on `tokenized_test` (the Trainer's eval_dataset); the result holds the loss
# plus the accuracy and F1 returned by `compute_metrics`.
trainer.evaluate()


  load_accuracy = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

{'eval_loss': 0.5091237425804138,
 'eval_accuracy': 0.7439637011829525,
 'eval_f1': 0.7079482439926063,
 'eval_runtime': 940.0965,
 'eval_samples_per_second': 13.128,
 'eval_steps_per_second': 0.821,
 'epoch': 2.0}