In [1]:
!pip install opendatasets --quiet
import opendatasets as od
od.download("https://www.kaggle.com/datasets/abdelmalekeladjelet/sentiment-analysis-dataset")

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: tanmay01bhatt
Your Kaggle Key: ··········
Dataset URL: https://www.kaggle.com/datasets/abdelmalekeladjelet/sentiment-analysis-dataset
Downloading sentiment-analysis-dataset.zip to ./sentiment-analysis-dataset


100%|██████████| 8.68M/8.68M [00:00<00:00, 973MB/s]







In [2]:
import pandas as pd
import torch
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [3]:
data = pd.read_csv('/content/sentiment-analysis-dataset/sentiment_data.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,Comment,Sentiment
0,0,lets forget apple pay required brand new iphon...,1
1,1,nz retailers don’t even contactless credit car...,0
2,2,forever acknowledge channel help lessons ideas...,2
3,3,whenever go place doesn’t take apple pay doesn...,0
4,4,apple pay convenient secure easy use used kore...,2


In [4]:
data.drop(['Unnamed: 0'],axis=1,inplace=True)

In [5]:
data.dropna(inplace=True)

In [6]:
data.isnull().sum()

Unnamed: 0,0
Comment,0
Sentiment,0


In [7]:
data

Unnamed: 0,Comment,Sentiment
0,lets forget apple pay required brand new iphon...,1
1,nz retailers don’t even contactless credit car...,0
2,forever acknowledge channel help lessons ideas...,2
3,whenever go place doesn’t take apple pay doesn...,0
4,apple pay convenient secure easy use used kore...,2
...,...,...
241140,crores paid neerav modi recovered congress lea...,0
241141,dear rss terrorist payal gawar modi killing pl...,0
241142,cover interaction forum left,1
241143,big project came india modi dream project happ...,1


In [8]:
data = data.sample(n=9000,random_state=42).reset_index(drop=True)
data.shape

(9000, 2)

# Cleaning

In [9]:
import re
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [10]:
lemm = nltk.stem.WordNetLemmatizer()

In [11]:
def clean(text):
  text = text.lower()
  text = re.sub(r"\d", '', text)
  text = re.sub(r"[^\w\s]", '', text)
  tokens = text.split()
  tokens = [lemm.lemmatize(token) for token in tokens]
  return ' '.join(tokens)

In [12]:
data['cleaned'] = data['Comment'].apply(clean)

In [13]:
data.head()

Unnamed: 0,Comment,Sentiment,cleaned
0,indian giving message country going ahead new ...,2,indian giving message country going ahead new ...
1,hating modi trend loom replies instant famous,2,hating modi trend loom reply instant famous
2,hahaha modi khauff stronger sibling rivalries,2,hahaha modi khauff stronger sibling rivalry
3,watch official trailer modi subscribe erosnow ...,1,watch official trailer modi subscribe erosnow ...
4,called concern poor modi bjp doesnt poor count...,0,called concern poor modi bjp doesnt poor count...


# Split

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
x_train, x_test, y_train, y_test = train_test_split(data['cleaned'], data['Sentiment'], test_size=0.2)

# Tokenization

In [16]:
from transformers import BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [17]:
train_encodings = tokenizer(list(x_train), padding=True, truncation=True, max_length=200)
test_encodings = tokenizer(list(x_test), padding=True, truncation=True, max_length=200)

# Dataset

In [18]:
from torch.utils.data import DataLoader
from datasets import Dataset

In [19]:
train_dataset = Dataset.from_dict({'input_ids': train_encodings['input_ids'],
                                   'attention_mask': train_encodings['attention_mask'],
                                   'labels': y_train.tolist()})
test_dataset = Dataset.from_dict({'input_ids': test_encodings['input_ids'],
                                  'attention_mask': test_encodings['attention_mask'],
                                  'labels': y_test.tolist()})

# Model

In [20]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# LORA

In [22]:
from peft import LoraConfig, get_peft_model

In [23]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["query", "value"],  # BERT attention layers
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS"
)

In [24]:
model = get_peft_model(model, lora_config)

In [25]:
model.print_trainable_parameters()

trainable params: 297,219 || all params: 109,781,766 || trainable%: 0.2707


In [26]:
model = model.to(device)

# Train

In [27]:
from transformers import TrainingArguments, Trainer

In [28]:
training_args = TrainingArguments(
    output_dir="./sentiment-lora",
    eval_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-4,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=50,
    fp16=True,
    save_strategy="epoch",
    load_best_model_at_end=True
)

In [29]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [30]:
trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


Epoch,Training Loss,Validation Loss
1,0.8683,0.830168
2,0.7814,0.772112
3,0.71,0.71247
4,0.6263,0.694223
5,0.644,0.69316


TrainOutput(global_step=2250, training_loss=0.7626296971638997, metrics={'train_runtime': 607.621, 'train_samples_per_second': 59.247, 'train_steps_per_second': 3.703, 'total_flos': 3712872297600000.0, 'train_loss': 0.7626296971638997, 'epoch': 5.0})

#Inference

In [31]:
model.eval()
def predict_sentiment(text):
    # Tokenize input text
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=256
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    pred_id = torch.argmax(logits, dim=-1).item()


    label_map = {
        0: "Negative",
        1: "Neutral",
        2: "Positive"
    }
    return label_map.get(pred_id)

In [34]:
def sent(text):
  sentiment = predict_sentiment(text)
  print(f"Text: {text}")
  print(f"Predicted sentiment: {sentiment}")

In [35]:
text = 'That was the worst movie I have ever seen.'
out = sent(text)

Text: That was the worst movie I have ever seen.
Predicted sentiment: Negative


In [36]:
text = 'I absolutely loved the service and the experience!'
out = sent(text)

Text: I absolutely loved the service and the experience!
Predicted sentiment: Positive


In [37]:
text = 'The food was okay, not great but not terrible'
out = sent(text)

Text: The food was okay, not great but not terrible
Predicted sentiment: Neutral
