# Sentiment Anlysis
- In this file we will perform sentiment analysis but the use of `LLM`

# Import Packages

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [2]:
import pandas as pd
import numpy as np
import torch
import transformers
import datasets
from datasets import load_dataset

# Load Dataset

In [3]:
# Download the dataset
!kaggle datasets download -d jp797498e/twitter-entity-sentiment-analysis

Dataset URL: https://www.kaggle.com/datasets/jp797498e/twitter-entity-sentiment-analysis
License(s): CC0-1.0
Downloading twitter-entity-sentiment-analysis.zip to /content
  0% 0.00/1.99M [00:00<?, ?B/s]
100% 1.99M/1.99M [00:00<00:00, 109MB/s]


In [4]:
# Unzip the dataset
!unzip twitter-entity-sentiment-analysis.zip

Archive:  twitter-entity-sentiment-analysis.zip
  inflating: twitter_training.csv    
  inflating: twitter_validation.csv  


In [5]:
df = pd.read_csv("/content/twitter_training.csv")
df.head(2)

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...


# Observation
- First we can add a columns name.
- Second we can remove unecessary columns.

In [6]:
df.columns = ["id", "entity", "sentiment", "content"]
df.head(2)

Unnamed: 0,id,entity,sentiment,content
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...


In [7]:
df.drop(columns=["id", "entity"], inplace=True)
df.head(2)

Unnamed: 0,sentiment,content
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...


# Step we can do
- first we can remove null and duplicates valeus.
- second we can convert the data into hugging face datasets so that its easy for me apply preprocessing.
- third we can do tokenization.
- fourth we can train the model.
- we can also encode the sentiment labels

In [8]:
df.isnull().sum()

Unnamed: 0,0
sentiment,0
content,686


In [9]:
df.dropna(inplace=True)

In [10]:
df.isnull().sum()

Unnamed: 0,0
sentiment,0
content,0


In [11]:
df.duplicated().sum()

4227

In [12]:
df.drop_duplicates(inplace=True)

In [13]:
df.duplicated().sum()

0

In [14]:
df.shape

(69768, 2)

In [15]:
df=df.sample(10000)

In [16]:
df.reset_index(drop=True, inplace=True)

In [17]:
df['sentiment'].unique()

array(['Irrelevant', 'Negative', 'Neutral', 'Positive'], dtype=object)

In [18]:
df['sentiment'] = df['sentiment'].map({
    "Positive": 0,  # Changed to 0
    "Negative": 1,  # Changed to 1
    "Neutral": 2,   # Changed to 2
    'Irrelevant': 3 # Changed to 3
})

In [19]:
df.head(2)

Unnamed: 0,sentiment,content
0,3,if any of you find any fortnite problems in wt...
1,1,"That ’ s only dead how it be, gotta wait till ..."


# Convert the data into hugging face dataset

In [20]:
data=datasets.Dataset.from_pandas(df)
data

Dataset({
    features: ['sentiment', 'content'],
    num_rows: 10000
})

# split the data into train and test set

In [21]:
data=data.train_test_split(test_size=0.2)

In [22]:
data

DatasetDict({
    train: Dataset({
        features: ['sentiment', 'content'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['sentiment', 'content'],
        num_rows: 2000
    })
})

In [23]:
# train data
train_data=data["train"]


# test data
test_data=data["test"]

train_data.shape, test_data.shape

((8000, 2), (2000, 2))

# Load Tokenizer and Model

In [49]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

# Test the tokenizer

In [25]:
train_data[0]["content"]

'Creepy geek who is not only a thief of intellectual property, but also acts like a liberal.'

In [26]:
# Encode the text
encode_embedding = tokenizer(train_data[0]["content"])

In [27]:
encode_embedding

{'input_ids': [5895, 15, 102, 63, 25328, 113, 19, 59, 163, 3, 9, 3, 7436, 15, 89, 13, 8445, 785, 6, 68, 92, 6775, 114, 3, 9, 10215, 5, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [28]:
# Decode the embedding
tokenizer.decode(encode_embedding['input_ids'])

'Creepy geek who is not only a thief of intellectual property, but also acts like a liberal.</s>'

# Apply tokenization
- For this step we can make a custom dataset class for data loading

In [29]:
from torch.utils.data import Dataset, DataLoader
from torch import nn

In [30]:
class CutomData(Dataset):
    def __init__(
            self,data,tokenizer
    ):
        self.data = data
        self.content = data['content']
        self.sentiment = data['sentiment']
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self,idx):
        # get the content and sentiment of the specific index
        content = self.content[idx]
        sentiment = self.sentiment[idx]

        # T5 Model expectd decoded input id
        decoder_input_ids = torch.tensor([[tokenizer.pad_token_id]])

        # apply tokenization
        content_embedding = self.tokenizer(content, padding = "max_length",truncation = True, return_tensors = "pt",max_length = 300)

        return {
            'input_ids' : content_embedding['input_ids'],
            'attention_mask' : content_embedding['attention_mask'],
            'labels' : torch.tensor(sentiment , dtype = torch.long)
        }

In [31]:
# Load the tokenize data
# Train data
tokenize_train_data = CutomData(train_data, tokenizer)

# Test data
tokenize_test_data = CutomData(test_data, tokenizer)

In [77]:
# Make a dataloader

# Train Loader
train_loader = DataLoader(tokenize_train_data, batch_size = 8, shuffle = True)

# Test Loader
test_loader = DataLoader(tokenize_test_data, batch_size = 8, shuffle = False)

In [78]:
# Check the data batch
for batch in train_loader:
    print(batch)
    break

{'input_ids': tensor([[[19716,    46,     3,  ...,     0,     0,     0]],

        [[  816,    82,   126,  ...,     0,     0,     0]],

        [[  368,  9747, 10527,  ...,     0,     0,     0]],

        ...,

        [[  100,    19,     6,  ...,     0,     0,     0]],

        [[24159,     6,    19,  ...,     0,     0,     0]],

        [[23908,  5279,     7,  ...,     0,     0,     0]]]), 'attention_mask': tensor([[[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]],

        ...,

        [[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]]]), 'labels': tensor([1, 0, 2, 3, 2, 1, 1, 2])}


# Observation
- We can make a dataset that we can use for model training.
- we can train the model for a single batch so that we can see if all the training should be fine.
- if all are fine we can train the model on full dataset.

# View the model

In [79]:
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

#  Change the model out_features

In [51]:
# Freeze all layers
for param in model.parameters():
    param.requires_grad = False

In [52]:
print("Model: ",model.lm_head)
# Model in_features
in_features = model.lm_head.in_features
print("in_features: ",in_features)

# Model Out_features
out_featurs = model.lm_head.out_features
print("out_featurs: ",out_featurs)

Model:  Linear(in_features=768, out_features=32128, bias=False)
in_features:  768
out_featurs:  32128


In [53]:
# Update the lm_head for a 4-class classification task
model.lm_head = nn.Linear(model.lm_head.in_features, 4)

In [54]:
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [55]:
# Unfreeze only the updated layer
for param in model.lm_head.parameters():
    param.requires_grad = True

# Observation
- Now we can update the final classification layer.
- Now we can train the model

# set the optimizer and loss fun

In [56]:
# optimzer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Loss
loss_fn = torch.nn.CrossEntropyLoss()

In [57]:
optimizer, loss_fn

(AdamW (
 Parameter Group 0
     amsgrad: False
     betas: (0.9, 0.999)
     capturable: False
     differentiable: False
     eps: 1e-08
     foreach: None
     fused: None
     lr: 5e-05
     maximize: False
     weight_decay: 0.01
 ),
 CrossEntropyLoss())

# Make a compute fun
- In compute fun will calculate the accuracy, precession, recall and f1 score

In [58]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [60]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(predictions, labels, average='weighted'):
    """
    Compute accuracy, precision, recall, and F1 score.

    Args:
        predictions (list or np.array): Predicted labels.
        labels (list or np.array): True labels.
        average (str): Type of averaging for multi-class metrics.
                       Options: 'micro', 'macro', 'weighted'. Default is 'macro'.

    Returns:
        dict: A dictionary with accuracy, precision, recall, and F1 score.
    """
    # Accuracy
    accuracy = accuracy_score(labels, predictions)

    # Precision, Recall, F1 Score
    precision = precision_score(labels, predictions, average=average, zero_division=0)
    recall = recall_score(labels, predictions, average=average, zero_division=0)
    f1 = f1_score(labels, predictions, average=average, zero_division=0)

    # Return results as a dictionary
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1
    }

# Train the model in a single batch

In [61]:
from tqdm import tqdm

In [80]:
epochs = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [81]:
from tqdm import tqdm

# Move the model to the correct device
model.to(device)

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    model.train()  # Ensure the model is in training mode

    # Training loop
    for batch in tqdm(train_loader, desc="Training"):
        # Move inputs and labels to the correct device
        input_ids = batch['input_ids'].squeeze(1).to(device)
        attention_mask = batch['attention_mask'].squeeze(1).to(device)
        labels = batch['labels'].to(device)

        # Prepare decoder inputs if needed
        decoder_input_ids = labels.unsqueeze(1) # Example of shifting labels for decoder input

        # Zero the gradients from the previous step
        optimizer.zero_grad()

        # Forward pass through the model
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            labels=labels
        )

        # Get the loss value
        loss = outputs.loss

        # Backpropagate the loss
        loss.backward()

        # Update model parameters
        optimizer.step()
        break

    # Evaluation at the end of the epoch
    model.eval()  # Set the model to evaluation mode
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].squeeze(1).to(device)
            attention_mask = batch['attention_mask'].squeeze(1).to(device)
            labels = batch['labels'].to(device)

            # Prepare decoder inputs if needed
            decoder_input_ids = labels.unsqueeze(1)

            # Forward pass
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                decoder_input_ids=decoder_input_ids,
                labels=labels
            )
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            break
    # Calculate metrics
    results = compute_metrics(all_preds, all_labels, average='weighted')
    print(f"Epoch {epoch + 1} Metrics: {results}")
    print(f"Loss: {loss.item()}")


Epoch 1/3


Training:   0%|          | 0/1000 [00:00<?, ?it/s]
Evaluating:   0%|          | 0/250 [00:00<?, ?it/s]


Epoch 1 Metrics: {'accuracy': 0.625, 'precision': 0.475, 'recall': 0.625, 'f1_score': 0.5178571428571428}
Loss: 1.2387237548828125
Epoch 2/3


Training:   0%|          | 0/1000 [00:00<?, ?it/s]
Evaluating:   0%|          | 0/250 [00:00<?, ?it/s]


Epoch 2 Metrics: {'accuracy': 0.625, 'precision': 0.475, 'recall': 0.625, 'f1_score': 0.5178571428571428}
Loss: 1.2285637855529785
Epoch 3/3


Training:   0%|          | 0/1000 [00:00<?, ?it/s]
Evaluating:   0%|          | 0/250 [00:00<?, ?it/s]

Epoch 3 Metrics: {'accuracy': 0.625, 'precision': 0.475, 'recall': 0.625, 'f1_score': 0.5178571428571428}
Loss: 1.2929673194885254





# Observation
- we can train the model in a single batch its working.
- Now we can train the model in full dataset.

# Full Training

In [82]:
from tqdm import tqdm

# Move the model to the target device
model.to(device)
epochs = 2
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    model.train()  # Ensure the model is in training mode

    # Training loop
    for batch in tqdm(train_loader, desc="Training"):
        # Move inputs and labels to the correct device
        input_ids = batch['input_ids'].squeeze(1).to(device)
        attention_mask = batch['attention_mask'].squeeze(1).to(device)
        labels = batch['labels'].to(device)

        # Prepare decoder inputs if needed
        decoder_input_ids = labels.unsqueeze(1) # Example of shifting labels for decoder input

        # Zero the gradients from the previous step
        optimizer.zero_grad()

        # Forward pass through the model
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            labels=labels
        )

        # Get the loss value
        loss = outputs.loss

        # Backpropagate the loss
        loss.backward()

        # Update model parameters
        optimizer.step()

    # Evaluation at the end of the epoch
    model.eval()  # Set the model to evaluation mode
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].squeeze(1).to(device)
            attention_mask = batch['attention_mask'].squeeze(1).to(device)
            labels = batch['labels'].to(device)

            # Prepare decoder inputs if needed
            decoder_input_ids = labels.unsqueeze(1)

            # Forward pass
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                decoder_input_ids=decoder_input_ids,
                labels=labels
            )
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    # Calculate metrics
    results = compute_metrics(all_preds, all_labels, average='weighted')
    print(f"Epoch {epoch + 1} Metrics: {results}")
    print(f"Loss: {loss.item()}")


Epoch 1/2


Training: 100%|██████████| 1000/1000 [02:51<00:00,  5.83it/s]
Evaluating: 100%|██████████| 250/250 [00:40<00:00,  6.17it/s]


Epoch 1 Metrics: {'accuracy': 0.825, 'precision': 0.7161779303062302, 'recall': 0.825, 'f1_score': 0.7580825321238197}
Loss: 1.2776035070419312
Epoch 2/2


Training: 100%|██████████| 1000/1000 [02:51<00:00,  5.84it/s]
Evaluating: 100%|██████████| 250/250 [00:40<00:00,  6.17it/s]


Epoch 2 Metrics: {'accuracy': 0.825, 'precision': 0.7161779303062302, 'recall': 0.825, 'f1_score': 0.7580825321238197}
Loss: 1.176134705543518


# Prediction
- Our model is train successfully now we can use the model for prediction.

In [85]:
def prediction(text):
    """
    - This fun is responsible for prediction only.
    - It can take the text and doc prediction.
    """
    encoded_text = tokenizer(text, padding="max_length", truncation=True, return_tensors="pt", max_length=300)
    input_ids = encoded_text["input_ids"].to(device)
    attention_mask = encoded_text["attention_mask"].to(device)
    decoder_input_ids = torch.tensor([[model.config.decoder_start_token_id]]).to(device)

    model.eval()
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask,decoder_input_ids=decoder_input_ids)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=-1).item()

    # Reverse the label mapping to get the original sentiment
    sentiment_mapping = {
        0: "Positive",
        1: "Negative",
        2: "Neutral",
        3: "Irrelevant"
    }
    predicted_sentiment = sentiment_mapping.get(predicted_class, "Unknown")

    return predicted_sentiment,predicted_class,logits

In [88]:
# prompt: do prediction

text = "I hate this movie. It's the worst thing I've ever seen."
print(prediction(text))

('Positive', 0, tensor([[[ 0.4300, -0.0902, -0.2312, -0.4145]]], device='cuda:0'))


In [None]:
text = "Creepy geek who is not only a thief of intellectual property, but also acts like a liberal.</s>"
print(prediction(text))