# **Text Classification Sentiment Analysis**

Step-by-Step Amazon Sentiment Classifier with BERT (PyTorch)

In [42]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
from transformers import AutoModel, BertTokenizerFast
from torch.optim import AdamW
from datasets import load_dataset
import time

In [43]:
# Set up GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("GPU available :", torch.cuda.is_available)
print("Device Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")

GPU available : <function is_available at 0x788910f3eca0>
Device Name: Tesla T4


In [44]:
!pip install -U datasets fsspec

Collecting fsspec
  Using cached fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)


In [45]:
# Load Amazon Polarity dataset
dataset = load_dataset("amazon_polarity")

In [46]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['label', 'title', 'content'],
        num_rows: 3600000
    })
    test: Dataset({
        features: ['label', 'title', 'content'],
        num_rows: 400000
    })
})


In [47]:
# Take subset for faster experimentation (optional)
train_texts = dataset['train']['content'][:20000]
train_labels = dataset['train']['label'][:20000]
test_texts = dataset['test']['content'][:5000]
test_labels = dataset['test']['label'][:5000]


In [48]:
# Train/val split
from sklearn.model_selection import train_test_split

train_text, val_text, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.1, stratify=train_labels,
    random_state=42)

In [49]:
# Load tokenizer and model
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
bert = AutoModel.from_pretrained("bert-base-uncased")

In [50]:
# Tokenization
def tokenize(texts):
    return tokenizer.batch_encode_plus(texts,
                                       max_length=128,
                                       padding='max_length',
                                       truncation=True,
                                       return_tensors="pt")

In [51]:
tokens_train = tokenize(train_text)
tokens_val = tokenize(val_text)
tokens_test = tokenize(test_texts)

In [52]:
# Convert to tensors
train_seq = tokens_train['input_ids']
train_mask = tokens_train['attention_mask']
train_y = torch.tensor(train_labels)


val_seq = tokens_val['input_ids']
val_mask = tokens_val['attention_mask']
val_y = torch.tensor(val_labels)


test_seq = tokens_test['input_ids']
test_mask = tokens_test['attention_mask']
test_y = torch.tensor(test_labels)

In [53]:
# Dataloaders

batch_size = 32
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Define Model Architecture

In [54]:
# Define Model Architecture
class BERT_Arch(nn.Module):
    def __init__(self, bert):
        super(BERT_Arch, self).__init__()
        self.bert = bert
        self.dropout = nn.Dropout(0.3)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(768, 512)
        self.fc2 = nn.Linear(512, 2)  # For binary classification
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        output = self.bert(sent_id, attention_mask = mask)
        cls_hs = output.last_hidden_state[:, 0]
        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x

model = BERT_Arch(bert).to(device)



In [55]:
# Class weights to handle imbalance
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.unique(train_labels),
                                     y=train_labels)

weights = torch.tensor(class_weights, dtype=torch.float).to(device)


# Loss and Optimizer
loss_fn = nn.NLLLoss(weight=weights)
optimizer = AdamW(model.parameters(), lr=1e-5)

**Training Function**

In [56]:
# Training function
def train():
    model.train()
    total_loss = 0
    total_preds = []

    for step, batch in enumerate(train_dataloader):
        sent_id, mask, labels = [b.to(device) for b in batch]
        model.zero_grad()

        preds = model(sent_id, mask)
        loss = loss_fn(preds, labels)
        total_loss += loss.item()

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        total_preds.append(preds.detach().cpu().numpy())

    avg_loss = total_loss / len(train_dataloader)
    total_preds = np.concatenate(total_preds, axis=0)
    return avg_loss, total_preds


**Evaluate Function**

In [57]:
# Evaluation function
def evaluate():
    model.eval()
    total_loss = 0
    total_preds = []

    for batch in val_dataloader:
        sent_id, mask, labels = [b.to(device) for b in batch]

        with torch.no_grad():
            preds = model(sent_id, mask)
            loss = loss_fn(preds, labels)
            total_loss += loss.item()
            total_preds.append(preds.detach().cpu().numpy())

    avg_loss = total_loss / len(val_dataloader)
    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

**Train the Model**

In [58]:
# Train the model
epochs = 3
best_valid_loss = float('inf')

for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")
    train_loss, _ = train()
    valid_loss, _ = evaluate()
    print(f"Training Loss: {train_loss:.3f}")
    print(f"Validation Loss: {valid_loss:.3f}")

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'amazon_sentiment_bert.pt')


Epoch 1/3
Training Loss: 0.281
Validation Loss: 0.239

Epoch 2/3
Training Loss: 0.159
Validation Loss: 0.242

Epoch 3/3
Training Loss: 0.109
Validation Loss: 0.246


**Load the Best Model**

In [59]:
model.load_state_dict(torch.load('amazon_sentiment_bert.pt'))

<All keys matched successfully>

**Prediction**

In [60]:
from torch.utils.data import DataLoader, TensorDataset, SequentialSampler

# Create a DataLoader for the test set
test_data = TensorDataset(test_seq, test_mask, test_y)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=32)

# Predict in batches
all_preds = []
true_labels = []

model.eval()
with torch.no_grad():
    for batch in test_dataloader:
        sent_id, mask, labels = [t.to(device) for t in batch]

        outputs = model(sent_id, mask)
        preds = torch.argmax(outputs, dim=1)

        all_preds.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())


# Evaluation
from sklearn.metrics import classification_report

print("\nTest Set Performance:\n")
print(classification_report(true_labels, all_preds, target_names=["Negative", "Positive"]))



Test Set Performance:

              precision    recall  f1-score   support

    Negative       0.93      0.90      0.91      2435
    Positive       0.91      0.94      0.92      2565

    accuracy                           0.92      5000
   macro avg       0.92      0.92      0.92      5000
weighted avg       0.92      0.92      0.92      5000



In [61]:
# Prediction function
def predict(text_list):
    model.eval()
    tokens = tokenizer.batch_encode_plus(text_list,
                                         max_length=128,
                                         padding='max_length',
                                         truncation=True,
                                         return_tensors="pt")

    input_ids = tokens['input_ids'].to(device)
    attention_mask = tokens['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        probs = torch.nn.functional.softmax(outputs, dim=1)
        preds = torch.argmax(probs, axis=1)
    return preds.cpu().numpy(), probs.cpu().numpy()

# Example inference
sample_texts = ["This product is amazing! I loved it and will buy again."]




predicted_labels, probabilities = predict(sample_texts)

for text, label, prob in zip(sample_texts, predicted_labels, probabilities):
    label_name = "Positive" if label == 1 else "Negative"
    confidence = prob[label]
    print(f"\nText: {text}")
    print(f"Predicted Label: {label_name} (Confidence: {confidence:.2f})")


Text: This product is amazing! I loved it and will buy again.
Predicted Label: Positive (Confidence: 0.99)


**Step-by-Step Code: English to Hindi Translation**

Here's a minimal working example using the MarianMT model (Helsinki-NLP/opus-mt-en-hi)

In [70]:
from transformers import MarianMTModel, MarianTokenizer

# Load pre-trained MarianMT model and tokenizer for English to Hindi

translation_model_name = "Helsinki-NLP/opus-mt-en-hi"
translation_tokenizer = MarianTokenizer.from_pretrained(translation_model_name)
translation_model = MarianMTModel.from_pretrained(translation_model_name).to(device)

**# Translation Function**

In [71]:
def translate_en_to_hi(text_list):
    # Tokenize input texts
    inputs = translation_tokenizer(text_list,
                       return_tensors="pt",
                       padding=True,
                       truncation=True,
                       max_length=128).to(device)




    # Remove token_type_ids if present (not supported by MarianMT)
    if "token_type_ids" in inputs:
      inputs.pop("token_type_ids")


    # Generate translation
    with torch.no_grad():
        translated_tokens = translation_model.generate(
            **inputs,
            max_length=128,
            num_beams=4,
            early_stopping=True
        )

    # Decode translations
    translated_texts = translation_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
    return translated_texts


In [72]:
# Example English sentences
english_sentences = [
    "This product is amazing! I loved it and will buy again.",
    "Worst purchase ever. Completely useless and waste of money.",
    "The quality is okay, not too good but acceptable."
]


# Translated to Hindi
translated_hindi = translate_en_to_hi(english_sentences)

# print results
for en, hi in zip(english_sentences, translated_hindi):
  print(f"\nEnglish: {en}\nHindi: {hi}")



English: This product is amazing! I loved it and will buy again.
Hindi: यह उत्पाद कमाल की बात है!

English: Worst purchase ever. Completely useless and waste of money.
Hindi: बहुत सारा पैसा बरबाद और बरबाद हो जाता है ।

English: The quality is okay, not too good but acceptable.
Hindi: यह गुण ठीक है, नहीं भी अच्छा लेकिन स्वीकार्य है ।


# **Combine with Sentiment Model**

In [73]:
for text in sample_texts:
    label, _ = predict([text])
    translated = translate_en_to_hi([text])[0]
    sentiment = "Positive" if label[0] == 1 else "Negative"
    print(f"\nOriginal: {text}")
    print(f"Translation: {translated}")
    print(f"Sentiment: {sentiment}")



Original: This product is amazing! I loved it and will buy again.
Translation: यह उत्पाद कमाल की बात है!
Sentiment: Positive
