In [71]:
import pandas as pd
import torch
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from datasets import Dataset, DatasetDict
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from sklearn.preprocessing import LabelEncoder
from langdetect import detect
import re
import nltk
from nltk.tokenize import word_tokenize

In [43]:
nltk.download('punkt_tab')
print(nltk.data.path)

['/home/defalt/nltk_data', '/mnt/partition1/machine_learning/Bengali_Sentiment_Analysis_and_Classification/venv/nltk_data', '/mnt/partition1/machine_learning/Bengali_Sentiment_Analysis_and_Classification/venv/share/nltk_data', '/mnt/partition1/machine_learning/Bengali_Sentiment_Analysis_and_Classification/venv/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data']


[nltk_data] Downloading package punkt_tab to /home/defalt/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Ensure GPU usage

In [44]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


## Load Bengali Sentiment Dataset

In [None]:

df = pd.read_csv("/mnt/partition1/machine_learning/Bengali_Sentiment_Analysis_and_Classification/dataset.csv")
df.head()

## Select relevant columns

In [None]:
df = df[["Review", "Sentiment"]]
df.columns = ["text", "label"]
df.head()

## Convert labels to numeric values (0: Negative, 1: Positive)

In [None]:
label_mapping = {"negative": 0, "positive": 1}
df[["label"]] = df[["label"]].apply(LabelEncoder().fit_transform)
df.head(100)

## Preprocessing Mixed-Language Text

In [None]:
def preprocess_text(text):
    print(f"Before tokenized : {text}")
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r"[\d]+", "", text)  # Remove numbers
    text = re.sub(r"[^\w\sঀ-৿.,!?₹$]", "", text)  # Keep Bengali and English letters only
    tokens = word_tokenize(text)  # Tokenize using NLTK
    tokenized_text = " ".join(tokens)  # Convert tokens back to string
    print(f"after tokenized : {tokenized_text}")
    return tokenized_text

def detect_language(text):
    try:
        return detect(text)
    except:
        return "unknown"

df["text"] = df["text"].apply(preprocess_text)
df["language"] = df["text"].apply(detect_language)
df.head()

## Split Data into Train & Test Sets

In [49]:
train_texts, test_texts, train_labels, test_labels = train_test_split(df["text"], df["label"], test_size=0.2, random_state=42)

## Convert to Hugging Face Dataset Format

In [50]:
train_dataset = Dataset.from_dict({"text": train_texts.tolist(), "label": train_labels.tolist()})
test_dataset = Dataset.from_dict({"text": test_texts.tolist(), "label": test_labels.tolist()})
datasets = DatasetDict({"train": train_dataset, "test": test_dataset})

## Train SVM Classifier

In [51]:
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(train_texts)
X_test_tfidf = vectorizer.transform(test_texts)

svm_model = SVC(kernel='linear')
svm_model.fit(X_train_tfidf, train_labels)

svm_predictions = svm_model.predict(X_test_tfidf)

## Compute Metrics for SVM

In [52]:
svm_accuracy = accuracy_score(test_labels, svm_predictions)
svm_precision = precision_score(test_labels, svm_predictions, average='weighted')
svm_f1 = f1_score(test_labels, svm_predictions, average='weighted')
svm_recall = recall_score(test_labels, svm_predictions)

print(f"SVM Accuracy: {svm_accuracy:.4f}, Precision: {svm_precision:.4f}, F1 Score: {svm_f1:.4f}, Recall Score: {svm_recall:.4f}")

SVM Accuracy: 0.9098, Precision: 0.9023, F1 Score: 0.9023, Recall Score: 0.9736


## Train BanglaBERT Model

In [None]:
tokenizer = BertTokenizer.from_pretrained('sagorsarker/bangla-bert-base')
model = BertForSequenceClassification.from_pretrained('sagorsarker/bangla-bert-base', num_labels=2)
model.to(device)

# Tokenizing the dataset for BanglaBERT
def tokenize_data(texts):
    return tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=128)

train_encodings = tokenize_data(train_texts.tolist())
test_encodings = tokenize_data(test_texts.tolist())

# Converting labels to tensor
train_labels = torch.tensor(train_labels.values).to(device)
test_labels = torch.tensor(test_labels.values).to(device)

# Dataloader for BanglaBERT model
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

train_dataloader = DataLoader(train_dataset, batch_size=100, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=100)

# Optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
loss_fn = torch.nn.CrossEntropyLoss()

# Training the BanglaBERT model

for epoch in range(10):  # Training for 10 epochs
    model.train()
    running_loss = 0.0
    for batch in train_dataloader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        # print(input_ids,' ',attention_mask,' ',labels,'\n')
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        
        # Optimizer step
        optimizer.step()
        running_loss += loss.item()
    avg_train_loss = running_loss / len(train_dataloader)
    print(f"Epoch: {epoch + 1}, Train Loss: {avg_train_loss:.4f}")

# Evaluating the BanglaBERT model
model.eval()
bangla_bert_predictions = []
with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, _ = [b.to(device) for b in batch]
        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)
        bangla_bert_predictions.extend(predictions.cpu().numpy())

## BanglaBERT Metrics

In [96]:
bangla_bert_predictions = bangla_bert_predictions.cpu().numpy() if torch.is_tensor(bangla_bert_predictions) else np.array(bangla_bert_predictions)
test_labels = test_labels.cpu().numpy() if torch.is_tensor(test_labels) else np.array(test_labels)

bangla_bert_accuracy = accuracy_score(test_labels, bangla_bert_predictions)
bangla_bert_precision = precision_score(test_labels, bangla_bert_predictions)
bangla_bert_recall = recall_score(test_labels, bangla_bert_predictions)
bangla_bert_f1 = f1_score(test_labels, bangla_bert_predictions)

print("BanglaBERT Model Metrics:")
print(f"Accuracy: {bangla_bert_accuracy * 100:.2f}%")
print(f"Precision: {bangla_bert_precision * 100:.2f}%")
print(f"Recall: {bangla_bert_recall * 100:.2f}%")
print(f"F1 Score: {bangla_bert_f1 * 100:.2f}%")

BanglaBERT Model Metrics:
Accuracy: 90.18%
Precision: 90.92%
Recall: 98.43%
F1 Score: 94.53%


## Live prediction function

In [None]:
def live_prediction(text):
    # Preprocess and tokenize the input text
    preprocessed_text = preprocess_text(text)

    svm_input = vectorizer.transform([preprocessed_text])
    svm_prediction = svm_model.predict(svm_input)
    svm_sentiment = 'Positive' if svm_prediction == 1 else 'Negative'


    encoded_input = tokenizer(preprocessed_text, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)
    
    # Get prediction from BanglaBERT model
    model.eval()
    with torch.no_grad():
        output = model(**encoded_input)
        prediction = torch.argmax(output.logits, dim=1).cpu().numpy()[0]
    
    # Mapping predicted class to sentiment
    bangla_bert_sentiment = 'Positive' if prediction == 1 else 'Negative'
    return svm_sentiment, bangla_bert_sentiment

# Example: Get live prediction
user_input = input("Enter a product review for prediction: ")
svm_result, bangla_bert_result = live_prediction(user_input)
print(f"SVM Prediction: The sentiment of the review is {svm_result}")
print(f"BanglaBERT Prediction: The sentiment of the review is {bangla_bert_result}")