In [1]:

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re

In [2]:
# Initialize the tokenizer and the classification model from one shared
# pretrained checkpoint ("bert-base-multilingual-cased").
PRETRAINED_CHECKPOINT = "bert-base-multilingual-cased"

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
# Two output classes; adjust num_labels as needed for other label sets.
model = AutoModelForSequenceClassification.from_pretrained(
    PRETRAINED_CHECKPOINT,
    num_labels=2,
)

Downloading:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceCl

In [3]:
# Read the training CSV and preview its first rows.
df = pd.read_csv("book_review_train.csv")
print(df.head())

# Column "Data" supplies the review texts, column "Sentiment" the labels;
# both are converted to plain Python lists.
texts, labels = (df[column].tolist() for column in ("Data", "Sentiment"))

                                                Data  Sentiment
0                                   ভালোই কিনতু বইটা          1
1  পড়লাম বইটা মাঝখানে কিছুটা বোরিং মনে হয়েছে অনুব...          0
2  বোরিং বোরিং বোরং ছোট গলপটির বইটি নেওয়া উচিত ছ...          0
3                                 বইটি চমৎকার লেগেছে          1
4    অসাধারণ বই লজিকাল সকিল ডেভেলপমেনট জনযে অতুলনীয়          1


In [4]:
# Tokenize all training texts in a single call, with padding enabled,
# truncation capped at max_length=128, and PyTorch tensors as the output.
tokenized_texts = tokenizer(
    texts,
    return_tensors="pt",
    max_length=128,
    truncation=True,
    padding=True,
)

# Dataset wrapper that pairs tokenizer output with per-example labels
class CustomDataset(Dataset):
    """Map-style dataset over pre-tokenized texts and their labels.

    Args:
        tokenized_texts: mapping that supports ``["input_ids"]`` and
            ``["attention_mask"]`` lookups; each value must be indexable by
            example position.
        labels: sequence indexable by the same example position.
    """

    def __init__(self, tokenized_texts, labels):
        self.tokenized_texts = tokenized_texts
        self.labels = labels

    def __len__(self):
        # The number of examples is the number of rows in "input_ids".
        return len(self.tokenized_texts["input_ids"])

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``labels`` entry at ``idx``."""
        encodings = self.tokenized_texts
        return {
            "input_ids": encodings["input_ids"][idx],
            "attention_mask": encodings["attention_mask"][idx],
            "labels": self.labels[idx],
        }

# Wrap the tokenized training texts and their labels for the Trainer.
train_dataset = CustomDataset(tokenized_texts=tokenized_texts, labels=labels)

In [5]:
# Define training arguments.
# No evaluation strategy is requested: the Trainer below receives no
# eval_dataset, so asking for step-based evaluation would fail as soon as an
# evaluation step is reached. Leaving the option at its default keeps
# evaluation disabled.
training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    output_dir="./sentiment_model_bert_base",
    num_train_epochs=3,
    save_steps=100,
    save_total_limit=2,
)

# Create a DataLoader for training.
# NOTE: this loader is not passed to the Trainer, which is given
# `train_dataset` directly; it is kept only so the name stays defined.
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Initialize Trainer for fine-tuning
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

trainer.train()

# Persist the fine-tuned model into training_args.output_dir.
trainer.save_model()


***** Running training *****
  Num examples = 148
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 57
  Number of trainable parameters = 177854978


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./sentiment_model_bert_base
Configuration saved in ./sentiment_model_bert_base\config.json
Model weights saved in ./sentiment_model_bert_base\pytorch_model.bin


In [None]:
# Load the tokenizer (using "bert-base-multilingual-cased" model)
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

# Load the fine-tuned model from the directory the training cell saved it to
# (the TrainingArguments output_dir). A path relative to the notebook keeps
# this cell runnable on machines other than the one it was written on.
model = AutoModelForSequenceClassification.from_pretrained("./sentiment_model_bert_base")


In [9]:
# Read the test CSV (replace with the path to your test dataset).
df_test = pd.read_csv("book_review_test.csv")

# Pull the review texts ("Data") and labels ("Sentiment") out as Python lists.
test_texts, test_labels = (
    df_test[column].tolist() for column in ("Data", "Sentiment")
)

# Tokenize the test texts with the same settings used for the training texts.
tokenized_test_texts = tokenizer(
    test_texts,
    return_tensors="pt",
    max_length=128,
    truncation=True,
    padding=True,
)

In [10]:
# Dataset wrapper for the test split (same interface as CustomDataset)
class CustomTestDataset(Dataset):
    """Map-style dataset over pre-tokenized test texts and their labels.

    Args:
        tokenized_texts: mapping with ``"input_ids"`` and ``"attention_mask"``
            entries, each indexable by example position.
        labels: sequence indexable by the same example position.
    """

    _ENCODING_KEYS = ("input_ids", "attention_mask")

    def __init__(self, tokenized_texts, labels):
        self.tokenized_texts = tokenized_texts
        self.labels = labels

    def __len__(self):
        # The dataset size follows the number of "input_ids" rows.
        return len(self.tokenized_texts["input_ids"])

    def __getitem__(self, idx):
        """Build the example dict for position ``idx``."""
        example = {key: self.tokenized_texts[key][idx] for key in self._ENCODING_KEYS}
        example["labels"] = self.labels[idx]
        return example

# Wrap the tokenized test texts and their labels for evaluation.
test_dataset = CustomTestDataset(tokenized_texts=tokenized_test_texts, labels=test_labels)

In [11]:
# Create a DataLoader for test data
test_dataloader = DataLoader(test_dataset, batch_size=8)

# Evaluate the model on the test dataset
model.eval()  # switch the model to inference behaviour before predicting
true_labels = []
predicted_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        # Read tensors straight from the batch instead of binding them to
        # notebook-level names. In particular, `labels` (the training label
        # list) must stay intact so earlier cells can be re-run safely.
        outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"])

        # Convert logits to predicted class indices (argmax over the class axis).
        batch_predictions = torch.argmax(outputs.logits, dim=1)
        predicted_labels.extend(batch_predictions.tolist())
        true_labels.extend(batch["labels"].tolist())

# Calculate evaluation metrics
accuracy = accuracy_score(true_labels, predicted_labels)
classification_rep = classification_report(true_labels, predicted_labels)
confusion_mtx = confusion_matrix(true_labels, predicted_labels)

# Print the evaluation results
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_rep)
print("Confusion Matrix:\n", confusion_mtx)


Accuracy: 0.8333333333333334
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.73      0.80        22
           1       0.80      0.92      0.86        26

    accuracy                           0.83        48
   macro avg       0.84      0.83      0.83        48
weighted avg       0.84      0.83      0.83        48

Confusion Matrix:
 [[16  6]
 [ 2 24]]


In [14]:
# Input paragraph
input_paragraph = "বাংলাদেশ একটি সুন্দর দেশ। এখানে সুন্দর প্রদেশের দৃশ্য, সহনীয় মানুষ, এবং প্রাচীন ঐতিহ্য রয়েছে। বাংলাদেশে বর্ষা আসলে সবকিছু সবুজে লিপটে যায়। এখানে পাট, জুতা, তাঁত, বাংলাদেশের প্রধান কৃষি ও শিল্পক্ষেত্রের প্রধান আয়োর উৎস।"

# Split the paragraph into sentences on "।" and "?".
# Each fragment is stripped of surrounding whitespace, and empty fragments
# (such as the one left after the final terminator) are dropped, so
# `sentences` holds only clean, non-empty sentences.
sentences = [
    fragment.strip()
    for fragment in re.split(r'[।?]', input_paragraph)
    if fragment.strip()
]

# Define a function to get sentiment, positive percentage, and negative percentage for a sentence
def get_sentiment_and_percentages(input_text):
    """Classify one text and report the class probabilities as percentages.

    Relies on the notebook-level ``tokenizer`` and ``model``.

    Args:
        input_text: the text to classify (a single string).

    Returns:
        A tuple ``(sentiment, positive_percentage, negative_percentage)``:
        ``sentiment`` is ``"positive"`` or ``"negative"``, and the two
        percentages are the softmax probabilities of class 1 and class 0
        scaled to the 0-100 range.
    """
    tokenized_input = tokenizer(input_text, padding=True, truncation=True, max_length=128, return_tensors="pt")

    # Make sure the model is in inference mode even if no earlier cell
    # switched it, and skip gradient tracking since nothing is trained here.
    model.eval()
    with torch.no_grad():
        output = model(**tokenized_input, return_dict=True)

    # Get the probability distribution over classes
    probabilities = torch.softmax(output.logits, dim=1)
    positive_percentage = probabilities[0, 1].item() * 100  # Percentage for the positive class
    negative_percentage = probabilities[0, 0].item() * 100  # Percentage for the negative class

    # Map the predicted label to sentiment. The argmax is taken over the
    # first row only, matching the row the percentages above are read from.
    predicted_label = output.logits[0].argmax().item()
    sentiment_mapping = {0: "negative", 1: "positive"}
    sentiment = sentiment_mapping[predicted_label]

    return sentiment, positive_percentage, negative_percentage

# Report the sentiment and both class percentages for every non-blank sentence
for sentence in sentences:
    if sentence.strip():
        sentiment, positive_percentage, negative_percentage = get_sentiment_and_percentages(sentence)
        print(f"Sentence: {sentence}")
        print(f"Sentiment: {sentiment}")
        print(f"Positive Percentage: {positive_percentage:.2f}%")
        print(f"Negative Percentage: {negative_percentage:.2f}%")
        print("\n")

Sentence: বাংলাদেশ একটি সুন্দর দেশ
Sentiment: positive
Positive Percentage: 95.10%
Negative Percentage: 4.90%


Sentence:  এখানে সুন্দর প্রদেশের দৃশ্য, সহনীয় মানুষ, এবং প্রাচীন ঐতিহ্য রয়েছে
Sentiment: positive
Positive Percentage: 78.12%
Negative Percentage: 21.88%


Sentence:  বাংলাদেশে বর্ষা আসলে সবকিছু সবুজে লিপটে যায়
Sentiment: positive
Positive Percentage: 66.09%
Negative Percentage: 33.91%


Sentence:  এখানে পাট, জুতা, তাঁত, বাংলাদেশের প্রধান কৃষি ও শিল্পক্ষেত্রের প্রধান আয়োর উৎস
Sentiment: positive
Positive Percentage: 83.62%
Negative Percentage: 16.38%


