In [1]:
!pip install transformers



In [2]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re

In [3]:
# Build the tokenizer and a sequence-classification model from the same base checkpoint
base_checkpoint = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(
    base_checkpoint,
    num_labels=2,  # Adjust the number of labels as needed
)


Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

In [4]:
# Read the training reviews and preview the first and last rows
df = pd.read_csv("book_review_train.csv")
print(df.head(), df.tail(), sep="\n")

# Pull the review text and the sentiment label out as plain Python lists
texts, labels = df["Data"].tolist(), df["Sentiment"].tolist()

                                                Data  Sentiment
0                                   ভালোই কিনতু বইটা          1
1  পড়লাম বইটা মাঝখানে কিছুটা বোরিং মনে হয়েছে অনুব...          0
2  বোরিং বোরিং বোরং ছোট গলপটির বইটি নেওয়া উচিত ছ...          0
3                                 বইটি চমৎকার লেগেছে          1
4    অসাধারণ বই লজিকাল সকিল ডেভেলপমেনট জনযে অতুলনীয়          1
                                                  Data  Sentiment
995                                     এককথায় অসাধারণ          1
996  হুমায়ুন আহমেদ সযার উপনযাসটি এক অতুলনীয় সৃষটি...          1
997  বইটিতে সতযিই হতাশ হয়েছি মনে হয়েছিল জীবনের কয...          0
998  কেবল একগুচছ করেস অনাহত সমৃতি অরথহীন নিমন সতরের...          0
999                                            ভালো বই          1


In [6]:
# Encode all training texts in one call, with padding and truncation enabled,
# sequences capped at max_length=128, and PyTorch tensors ("pt") as the return type
tokenized_texts = tokenizer(
    texts,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt",
)

In [7]:
# Define a custom dataset class
class CustomDataset(Dataset):
    """Map-style dataset pairing pre-tokenized inputs with their labels.

    Args:
        tokenized_texts: Mapping indexable by ``"input_ids"`` and
            ``"attention_mask"``; each entry must support ``len()`` and
            integer indexing.
        labels: Sequence of labels, one per tokenized example.

    Raises:
        ValueError: If the number of labels differs from the number of
            tokenized examples.
    """

    def __init__(self, tokenized_texts, labels):
        n_examples = len(tokenized_texts["input_ids"])
        n_labels = len(labels)
        # Fail fast: a mismatch would otherwise only show up as an IndexError
        # part-way through iteration (too few labels) or as silently unused
        # labels (too many).
        if n_examples != n_labels:
            raise ValueError(
                f"Got {n_examples} tokenized examples but {n_labels} labels"
            )
        self.tokenized_texts = tokenized_texts
        self.labels = labels

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.tokenized_texts["input_ids"])

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``labels`` entries for ``idx``."""
        return {
            "input_ids": self.tokenized_texts["input_ids"][idx],
            "attention_mask": self.tokenized_texts["attention_mask"][idx],
            "labels": self.labels[idx],
        }

# Wrap the encoded training texts and their labels in the dataset class
train_dataset = CustomDataset(tokenized_texts=tokenized_texts, labels=labels)



In [8]:
# Define training arguments
# No evaluation strategy is requested: the Trainer below receives no eval_dataset,
# so a periodic evaluation step would have nothing to run on (the recorded run
# shows an empty "Validation Loss" column for the same reason).
training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    output_dir="./sentiment_model",
    num_train_epochs=3,
    save_steps=100,
    save_total_limit=4,
)

# Create a DataLoader for training
# NOTE: this loader is not handed to the Trainer, which is given train_dataset directly.
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Initialize Trainer for fine-tuning
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

trainer.train()

# Persist the final fine-tuned model into output_dir
trainer.save_model()


***** Running training *****
  Num examples = 1000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 375
  Number of trainable parameters = 278045186


Step,Training Loss,Validation Loss


Saving model checkpoint to ./sentiment_model\checkpoint-100
Configuration saved in ./sentiment_model\checkpoint-100\config.json
Model weights saved in ./sentiment_model\checkpoint-100\pytorch_model.bin
Saving model checkpoint to ./sentiment_model\checkpoint-200
Configuration saved in ./sentiment_model\checkpoint-200\config.json
Model weights saved in ./sentiment_model\checkpoint-200\pytorch_model.bin
Saving model checkpoint to ./sentiment_model\checkpoint-300
Configuration saved in ./sentiment_model\checkpoint-300\config.json
Model weights saved in ./sentiment_model\checkpoint-300\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./sentiment_model
Configuration saved in ./sentiment_model\config.json
Model weights saved in ./sentiment_model\pytorch_model.bin


In [12]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Load the model
# Read from the same relative directory the training cell wrote to
# (output_dir="./sentiment_model") instead of a machine-specific absolute path.
model = AutoModelForSequenceClassification.from_pretrained("./sentiment_model")

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at C:\Users\akash/.cache\huggingface\hub\models--xlm-roberta-base\snapshots\77de1f7a7e5e737aead1cd880979d4f1b3af6668\config.json
Model config XLMRobertaConfig {
  "_name_or_path": "xlm-roberta-base",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}

loading file sente

In [13]:
# Load your test CSV file
df_test = pd.read_csv("book_review_test.csv")  # Replace with the path to your test dataset

# Pull the review text and sentiment label columns out as plain Python lists
test_texts, test_labels = (df_test[column].tolist() for column in ("Data", "Sentiment"))

# Encode the test texts with the same tokenizer settings used for the training texts
tokenized_test_texts = tokenizer(
    test_texts,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt",
)

# Define a custom dataset class for test data
class CustomTestDataset(Dataset):
    """Map-style dataset pairing pre-tokenized test inputs with their labels.

    Args:
        tokenized_texts: Mapping indexable by ``"input_ids"`` and
            ``"attention_mask"``; each entry must support ``len()`` and
            integer indexing.
        labels: Sequence of labels, one per tokenized example.

    Raises:
        ValueError: If the number of labels differs from the number of
            tokenized examples.
    """

    def __init__(self, tokenized_texts, labels):
        n_examples = len(tokenized_texts["input_ids"])
        n_labels = len(labels)
        # Fail fast: a mismatch would otherwise only show up as an IndexError
        # part-way through evaluation (too few labels) or as silently unused
        # labels (too many).
        if n_examples != n_labels:
            raise ValueError(
                f"Got {n_examples} tokenized examples but {n_labels} labels"
            )
        self.tokenized_texts = tokenized_texts
        self.labels = labels

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.tokenized_texts["input_ids"])

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``labels`` entries for ``idx``."""
        return {
            "input_ids": self.tokenized_texts["input_ids"][idx],
            "attention_mask": self.tokenized_texts["attention_mask"][idx],
            "labels": self.labels[idx],
        }

test_dataset = CustomTestDataset(tokenized_test_texts, test_labels)

# Create a DataLoader for test data
test_dataloader = DataLoader(test_dataset, batch_size=8)

# Evaluate the model on the test dataset
model.eval()
true_labels = []
predicted_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        # Deliberately not named `labels`: that notebook-level name holds the
        # training label list, and overwriting it here would break a re-run of
        # the cell that builds the training dataset.
        batch_labels = batch['labels']

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Convert logits to predicted labels
        predicted = torch.argmax(logits, dim=1)
        predicted_labels.extend(predicted.cpu().numpy())
        true_labels.extend(batch_labels.cpu().numpy())

# Calculate evaluation metrics
accuracy = accuracy_score(true_labels, predicted_labels)
classification_rep = classification_report(true_labels, predicted_labels)
confusion_mtx = confusion_matrix(true_labels, predicted_labels)

# Print the evaluation results
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_rep)
print("Confusion Matrix:\n", confusion_mtx)


Accuracy: 0.7976190476190477
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.74      0.80        91
           1       0.74      0.87      0.80        77

    accuracy                           0.80       168
   macro avg       0.80      0.80      0.80       168
weighted avg       0.81      0.80      0.80       168

Confusion Matrix:
 [[67 24]
 [10 67]]


In [14]:
# Input paragraph
input_paragraph = (
    "বাংলাদেশ একটি সুন্দর দেশ। এখানে সুন্দর প্রদেশের দৃশ্য, সহনীয় মানুষ, এবং প্রাচীন ঐতিহ্য রয়েছে। বাংলাদেশে বর্ষা আসলে সবকিছু সবুজে লিপটে যায়। এখানে পাট, জুতা, তাঁত, বাংলাদেশের প্রধান কৃষি ও শিল্পক্ষেত্রের প্রধান আয়োর উৎস।"
)

# Break the paragraph apart at every "।" or "?"; the delimiter itself is dropped
sentence_boundary = re.compile(r'[।?]')
sentences = sentence_boundary.split(input_paragraph)

# Define a function to get sentiment, positive percentage, and negative percentage for a sentence
def get_sentiment_and_percentages(input_text):
    """Classify a piece of text and report its class probabilities as percentages.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        input_text: Text to classify. Only the first row of the model output
            is reported.

    Returns:
        A tuple ``(sentiment, positive_percentage, negative_percentage)``:
        ``sentiment`` is ``"negative"`` or ``"positive"``, and the two
        percentages are the softmax probabilities of class 1 and class 0
        multiplied by 100.
    """
    tokenized_input = tokenizer(input_text, padding=True, truncation=True, max_length=128, return_tensors="pt")

    # Inference only, so run the forward pass without autograd tracking
    with torch.no_grad():
        output = model(**tokenized_input, return_dict=True)

    # Get the probability distribution over classes
    probabilities = torch.softmax(output.logits, dim=1)
    positive_percentage = probabilities[0, 1].item() * 100  # Percentage for positive class
    negative_percentage = probabilities[0, 0].item() * 100  # Percentage for negative class

    # Take the argmax along the class axis of row 0, the same row the percentages
    # are read from. A flat argmax would return an index outside the mapping
    # whenever more than one row is present.
    predicted_label = output.logits.argmax(dim=1)[0].item()
    sentiment_mapping = {0: "negative", 1: "positive"}
    sentiment = sentiment_mapping[predicted_label]

    return sentiment, positive_percentage, negative_percentage

# Classify each sentence and print its label together with both class percentages
for sentence in sentences:
    # Only sentences with non-whitespace content are analyzed
    if sentence.strip():
        sentiment, positive_percentage, negative_percentage = get_sentiment_and_percentages(sentence)
        print(
            f"Sentence: {sentence}",
            f"Sentiment: {sentiment}",
            f"Positive Percentage: {positive_percentage:.2f}%",
            f"Negative Percentage: {negative_percentage:.2f}%",
            "\n",
            sep="\n",
        )

Sentence: বাংলাদেশ একটি সুন্দর দেশ
Sentiment: positive
Positive Percentage: 90.79%
Negative Percentage: 9.21%


Sentence:  এখানে সুন্দর প্রদেশের দৃশ্য, সহনীয় মানুষ, এবং প্রাচীন ঐতিহ্য রয়েছে
Sentiment: positive
Positive Percentage: 80.48%
Negative Percentage: 19.52%


Sentence:  বাংলাদেশে বর্ষা আসলে সবকিছু সবুজে লিপটে যায়
Sentiment: negative
Positive Percentage: 48.85%
Negative Percentage: 51.15%


Sentence:  এখানে পাট, জুতা, তাঁত, বাংলাদেশের প্রধান কৃষি ও শিল্পক্ষেত্রের প্রধান আয়োর উৎস
Sentiment: positive
Positive Percentage: 77.44%
Negative Percentage: 22.56%


