In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
file_path = '/content/drive/MyDrive/Data Mining (CSE-4891)/Project files/dataset.csv'

In [3]:
import pandas as pd

df = pd.read_csv(file_path, delimiter=',', encoding='utf-8')

In [4]:
# Check unique classes and their counts and print the distribution
class_distribution = df['label'].value_counts()

print("Class Distribution:")
print(class_distribution)

Class Distribution:
label
Personal          4225
Geopolitical      3761
Religious         1714
Political         1596
Neutral            835
Gender abusive     316
Name: count, dtype: int64


In [5]:
df = df[df['label'] != 'Gender abusive']

# Check the new class distribution
class_distribution_after_drop = df['label'].value_counts()

# Print the updated distribution
print("Updated Class Distribution:")
print(class_distribution_after_drop)

Updated Class Distribution:
label
Personal        4225
Geopolitical    3761
Religious       1714
Political       1596
Neutral          835
Name: count, dtype: int64


In [8]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.7 MB/s[0m eta [36m0:00:

In [9]:
import re
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report, accuracy_score
from datasets import Dataset

# Clean text data
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    text = text.lower()  # Convert to lowercase
    return text

df['banglish_clean'] = df['banglish'].apply(clean_text)

# Label Encoding labels
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])

# Splitting the data into train and test
X = df['banglish_clean']
y = df['label_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create Hugging Face Dataset
train_dataset = Dataset.from_dict({'text': X_train, 'label': y_train})
test_dataset = Dataset.from_dict({'text': X_test, 'label': y_test})

# Load Bangla BERT Tokenizer and Model
tokenizer = AutoTokenizer.from_pretrained('sagorsarker/bangla-bert-base')
model = AutoModelForSequenceClassification.from_pretrained('sagorsarker/bangla-bert-base', num_labels=len(label_encoder.classes_))

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Define Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model
trainer.train()

# Evaluate the model
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)
bangla_bert_accuracy = accuracy_score(y_test, preds)
print("\nBangla BERT Model Accuracy:", bangla_bert_accuracy)
print("\nBangla BERT Classification Report:\n", classification_report(y_test, preds, target_names=label_encoder.classes_))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/2.24M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/660M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sagorsarker/bangla-bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/9704 [00:00<?, ? examples/s]

Map:   0%|          | 0/2427 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,1.5153,1.427005
2,1.5335,1.42349
3,1.3602,1.412168



Bangla BERT Model Accuracy: 0.40832303255047386

Bangla BERT Classification Report:
               precision    recall  f1-score   support

Geopolitical       0.46      0.36      0.40       776
     Neutral       0.00      0.00      0.00       167
    Personal       0.39      0.83      0.53       848
   Political       0.46      0.02      0.04       284
   Religious       0.06      0.00      0.01       352

    accuracy                           0.41      2427
   macro avg       0.28      0.24      0.20      2427
weighted avg       0.35      0.41      0.32      2427



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, accuracy_score

# Clean text data
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    text = text.lower()  # Convert to lowercase
    return text

df['banglish_clean'] = df['banglish'].apply(clean_text)

# Label Encoding labels
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])

# Splitting the data into train and test
X = df['banglish_clean']
y = df['label_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize and pad sequences for LSTM
max_words = 10000
max_len = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# Build LSTM model
lstm_model = Sequential()
lstm_model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
lstm_model.add(LSTM(256, dropout=0.3, recurrent_dropout=0.2))
lstm_model.add(Dense(64, activation='relu'))
lstm_model.add(Dropout(0.5))
lstm_model.add(Dense(len(label_encoder.classes_), activation='softmax'))

lstm_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train LSTM model
lstm_model.fit(X_train_pad, y_train, batch_size=32, epochs=10, validation_data=(X_test_pad, y_test))

# Evaluate LSTM model
lstm_preds = np.argmax(lstm_model.predict(X_test_pad), axis=-1)
lstm_accuracy = accuracy_score(y_test, lstm_preds)
print("\nLSTM Model Accuracy:", lstm_accuracy)
print("\nLSTM Classification Report:\n", classification_report(y_test, lstm_preds, target_names=label_encoder.classes_))

# Naive Bayes Classifier
vectorizer = CountVectorizer(max_features=max_words)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

nb_model = MultinomialNB()
nb_model.fit(X_train_vec, y_train)

# Evaluate Naive Bayes model
nb_preds = nb_model.predict(X_test_vec)
nb_accuracy = accuracy_score(y_test, nb_preds)
print("\nNaive Bayes Model Accuracy:", nb_accuracy)
print("\nNaive Bayes Classification Report:\n", classification_report(y_test, nb_preds, target_names=label_encoder.classes_))



Epoch 1/10
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 191ms/step - accuracy: 0.4410 - loss: 1.4074 - val_accuracy: 0.7033 - val_loss: 0.8758
Epoch 2/10
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 190ms/step - accuracy: 0.7750 - loss: 0.6743 - val_accuracy: 0.7849 - val_loss: 0.6478
Epoch 3/10
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 184ms/step - accuracy: 0.8905 - loss: 0.3592 - val_accuracy: 0.8101 - val_loss: 0.6201
Epoch 4/10
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 185ms/step - accuracy: 0.9378 - loss: 0.2103 - val_accuracy: 0.8146 - val_loss: 0.6340
Epoch 5/10
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 184ms/step - accuracy: 0.9596 - loss: 0.1533 - val_accuracy: 0.8278 - val_loss: 0.7139
Epoch 6/10
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 189ms/step - accuracy: 0.9715 - loss: 0.1088 - val_accuracy: 0.8249 - val_loss: 0.7467
Epoch 7/10