In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
file_path = '/content/drive/MyDrive/Data Mining (CSE-4891)/Project files/dataset.csv'
import pandas as pd

df = pd.read_csv(file_path, delimiter=',', encoding='utf-8')

In [5]:
# Check unique classes and their counts and print the distribution
class_distribution = df['label'].value_counts()

print("Class Distribution:")
print(class_distribution)

Class Distribution:
label
Personal          4225
Geopolitical      3761
Religious         1714
Political         1596
Neutral            835
Gender abusive     316
Name: count, dtype: int64


In [6]:
df = df[df['label'] != 'Gender abusive']

# Check the new class distribution
class_distribution_after_drop = df['label'].value_counts()

# Print the updated distribution
print("Updated Class Distribution:")
print(class_distribution_after_drop)

Updated Class Distribution:
label
Personal        4225
Geopolitical    3761
Religious       1714
Political       1596
Neutral          835
Name: count, dtype: int64


In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.7 MB/s[0m eta [36m0:00:0

In [7]:
import re
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import classification_report, accuracy_score
import datasets
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Clean text data
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    text = text.lower()  # Convert to lowercase
    return text

df['banglish_clean'] = df['banglish'].apply(clean_text)

# Label Encoding labels
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])

# Splitting the data into train and test
X = df['banglish_clean']
y = df['label_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create Hugging Face Dataset
train_dataset = datasets.Dataset.from_dict({'text': X_train, 'label': y_train})
test_dataset = datasets.Dataset.from_dict({'text': X_test, 'label': y_test})

# Load Bangla BERT Tokenizer and Model
tokenizer = AutoTokenizer.from_pretrained('sagorsarker/bangla-bert-base')
bert_model = AutoModel.from_pretrained('sagorsarker/bangla-bert-base')

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Convert tokenized dataset to numpy arrays
X_train_tokens = np.array(train_dataset['input_ids'])
X_test_tokens = np.array(test_dataset['input_ids'])
y_train = np.array(y_train)
y_test = np.array(y_test)

# Build LSTM model
embedding_matrix = bert_model.embeddings.word_embeddings.weight.detach().numpy()
vocab_size, embedding_dim = embedding_matrix.shape

lstm_model = Sequential()
lstm_model.add(Input(shape=(128,)))
lstm_model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=128, trainable=False))
lstm_model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
lstm_model.add(Dense(64, activation='relu'))
lstm_model.add(Dropout(0.5))
lstm_model.add(Dense(len(label_encoder.classes_), activation='softmax'))

lstm_model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=2e-5), metrics=['accuracy'])

# Train LSTM model
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
lstm_model.fit(X_train_tokens, y_train, batch_size=32, epochs=10, validation_data=(X_test_tokens, y_test), callbacks=[early_stopping])

# Evaluate LSTM model
lstm_preds = np.argmax(lstm_model.predict(X_test_tokens), axis=-1)
lstm_accuracy = accuracy_score(y_test, lstm_preds)
print("\nBangla BERT + LSTM Model Accuracy:", lstm_accuracy)
print("\nBangla BERT + LSTM Classification Report:\n", classification_report(y_test, lstm_preds, target_names=label_encoder.classes_))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/2.24M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/660M [00:00<?, ?B/s]

Map:   0%|          | 0/9704 [00:00<?, ? examples/s]

Map:   0%|          | 0/2427 [00:00<?, ? examples/s]



Epoch 1/10
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 378ms/step - accuracy: 0.2939 - loss: 1.5677 - val_accuracy: 0.3494 - val_loss: 1.4587
Epoch 2/10
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 394ms/step - accuracy: 0.3255 - loss: 1.4883 - val_accuracy: 0.3494 - val_loss: 1.4566
Epoch 3/10
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 372ms/step - accuracy: 0.3379 - loss: 1.4817 - val_accuracy: 0.3494 - val_loss: 1.4553
Epoch 4/10
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 369ms/step - accuracy: 0.3383 - loss: 1.4802 - val_accuracy: 0.3494 - val_loss: 1.4556
Epoch 5/10
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 372ms/step - accuracy: 0.3400 - loss: 1.4752 - val_accuracy: 0.3494 - val_loss: 1.4549
Epoch 6/10
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 403ms/step - accuracy: 0.3314 - loss: 1.4807 - val_accuracy: 0.3494 - val_loss: 1.4520
Epoc

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
