In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [2]:
model_name = 'csebuetnlp/banglabert'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)

# Define device (CPU or GPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


Downloading (…)okenizer_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/586 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/528k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



Downloading pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at csebuetnlp/banglabert and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
import pandas as pd
train_url = '/kaggle/input/bec-dataset/train_data.csv'
test_url = '/kaggle/input/bec-dataset/test_data.csv'
df_train = pd.read_csv(train_url)
df_test = pd.read_csv(test_url)
stop_words_df = pd.read_excel('/kaggle/input/bangla-stopwords/stopwords_bangla.xlsx',index_col=False)

In [4]:
STOPWORDS = set([word.strip() for word in stop_words_df['words']])

In [5]:
import re
def preprocess(x):
    html_pattern = re.compile('<.*?>')
    x = html_pattern.sub(r'', x)
    x = " ".join([word for word in str(x).split() if word not in STOPWORDS])
    return x
df_train['Comment'] = df_train['Comment'].apply(lambda x: preprocess(x))
df_test['Comment'] = df_test['Comment'].apply(lambda x:preprocess(x))

In [6]:
import numpy as np
allcats = set(df_train['Category'].dropna().tolist())
allcats

{'Code Switching', 'Grammatical', 'Multiple Errors', 'Spelling'}

In [7]:
allcats.add('Correct')
labeldict = {}
counter = 0
for i in allcats:
    labeldict[i] = counter
    counter += 1
labeldict

{'Grammatical': 0,
 'Spelling': 1,
 'Correct': 2,
 'Multiple Errors': 3,
 'Code Switching': 4}

In [8]:
def manage(x):
    if x in labeldict:
        return labeldict[x]
    else:
        return labeldict['Correct']
df_train['Category'] = df_train['Category'].apply(lambda x:manage(x))
df_test['Category'] = df_test['Category'].apply(lambda x:manage(x))

In [9]:
df_train

Unnamed: 0,Video Title,Genre,Comment,Error,Category,Correct Form
0,"ওবায়দুল কাদের বললেন, ‘খেলা হবে’; আর রুমিন ফারহ...",Politics,কাদের খেলব কাদের খেলতেই না,1,0,কাদের কি খেলবে কাদের তো খেলতেই পারে না
1,পুলিশের গাড়ির ওপর চড়াও বিএনপির কর্মীরা | BN...,Politics,এসব আরো ঠাসা,1,1,এসব করে আরো কোণঠাসা হবে
2,Ayub Bachchu | Ek Akash Tara | আইয়ুব বাচ্চু |...,Entertainment,যুগ যুগ গেথে গান,0,2,যুগ যুগ ধরে আমাদের মনে গেথে থাকবে এ গান
3,যে প্রেম কাহিনী কোন বাধা মানেনি | BBC Bangla,Miscellaneous,অাচছা অাপু এলাজী থাকলে টিকা জাবেনা,1,1,আচ্ছা আপু এলার্জী থাকলে টিকা নেওয়া যাবেনা
4,তুরস্কের চেয়ে ভয়াবহ ভূমিকম্পের ঝুঁকিতে বাংলাদে...,News,হে আল্লাহ জালিমদের সন্তান সন্তদের হেফাজত,0,2,হে আল্লাহ এই জালিমদের থেকে আমাদের সন্তান সন্তদ...
...,...,...,...,...,...,...
8027,Shitom Ahmed - Chorabali (Lyrics) || কেন লাগে ...,Entertainment,সত্যি শুন্য লাগে,0,2,সত্যি তাকে ছাড়া খুব শুন্য লাগে
8028,হৃদয় ছুঁয়ে যাওয়া ৭টি সেরা ইমোশনাল বিজ্ঞাপন ...,Entertainment,বিজ্ঞাপন গুলো চোখে পানি আসলো ভাই,0,2,বিজ্ঞাপন গুলো দেখে চোখে পানি চলে আসলো ভাই
8029,আইপিএলের নিলাম তালিকায় পাঁচ বাংলাদেশি | IPL | ...,Sports,টাকা সবদিক,1,1,একবার যখন টাকা হয় তখন সব দিক দিয়ে আসে
8030,মাহমুদুল্লাহর সেরা ১০টি ইনিংস || 10 Greatest I...,Sports,অসাধারণ ইউটিউব ভিডিও সাথে চমৎকার ব্যাকগ্রাউন্ড,0,2,আমার দেখা অসাধারণ ইউটিউব ভিডিও তার সাথে চমৎকার...


In [10]:
data_no = 5

# Prepare the training data
train_texts = df_train['Comment'].tolist()
train_labels = df_train['Category'].tolist()

test_texts = df_test['Comment'].tolist()
test_labels = df_test['Category'].tolist()

In [11]:
# Tokenize and encode the training texts
train_encodings = tokenizer(train_texts, truncation=True, max_length=128,
        padding='max_length', return_tensors = 'pt')

# Convert the labels to tensors
train_labels = torch.tensor(train_labels)

# Create a PyTorch dataset
train_dataset = torch.utils.data.TensorDataset(train_encodings['input_ids'],
                                               train_encodings['attention_mask'],
                                               train_labels)

# Create a data loader
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)

model = model.to(device)

In [12]:
from tqdm.notebook import tqdm
import torch
import torch.optim as optim
import torch.nn as nn
from sklearn.metrics import accuracy_score

# Set the model to training mode
model.train()

# Define the optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

losses = []
accuracies = []  # To store accuracy per epoch
num_epochs = 5
# Training loop
for epoch in tqdm(range(num_epochs)):  # Number of training epochs
    running_loss = 0.0
    predicted_labels = []  # To store predicted labels for accuracy calculation
    true_labels = []  # To store true labels for accuracy calculation

    for batch in tqdm(train_loader):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Convert logits to predicted labels
        _, predicted = torch.max(logits, dim=1)
        predicted_labels.extend(predicted.cpu().tolist())
        true_labels.extend(labels.cpu().tolist())

    epoch_loss = running_loss / len(train_loader)
    losses.append(epoch_loss)

    # Calculate and store accuracy
    accuracy = accuracy_score(true_labels, predicted_labels)
    accuracies.append(accuracy)

    print(f'Epoch {epoch + 1}/{num_epochs} - Loss: {epoch_loss:.4f} - Accuracy: {accuracy:.4f}')

# Save the model
torch.save(model.state_dict(), 'model.pth')


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/502 [00:00<?, ?it/s]

Epoch 1/5 - Loss: 0.9960 - Accuracy: 0.6662


  0%|          | 0/502 [00:00<?, ?it/s]

Epoch 2/5 - Loss: 0.7520 - Accuracy: 0.7561


  0%|          | 0/502 [00:00<?, ?it/s]

Epoch 3/5 - Loss: 0.6338 - Accuracy: 0.8035


  0%|          | 0/502 [00:00<?, ?it/s]

Epoch 4/5 - Loss: 0.5295 - Accuracy: 0.8391


  0%|          | 0/502 [00:00<?, ?it/s]

Epoch 5/5 - Loss: 0.4349 - Accuracy: 0.8704


In [13]:
#dgfdgdfgdgffdgdfd1212jhkhk

In [14]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

def predict_labels(text):
    train_encodings = tokenizer(text, truncation=True, max_length=128,
        padding='max_length', return_tensors = 'pt')
    input_ids = train_encodings['input_ids'].to(device)
    attention_mask = train_encodings['attention_mask'].to(device)

    # Set the model to evaluation mode
    model.eval()

    # Disable gradient calculation
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=1)
    predicted_class = torch.argmax(probabilities, dim=1)
    

    return predicted_class.item(), probabilities[:,1].item()

In [15]:
predicted_labels = []
predicted_probs = []
for text in tqdm(test_texts):
    predicted_label, prob = predict_labels(text)
    predicted_labels.append(predicted_label)
    predicted_probs.append(prob)

# Calculate accuracy and F1 score
accuracy = accuracy_score(test_labels, predicted_labels)
# f1 = f1_score(test_labels, predicted_labels)
# roc_auc = roc_auc_score(test_labels, predicted_probs)

print('Accuracy:', accuracy)
# print('F1 Score:', f1)
# print('ROC-AUC:', roc_auc)

  0%|          | 0/2010 [00:00<?, ?it/s]

Accuracy: 0.7512437810945274


In [16]:
print('Accuracy:', accuracy)

Accuracy: 0.7512437810945274


In [17]:
from sklearn.metrics import roc_auc_score, classification_report

print('\nThe Classification Report is as follows\n')
print(classification_report(test_labels, predicted_labels, digits = 4))


The Classification Report is as follows

              precision    recall  f1-score   support

           0     0.5000    0.2500    0.3333       128
           1     0.6710    0.7249    0.6969       498
           2     0.8153    0.8583    0.8362      1157
           3     0.4762    0.1449    0.2222        69
           4     0.6746    0.7215    0.6972       158

    accuracy                         0.7512      2010
   macro avg     0.6274    0.5399    0.5572      2010
weighted avg     0.7367    0.7512    0.7377      2010

