In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from torch.optim import SGD
import torch.nn as nn

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
pip install git+https://github.com/csebuetnlp/normalizer

In [None]:

# Read the review dataset from the Kaggle input directory into a DataFrame.
DATA_PATH = '/kaggle/input/muril-dataset/cleaned-data2 - Sheet1.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
#data cleaning
def find_substring(input_string):
    """Strip the leading boilerplate from a review text.

    If the text contains the marker "রকমারি" and also the marker "রিভিউঃ",
    everything up to and including the *last* "রিভিউঃ" is removed and the
    remainder is returned. If either marker is missing the text is returned
    unchanged.

    Args:
        input_string: The raw review text (a ``str``).

    Returns:
        The text following the last "রিভিউঃ" marker, or ``input_string``
        itself when the markers are not both present.
    """
    start_word = "রকমারি"
    end_word = "রিভিউঃ"

    # Check the rfind() result *before* adding the marker length: once
    # len(end_word) has been added, a "not found" (-1) can no longer be told
    # apart from a real position, and the text would be truncated by mistake.
    end_marker_pos = input_string.rfind(end_word)
    if start_word not in input_string or end_marker_pos == -1:
        return input_string

    return input_string[end_marker_pos + len(end_word):]

# Run the boilerplate stripper over every entry of the 'summary' column
# and write the cleaned texts back to the same column.
raw_summaries = df['summary']
df['summary'] = raw_summaries.apply(find_substring)

In [None]:
from normalizer import normalize

def normalizer_function(input_text):
  """Normalize one text with the `normalize` routine using fixed settings.

  The settings request NFKC unicode normalization applied after the
  rule-based replacements, and leave punctuation, URLs and emojis
  un-replaced (their replacement arguments are all None).
  """
  settings = dict(
    unicode_norm="NFKC",           # unicode normalization form
    punct_replacement=None,        # None: punctuation is not replaced
    url_replacement=None,          # None: URLs are not replaced
    emoji_replacement=None,        # None: emojis are not replaced
    apply_unicode_norm_last=True,  # unicode normalization runs after the rule-based replacements
  )
  return normalize(input_text, **settings)

# Normalize every review text and store the result back in 'summary'.
normalized_summaries = df['summary'].apply(normalizer_function)
df['summary'] = normalized_summaries

In [None]:
# Split the dataset into training (80%) and validation (20%) sets.
# A fixed random_state keeps the split -- and therefore every reported
# validation score -- identical across kernel restarts.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['summary'],
    df['Label'],
    test_size=0.2,
    random_state=42,
)

# The split keeps the original (shuffled) row index on each Series. Reset it
# so that integer lookups into the labels address rows by position 0..n-1.
train_labels = train_labels.reset_index(drop=True)
val_labels = val_labels.reset_index(drop=True)

In [None]:
# Load the MuRIL tokenizer and a sequence-classification model with 7 output
# labels from the same checkpoint, then place the model on the chosen device.
MODEL_NAME = "google/muril-base-cased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=7)
model = model.to(device)

# Wrap the model in DataParallel when more than one GPU is visible.
gpu_count = torch.cuda.device_count()
if gpu_count > 1:
    print("Let's use", gpu_count, "GPUs!")
    model = torch.nn.DataParallel(model)

# Tokenize the texts (truncation and padding enabled for both splits)
tokenizer_options = dict(truncation=True, padding=True)
train_encodings = tokenizer(train_texts.tolist(), **tokenizer_options)
val_encodings = tokenizer(val_texts.tolist(), **tokenizer_options)

# Create a PyTorch Dataset
class BanglaSummaryDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name (e.g. ``'input_ids'``) to a
            sequence holding one entry per sample.
        labels: Iterable with one label per sample, in the same order as the
            encodings. May be a list, array or pandas Series.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Materialize the labels as a plain list so that ``self.labels[idx]``
        # is always a *positional* lookup. A pandas Series would otherwise be
        # indexed by its index labels, which raises KeyError or returns the
        # wrong row whenever the Series does not carry a fresh 0..n-1 index
        # (e.g. straight out of train_test_split).
        self.labels = list(labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for sample ``idx``, including ``'labels'``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

# Wrap both splits in datasets and batch them; only the training loader
# shuffles, the validation loader keeps a fixed order.
BATCH_SIZE = 16
train_dataset = BanglaSummaryDataset(train_encodings, train_labels)
val_dataset = BanglaSummaryDataset(val_encodings, val_labels)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
from sklearn.metrics import f1_score
import numpy as np

In [None]:
# Set up the optimizer
# NOTE(review): the AdamW exported by `transformers` is deprecated upstream in
# favour of torch.optim.AdamW -- consider switching when the import cell is revised.
optimizer = AdamW(model.parameters(), lr=1e-5)

epochs=50

# Train the model
for epoch in range(epochs):
    # Validation below switches the model to eval mode, so re-enable training
    # mode at the start of every epoch.
    model.train()
    for batch in tqdm(train_loader, desc="Training"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # .mean() is a no-op for a scalar loss and reduces the per-replica
        # losses to a single scalar when the model is wrapped in DataParallel.
        loss = outputs.loss.mean()
        loss.backward()
        optimizer.step()

    # Validate only on epochs 0, 5, 10, ... (reported as "after 1, 6, 11, ... epochs").
    if epoch % 5 != 0:
        continue

    model.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # Only the logits are needed for F1, so no labels are passed to the
            # model (no loss is computed) and the gold labels stay on the CPU.
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits.detach().cpu().numpy()
            y_pred.extend(np.argmax(logits, axis=1))
            y_true.extend(batch['labels'].numpy())

    macro_f1 = f1_score(y_true, y_pred, average='macro')
    print(f"Macro F1 score after {epoch+1} epochs: {macro_f1:.4f}")



In [None]:
from sklearn.metrics import f1_score
import numpy as np

# Accumulate the per-batch logits and gold labels for the validation set
predictions = []
true_labels = []

# Predict
model.eval()
with torch.no_grad():
    for batch in tqdm(val_loader, desc="Predicting"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)

        # Move logits to the CPU; the labels are only needed for scoring, so
        # they are read straight from the batch without a trip to the device.
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = batch['labels'].numpy()

        # Store predictions and true labels
        predictions.append(logits)
        true_labels.append(label_ids)

# Stack the per-batch arrays into one array per quantity
predictions = np.concatenate(predictions, axis=0)
true_labels = np.concatenate(true_labels, axis=0)

# For each sample, pick the class with the highest logit
# (the model was created with num_labels=7, so this is a 7-way decision)
pred_flat = np.argmax(predictions, axis=1)

# Calculate the Macro F1 Score
macro_f1 = f1_score(true_labels, pred_flat, average='macro')

print('Macro F1 Score:', macro_f1)
