In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

In [2]:
import pandas as pd
import numpy as np
import random
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
import pickle

2025-05-10 05:35:24.325005: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746855324.476194      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746855324.519129      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
parameters = {
    'NEWS_SEQUENCE_LENGTH': 256,
    
    'TRAIN_BATCH_SIZE': 32,
    'EVAL_BATCH_SIZE': 32,
    'EPOCHS': 6,
    'LEARNING_RATE': 5e-5,
    'WARMUP_STEPS': 1_000,
    'GRADIENT_ACCUMULATION_STEPS': 4,
    'L2_REG': 0.01,
    # 'MAX_STEPS': 3664 # dataset_size / (BATCH*GRADIENT_ACCUMULATION_STEPS) = 1 epoch = 1221.375
}

In [4]:
df = pd.read_csv('/kaggle/input/nepali-news-classification-set/full_cleaned.csv')

In [5]:
df.head(3)

Unnamed: 0,title,news_stopwords,news_no_stopwords,category
0,निखिल उप्रेतीको भैरव फिल्मले अमेरिकामा रहेका न...,अमेरिकाका नेपालीको प्रतिक्रिया लिने इच्छा व्यक...,अमेरिकाका नेपालीको प्रतिक्रिया लिने इच्छा व्यक...,मनोरञ्जन
1,सुशील कोइरालाको निधनपछि चौरासी बाले खोले यस्ता...,झण्डै बर्षअघि सुशील कोइरालाले प्रधानमन्त्रीको ...,झण्डै बर्षअघि सुशील कोइरालाले प्रधानमन्त्रीको ...,मनोरञ्जन
2,लिटल प्रिन्स एण्ड प्रिन्सेसको ग्रान्ड फिनाले,ग्ल्यामरस नेपालले सुरुङ्गामा लिटल प्रिन्स एन्ड...,ग्ल्यामरस नेपालले सुरुङ्गामा लिटल प्रिन्स एन्ड...,मनोरञ्जन


Apply same preprocessing as in the LSTM Network.

In [6]:
df.drop(columns=["title", "news_no_stopwords"], inplace=True)

In [7]:
df.describe()

Unnamed: 0,news_stopwords,category
count,369800,369800
unique,362052,10
top,थप जनामा कोरोना भाइरस संक्रमण पुष्टि भएको छ पछ...,राजनीति
freq,35,59975


In [8]:
df["length"] = df["news_stopwords"].apply(lambda x: len(x.split()))
df = df[df["length"] >= 30]

In [9]:
df["news_stopwords"] = df["news_stopwords"].apply(lambda x: " ".join(x.split()[:parameters["NEWS_SEQUENCE_LENGTH"]]))

In [10]:
df = df[df["category"] != "शिक्षा"]
df = df[df["category"] != "देश/प्रदेश"]

In [11]:
le = LabelEncoder()
df['label'] = le.fit_transform(df['category'])

In [12]:
parameters["TOTAL_CATEGORIES"] = df['label'].nunique()

In [13]:
df.head(2)

Unnamed: 0,news_stopwords,category,length,label
0,अमेरिकाका नेपालीको प्रतिक्रिया लिने इच्छा व्यक...,मनोरञ्जन,268,2
1,झण्डै बर्षअघि सुशील कोइरालाले प्रधानमन्त्रीको ...,मनोरञ्जन,406,2


In [14]:
def random_undersampling(data, random_state):
    if len(data) > 25000:
        return data.sample(25000, random_state=random_state)
    return data

In [15]:
df_balanced = pd.DataFrame(columns=['news_stopwords', 'label'])
for i in range(parameters["TOTAL_CATEGORIES"]):
    res = random_undersampling(df[df["label"] == i], i * np.random.randint(100))
    if df_balanced.empty == True:
        df_balanced = res
    else:
        df_balanced = pd.concat([df_balanced, res], ignore_index=True)

In [16]:
df_balanced["category"].value_counts()

category
अर्थ / वाणिज्य       25000
खेलकुद               25000
मनोरञ्जन             25000
राजनीति              25000
विश्व                25000
समाज                 25000
विज्ञान र प्रविधि    23072
स्वास्थ्य            22349
Name: count, dtype: int64

In [17]:
df_balanced["label"].value_counts()

label
0    25000
1    25000
2    25000
3    25000
5    25000
6    25000
4    23072
7    22349
Name: count, dtype: int64

In [18]:
for i in range(20):
    df_balanced = df_balanced.sample(frac=1).reset_index(drop=True)

In [19]:
df_balanced.head(2)

Unnamed: 0,news_stopwords,category,length,label
0,तस्वीर: रासस धनगढीमा जारी प्रतियोगितामा टस हार...,खेलकुद,103,1
1,आँबुखैरेनी स्थित कन्यादेवी माध्यमिक विद्यालयमा...,समाज,277,6


In [20]:
X_train, X_val_test, y_train, y_val_test = train_test_split(df_balanced['news_stopwords'].tolist(), df_balanced['label'].tolist(), test_size=0.20, random_state=21, stratify=df_balanced["label"], shuffle=True)

In [21]:
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=28, stratify=y_val_test, shuffle=True)

In [22]:
len(X_train), len(X_test), len(X_val)

(156336, 19543, 19542)

In [23]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [24]:
class NepaliNewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_len)
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [25]:
train_dataset = NepaliNewsDataset(X_train, y_train, tokenizer)
test_dataset = NepaliNewsDataset(X_test, y_test, tokenizer)
val_dataset = NepaliNewsDataset(X_val, y_val, tokenizer)

### Define the evaluation metrics: Accuracy, Precision, Recall, and F-Score

In [26]:
def compute_metrics(preds):
    logits, labels = preds
    pred = logits.argmax(-1)
    precision, recall, fscore, _ = precision_recall_fscore_support(labels, pred, average='macro', zero_division=0)
    accuracy = accuracy_score(labels, pred)

    return {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F-Score': fscore,
    }

In [27]:
training_args = TrainingArguments(
    output_dir="/kaggle/tmp/", 
    do_train=True,
    do_eval=True,
    overwrite_output_dir=True,
    eval_strategy='steps',
    eval_steps=4_800,
    per_device_train_batch_size=parameters['TRAIN_BATCH_SIZE'],
    per_device_eval_batch_size=parameters['EVAL_BATCH_SIZE'],
    warmup_steps=parameters['WARMUP_STEPS'],
    learning_rate=parameters['LEARNING_RATE'], 
    num_train_epochs=parameters['EPOCHS'],
    # max_steps=parameters['MAX_STEPS'],
    weight_decay=parameters['L2_REG'],  
    save_steps=4_800,
    save_total_limit=1,              
    load_best_model_at_end=True,     
    logging_dir="/kaggle/tmp/",            
    logging_steps=4_800,
    fp16=True,                     
    metric_for_best_model="loss",
    greater_is_better=False,
    report_to="none",
)

In [28]:
parameters['TOTAL_CATEGORIES']

8

In [29]:
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=parameters['TOTAL_CATEGORIES'])

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

In [31]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F-score
4800,0.531,0.415546,0.86169,0.86237,0.863406,0.857159
9600,0.374,0.390756,0.864299,0.868362,0.864424,0.863748
14400,0.3183,0.347734,0.879957,0.878909,0.879724,0.878038
19200,0.269,0.345029,0.885176,0.883303,0.885585,0.883919
24000,0.2174,0.359761,0.889474,0.887584,0.889911,0.888309
28800,0.1648,0.408303,0.8862,0.885128,0.886331,0.885632


TrainOutput(global_step=29316, training_loss=0.3098760629061349, metrics={'train_runtime': 22458.4649, 'train_samples_per_second': 41.767, 'train_steps_per_second': 1.305, 'total_flos': 1.2340783766922854e+17, 'train_loss': 0.3098760629061349, 'epoch': 6.0})

In [32]:
print(trainer.evaluate(eval_dataset=test_dataset))

{'eval_loss': 0.34502872824668884, 'eval_Accuracy': 0.8851762779511846, 'eval_Precision': 0.8833025915716493, 'eval_Recall': 0.885585499905009, 'eval_F-Score': 0.8839186733199047, 'eval_runtime': 142.9273, 'eval_samples_per_second': 136.734, 'eval_steps_per_second': 4.275, 'epoch': 6.0}


In [33]:
def predict(model, tokenizer, text, label_encoder, input_sequence_length=256):
    inputs = tokenizer(text, padding=True, truncation=True, max_length=input_sequence_length, return_tensors='pt').to('cuda')
    outputs = model(**inputs)
    probs = outputs[0].softmax(1)
    pred_label_idx = probs.argmax()
    pred = label_encoder.inverse_transform([pred_label_idx.cpu().numpy()])[0]

    return pred

In [34]:
predict(model, tokenizer, random.choice(X_val[10:200]), le, parameters['NEWS_SEQUENCE_LENGTH'])

'स्वास्थ्य'

In [35]:
trainer.save_model('/kaggle/tmp/BERT_nepali_news_classifier_model')
tokenizer.save_pretrained('/kaggle/tmp/BERT_nepali_news_classifier_tokenizer')

('/kaggle/tmp/BERT_nepali_news_classifier_tokenizer/tokenizer_config.json',
 '/kaggle/tmp/BERT_nepali_news_classifier_tokenizer/special_tokens_map.json',
 '/kaggle/tmp/BERT_nepali_news_classifier_tokenizer/vocab.txt',
 '/kaggle/tmp/BERT_nepali_news_classifier_tokenizer/added_tokens.json')

In [36]:
with open('/kaggle/tmp/label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)

In [37]:
import tarfile
with tarfile.open("/kaggle/working/model_outputs.tar.gz", "w:gz") as tar:
    tar.add("/kaggle/tmp", arcname="model_outputs")