In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)

In [2]:
CSV_PATH = "nepali_news_part_0002.csv"
TEXT_COL = "news_context"
LABEL_COL = "label"

df = pd.read_csv(CSV_PATH, engine="python", on_bad_lines="skip")
df = df[[TEXT_COL, LABEL_COL]].dropna()
df[LABEL_COL] = df[LABEL_COL].astype(int)

print(df.shape)
print(df[LABEL_COL].value_counts())
df.head(2)

(8229, 2)
label
0    4227
1    4002
Name: count, dtype: int64


Unnamed: 0,news_context,label
0,संसदमा आज नयाँ संविधान संशोधनको प्रस्ताव स्वीक...,0
1,नेपाल बैंकले आज ब्याज दर ४.५% मा राख्ने निर्णय...,0


In [3]:
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df[LABEL_COL]
)

train_ds = Dataset.from_pandas(train_df.reset_index(drop=True))
test_ds  = Dataset.from_pandas(test_df.reset_index(drop=True))

In [4]:
MODEL_NAME = "xlm-roberta-base"   # best for Nepali
MAX_LENGTH = 256                  # keep 256 first (faster)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_batch(batch):
    return tokenizer(batch[TEXT_COL], truncation=True, max_length=MAX_LENGTH)

train_ds = train_ds.map(tokenize_batch, batched=True)
test_ds  = test_ds.map(tokenize_batch, batched=True)

train_ds = train_ds.rename_column(LABEL_COL, "labels")
test_ds  = test_ds.rename_column(LABEL_COL, "labels")

cols = ["input_ids", "attention_mask", "labels"]
train_ds.set_format(type="torch", columns=cols)
test_ds.set_format(type="torch", columns=cols)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/6583 [00:00<?, ? examples/s]

Map:   0%|          | 0/1646 [00:00<?, ? examples/s]

In [5]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro")
    }

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mXLMRobertaForSequenceClassification LOAD REPORT[0m from: xlm-roberta-base
Key                         | Status     | 
----------------------------+------------+-
lm_head.layer_norm.weight   | UNEXPECTED | 
roberta.pooler.dense.weight | UNEXPECTED | 
lm_head.layer_norm.bias     | UNEXPECTED | 
lm_head.dense.weight        | UNEXPECTED | 
lm_head.dense.bias          | UNEXPECTED | 
lm_head.bias                | UNEXPECTED | 
roberta.pooler.dense.bias   | UNEXPECTED | 
classifier.dense.bias       | MISSING    | 
classifier.out_proj.weight  | MISSING    | 
classifier.out_proj.bias    | MISSING    | 
classifier.dense.weight     | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


In [9]:
OUT_DIR = "factify_xlmr_model"
# training_args = TrainingArguments(
#     output_dir=OUT_DIR,
#     eval_strategy="epoch",
#     save_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=4,   # smaller for CPU
#     per_device_eval_batch_size=8,
#     num_train_epochs=2,              # start with 2 to test
#     weight_decay=0.01,
#     logging_steps=20,
#     load_best_model_at_end=True,
#     metric_for_best_model="f1_macro",
#     report_to="none"
# )
training_args = TrainingArguments(
    output_dir="factify_xlmr_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    fp16=True,   # only works if CUDA is available
    report_to="none"
)

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.361251,0.104688,0.979951,0.979916
2,0.106112,0.104366,0.979344,0.979316


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

TrainOutput(global_step=1646, training_loss=0.19547354233511163, metrics={'train_runtime': 4334.3546, 'train_samples_per_second': 3.038, 'train_steps_per_second': 0.38, 'total_flos': 346807093181040.0, 'train_loss': 0.19547354233511163, 'epoch': 2.0})

In [12]:
preds = trainer.predict(test_ds)
y_true = preds.label_ids
y_pred = np.argmax(preds.predictions, axis=1)

print(classification_report(y_true, y_pred, digits=4))

trainer.save_model(OUT_DIR)
tokenizer.save_pretrained(OUT_DIR)
print("✅ Saved to:", OUT_DIR)

              precision    recall  f1-score   support

           0     0.9721    0.9882    0.9801       846
           1     0.9873    0.9700    0.9786       800

    accuracy                         0.9793      1646
   macro avg     0.9797    0.9791    0.9793      1646
weighted avg     0.9795    0.9793    0.9793      1646



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Saved to: factify_xlmr_model


In [8]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("Torch version:", torch.__version__)
print("CUDA version seen by torch:", torch.version.cuda)
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))

CUDA available: True
Torch version: 2.5.1+cu121
CUDA version seen by torch: 12.1
GPU: NVIDIA GeForce RTX 3050 Laptop GPU


In [17]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

OUT_DIR = "factify_xlmr_model"

tok = AutoTokenizer.from_pretrained(OUT_DIR)
mdl = AutoModelForSequenceClassification.from_pretrained(OUT_DIR)
mdl.eval()

id2label = {0: "LABEL_0", 1: "LABEL_1"}  # rename to REAL/FAKE if you want

text = "नेपालका सबै होटलहरू २४ घण्टा खुला हुने घोषणा ग..."

inputs = tok(text, return_tensors="pt", truncation=True, max_length=128)

with torch.no_grad():
    out = mdl(**inputs)
    probs = torch.softmax(out.logits, dim=1).squeeze().cpu().numpy()
    pred = int(probs.argmax())

print("Prediction:", id2label[pred])
print("Confidence:", float(probs[pred]))
print("Probs:", {id2label[i]: float(p) for i, p in enumerate(probs)})

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

Prediction: LABEL_1
Confidence: 0.9997389912605286
Probs: {'LABEL_0': 0.0002610130759421736, 'LABEL_1': 0.9997389912605286}


In [16]:
import pandas as pd
import numpy as np
import torch

sample_df = df.sample(10, random_state=20).reset_index(drop=True)

def predict_texts(texts):
    enc = tok(list(texts), return_tensors="pt", padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        out = mdl(**enc)
        probs = torch.softmax(out.logits, dim=1).cpu().numpy()
        preds = probs.argmax(axis=1)
        confs = probs.max(axis=1)
    return preds, confs

pred_ids, confs = predict_texts(sample_df["news_context"])

sample_df["pred"] = pred_ids
sample_df["conf"] = confs
sample_df["pred_label"] = sample_df["pred"].map(id2label)
sample_df["true_label"] = sample_df["label"].map(id2label)

sample_df[["news_context", "true_label", "pred_label", "conf"]]

Unnamed: 0,news_context,true_label,pred_label,conf
0,कृषि मन्त्रालयले जैविक खेतीको लागि अनुदान घोषण...,LABEL_0,LABEL_0,0.999126
1,शीर्षक: निपालको नयाँ पर्यटन अभियान 'सुरु'\nकाठ...,LABEL_0,LABEL_0,0.999088
2,नेपाल पर्यटन बोर्डले २०२४ मा विदेशी पर्यटक संख...,LABEL_1,LABEL_1,0.999501
3,नेपाल पर्यटन बोर्डले नयाँ पर्यटन अभियान घोषणा ...,LABEL_0,LABEL_0,0.999169
4,नेपाल टेक्नोलोजी फोरमले २०२४ मा ५G नेटवर्कको प...,LABEL_0,LABEL_0,0.999202
5,नेपालको पर्यटन विभागले घोषणा गर्यो: २०२५ सम्मम...,LABEL_0,LABEL_0,0.998833
6,निपालमा सबै बैंकहरूमा अनधिकृत पहुँचको रिपोर्ट ...,LABEL_1,LABEL_1,0.999762
7,काठमाडौं – नयाँ एआई स्टार्टअप 'स्मार्टफ्यूचर' ...,LABEL_0,LABEL_0,0.999015
8,काठमाडौँ – पर्यटन मन्त्रालयले आजै घोषणा गर्‍यो...,LABEL_0,LABEL_0,0.999089
9,नेपालका सबै होटलहरू २४ घण्टा खुला हुने घोषणा ग...,LABEL_1,LABEL_1,0.999764
