# Nepali News Classification
## Using Kaggle Dataset with ML & BERT Models

**BUS 405: Foundations of Big Data Analytics**

---

### Dataset: Nepali News Dataset from Kaggle
- **Source**: https://www.kaggle.com/datasets/lotusacharya/nepalinewsdataset
- **Categories**: 10 news categories
- **Size**: ~10,000 articles (1,000 per category)

## 1. Setup

In [None]:
# Install required packages
!pip install -q transformers torch scikit-learn pandas matplotlib seaborn kagglehub

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import glob
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.preprocessing import LabelEncoder

import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm

# Fix the NumPy and PyTorch seeds so repeated runs draw the same random numbers.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use CUDA when PyTorch reports it as available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

## 2. Download Dataset from Kaggle

### Option A: Using kagglehub (Easiest - Recommended)

In [None]:
# METHOD 1: Using kagglehub (Recommended)
import kagglehub

# Download the dataset; the call returns the path of the downloaded folder.
dataset_path = kagglehub.dataset_download("lotusacharya/nepalinewsdataset")

print(f"Dataset downloaded to: {dataset_path}")

# List the top-level entries of the downloaded folder (sorted so the listing
# is stable across runs and platforms).
print("\nFiles in dataset folder:")
for item in sorted(os.listdir(dataset_path)):
    item_path = os.path.join(dataset_path, item)
    if os.path.isfile(item_path):
        size = os.path.getsize(item_path) / 1024  # bytes -> KB
        print(f"  📄 {item} ({size:.1f} KB)")
    else:
        print(f"  📁 {item}/")

In [None]:
# METHOD 2: Using Kaggle API (Alternative)
# Uncomment if kagglehub doesn't work

# # First upload your kaggle.json
# from google.colab import files
# print("Upload your kaggle.json:")
# uploaded = files.upload()

# # Setup credentials
# !mkdir -p ~/.kaggle
# !cp kaggle.json ~/.kaggle/
# !chmod 600 ~/.kaggle/kaggle.json

# # Download dataset
# !kaggle datasets download -d lotusacharya/nepalinewsdataset
# !unzip -o nepalinewsdataset.zip -d ./nepalinewsdataset
# dataset_path = "./nepalinewsdataset"

# print(f"Dataset downloaded to: {dataset_path}")

## 3. Load the Dataset from Downloaded Folder

In [None]:
# Find CSV file in the downloaded folder
def find_csv_file(folder_path):
    """
    Locate a CSV file inside a downloaded Kaggle dataset folder.

    The folder is searched recursively for ``*.csv`` files. Every match is
    printed and the first match in sorted path order is returned.

    Args:
        folder_path: Directory to search (string or path-like).

    Returns:
        The path of the selected CSV file, or ``None`` when the folder
        contains no CSV file.
    """
    # Escape the folder part so glob metacharacters that happen to occur in
    # the path itself (e.g. "[" or "*") are matched literally.
    pattern = os.path.join(glob.escape(os.fspath(folder_path)), '**', '*.csv')

    # glob yields matches in arbitrary filesystem order; sorting makes the
    # selected file the same on every run. The recursive pattern also covers
    # files directly inside folder_path, so no separate name lookup is needed.
    csv_files = sorted(glob.glob(pattern, recursive=True))

    if not csv_files:
        return None

    print(f"Found {len(csv_files)} CSV file(s):")
    for f in csv_files:
        print(f"  - {f}")
    return csv_files[0]  # Return the first one

# Find the CSV file
csv_file = find_csv_file(dataset_path)

if csv_file:
    print(f"\n✅ Using CSV file: {csv_file}")
else:
    # Nothing matched: dump every file under the folder to help locate the data.
    print("❌ No CSV file found!")
    print("\nContents of dataset folder:")
    for root, _dirs, files in os.walk(dataset_path):
        for file in files:
            print(f"  {os.path.join(root, file)}")

In [None]:
# Load the CSV file
if not csv_file:
    # Fail here with an actionable message instead of letting read_csv raise
    # an opaque error about an invalid path when no CSV was located.
    raise FileNotFoundError(
        f"No CSV file found under {dataset_path!r}; re-check the download step."
    )

df = pd.read_csv(csv_file)

print("✅ Dataset loaded!")
print(f"   Shape: {df.shape}")
print(f"   Columns: {df.columns.tolist()}")

In [None]:
# Preview the first rows (5 is the default row count of DataFrame.head)
df.head(n=5)

In [None]:
# Standardize column names
print("Original columns:", df.columns.tolist())

# Rename columns to standard names
# The Nepali News Dataset typically has columns like: 'news'/'content' and 'category'
# Aliases are listed in priority order. When several columns qualify, only the
# highest-priority one is renamed, so the frame never ends up with two columns
# named 'text' (or 'category'), which would turn df['text'] into a DataFrame.
text_aliases = ['news', 'content', 'article', 'body', 'text', 'headline']
category_aliases = ['category', 'label', 'class', 'type', 'target']

# Map each normalised header to the first original column that produces it.
# str() keeps this working for non-string headers (e.g. integer column names).
columns_by_key = {}
for col in df.columns:
    columns_by_key.setdefault(str(col).lower().strip(), col)

rename_map = {}
for target, aliases in (('text', text_aliases), ('category', category_aliases)):
    if target in df.columns:
        continue  # already present under the standard name
    source = next((columns_by_key[a] for a in aliases if a in columns_by_key), None)
    if source is not None:
        rename_map[source] = target

if rename_map:
    df = df.rename(columns=rename_map)
    print(f"Renamed: {rename_map}")

print(f"Final columns: {df.columns.tolist()}")

# Check required columns exist
if 'text' not in df.columns or 'category' not in df.columns:
    print("\n⚠️ Could not find 'text' and 'category' columns automatically.")
    print("Available columns:", df.columns.tolist())
    print("\nPlease manually set the column names below:")
    # Manual override - uncomment and modify as needed
    # df = df.rename(columns={'YOUR_TEXT_COLUMN': 'text', 'YOUR_CATEGORY_COLUMN': 'category'})

## 4. Explore the Dataset

In [None]:
# Summary banner followed by size, category count and per-column null counts
rule = "=" * 60
print(rule, "DATASET INFORMATION", rule, sep="\n")
print(f"Total samples: {len(df)}")
print(f"Number of categories: {df['category'].nunique()}")
print("\nMissing values:")
print(df.isnull().sum())

In [None]:
# Drop rows that lack either the article text or its category
required_columns = ['text', 'category']
df = df.dropna(subset=required_columns)
print(f"After removing nulls: {len(df)} samples")

In [None]:
# Number of articles per category
print("\nCategory Distribution:")
category_counts = df['category'].value_counts()
print(category_counts)

In [None]:
# Horizontal bar chart of articles per category
category_counts = df['category'].value_counts()

plt.figure(figsize=(12, 6))
ax = category_counts.plot.barh(color='steelblue')
ax.set_title('Nepali News Category Distribution', fontsize=14, fontweight='bold')
ax.set_xlabel('Count')
ax.set_ylabel('Category')

# Write each count just to the right of its bar; bars are drawn in the same
# order as the counts, so the enumeration index is the bar's y position.
for row, count in enumerate(category_counts):
    ax.text(count + 10, row, str(count), va='center')

plt.tight_layout()
plt.show()

In [None]:
# Sample text from each category
print("\nSample news from each category:")
print("="*60)

# One grouped pass picks the first article of every category (in order of
# first appearance) instead of re-filtering the whole frame per category.
first_per_category = df.groupby('category', sort=False)['text'].first()

for cat, sample in first_per_category.items():
    sample = str(sample)
    # Show at most 100 characters, marking truncation with an ellipsis.
    sample_text = (sample[:100] + "...") if len(sample) > 100 else sample
    print(f"\n📰 [{cat}]")
    print(f"   {sample_text}")

## 5. Text Preprocessing

In [None]:
# Patterns are compiled once. The Devanagari ranges are written as \u escapes so
# they do not depend on how this file's bytes are decoded: a mis-decoded literal
# digit range silently turns into a class that deletes every Devanagari letter.
_URL_RE = re.compile(r'http\S+|www\S+')
_DIGIT_RE = re.compile(r'[0-9\u0966-\u096F]+')          # ASCII and Devanagari digits
_NON_DEVANAGARI_RE = re.compile(r'[^\u0900-\u097F\s]')  # outside the Devanagari block


def preprocess_nepali_text(text):
    """
    Clean Nepali text for classification.

    Steps, in order: convert the input to ``str``, remove URLs, remove ASCII
    and Devanagari digits, drop every character that is neither whitespace
    nor in the Devanagari block (U+0900-U+097F), and collapse whitespace.

    Args:
        text: Raw text; any object is accepted and converted with ``str()``.

    Returns:
        The cleaned string with single spaces between tokens and no leading
        or trailing whitespace. It is empty when nothing Devanagari remains.
    """
    text = str(text)

    # Remove URLs
    text = _URL_RE.sub('', text)

    # Remove numbers
    text = _DIGIT_RE.sub('', text)

    # Keep only Devanagari characters and spaces
    text = _NON_DEVANAGARI_RE.sub('', text)

    # split()/join collapses whitespace runs and trims both ends in one step
    return ' '.join(text.split())

# Clean every article and keep the result in a new 'clean_text' column
print("Preprocessing text...")
df['clean_text'] = df['text'].map(preprocess_nepali_text)

# Show the first article before and after cleaning (first 80 characters each)
first_raw = str(df['text'].iloc[0])
first_clean = df['clean_text'].iloc[0]
print("\nExample:")
print(f"Original: {first_raw[:80]}...")
print(f"Cleaned:  {first_clean[:80]}...")

In [None]:
# Keep only rows whose cleaned text has at least 20 characters
long_enough = df['clean_text'].str.len() >= 20
df = df[long_enough]
print(f"After removing short texts: {len(df)} samples")

In [None]:
# Turn category names into integer ids stored in a new 'label' column
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['category'])

class_names = label_encoder.classes_
samples_per_label = df['label'].value_counts()

# Print the id -> category mapping together with each class's sample count
print("Label Encoding:")
for label_id, class_name in enumerate(class_names):
    print(f"  {label_id}: {class_name} ({samples_per_label[label_id]} samples)")

num_classes = len(class_names)
print(f"\nTotal classes: {num_classes}")

## 6. Train/Test Split

In [None]:
# Stratified 80/20 split of the cleaned texts and their integer labels
X, y = df['clean_text'].values, df['label'].values

split_parts = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED, stratify=y
)
X_train, X_test, y_train, y_test = split_parts

print(f"Training: {len(X_train)} samples")
print(f"Test: {len(X_test)} samples")

## 7. Traditional ML Models

In [None]:
# Create TF-IDF features
print("Creating TF-IDF features...")
# The vectorizer's default token_pattern is built on \w, which does not match
# Devanagari vowel signs or the virama (they are combining marks), so it would
# cut Nepali words into fragments. The cleaned text holds only Devanagari
# characters and whitespace, so a token is defined here as any run of
# characters that is neither whitespace nor a danda (U+0964 / U+0965).
tfidf = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 2),
    min_df=2,
    token_pattern=r'[^\s\u0964\u0965]+',
)

# Fit the vocabulary on the training split only, then reuse it for the test split.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(f"TF-IDF shape: {X_train_tfidf.shape}")

In [None]:
# Train and evaluate models
models = {
    'Naive Bayes': MultinomialNB(),
    'SVM': LinearSVC(random_state=RANDOM_SEED, max_iter=2000),
    'Logistic Regression': LogisticRegression(random_state=RANDOM_SEED, max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED, n_jobs=-1)
}

# name -> {'accuracy', 'f1_score', 'model'}; later cells read these keys.
results = {}

print("="*60)
print("TRAINING ML MODELS")
print("="*60)

for name, model in models.items():
    print(f"\n▶ Training {name}...")
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    # Score on the held-out split; F1 is averaged weighted by class support.
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    results[name] = {'accuracy': acc, 'f1_score': f1, 'model': model}
    print(f"  Accuracy: {acc:.4f}")
    print(f"  F1 Score: {f1:.4f}")

In [None]:
# Grouped bar chart of accuracy and F1 for every entry in `results`
score_rows = {}
for model_name, metrics in results.items():
    score_rows[model_name] = {'Accuracy': metrics['accuracy'], 'F1': metrics['f1_score']}
results_df = pd.DataFrame(score_rows).T

fig, ax = plt.subplots(figsize=(10, 6))
x = np.arange(len(results_df))
width = 0.35
half = width / 2

# (column, legend label, colour, horizontal offset of the bar within its group)
bar_specs = [
    ('Accuracy', 'Accuracy', '#2ecc71', -half),
    ('F1', 'F1 Score', '#3498db', half),
]
for column, legend_label, colour, offset in bar_specs:
    ax.bar(x + offset, results_df[column], width, label=legend_label, color=colour)

ax.set_ylabel('Score')
ax.set_title('ML Models Performance', fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(results_df.index, rotation=15)
ax.legend()
ax.set_ylim(0, 1.1)

# Print each score slightly above its bar
for column, _legend_label, _colour, offset in bar_specs:
    for position, score in enumerate(results_df[column]):
        ax.text(position + offset, score + 0.02, f'{score:.3f}', ha='center', fontsize=9)

plt.tight_layout()
plt.show()

In [None]:
# Best model report
# Only entries that carry a fitted estimator are candidates. `results` can also
# hold metric-only entries (e.g. 'mBERT') when this cell is re-run later, and
# selecting one of those would fail on the missing 'model' key.
ml_results = {name: info for name, info in results.items() if 'model' in info}
best_name = max(ml_results, key=lambda name: ml_results[name]['accuracy'])
best_model = ml_results[best_name]['model']
y_pred_best = best_model.predict(X_test_tfidf)

print(f"\n{'='*60}")
print(f"BEST MODEL: {best_name}")
print(f"{'='*60}")
print(f"Accuracy: {ml_results[best_name]['accuracy']:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_best, target_names=label_encoder.classes_))

In [None]:
# Heatmap of the best classical model's confusion matrix
class_names = label_encoder.classes_
cm = confusion_matrix(y_test, y_pred_best)

plt.figure(figsize=(12, 10))
heatmap_options = dict(
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
sns.heatmap(cm, **heatmap_options)
plt.title(f'Confusion Matrix - {best_name}', fontweight='bold')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

## 8. Deep Learning with mBERT

In [None]:
# Hyperparameters for the mBERT experiment
MAX_LENGTH, BATCH_SIZE, EPOCHS = 128, 16, 3
LEARNING_RATE = 2e-5
# Use a sample for faster training: at most 3000 rows, fewer if the frame is smaller
SAMPLE_SIZE = len(df) if len(df) < 3000 else 3000

print(f"BERT Config: MAX_LENGTH={MAX_LENGTH}, BATCH_SIZE={BATCH_SIZE}, EPOCHS={EPOCHS}")
print(f"Using {SAMPLE_SIZE} samples for training")

In [None]:
# Load mBERT tokenizer
print("Loading mBERT...")
# Same checkpoint name as the classification model loaded later in the notebook.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
print("✅ Tokenizer loaded!")

In [None]:
# Dataset class
class NepaliDataset(Dataset):
    """Dataset that tokenizes one text per item at access time.

    Args:
        texts: Indexable collection of texts; each item is passed through ``str()``.
        labels: Indexable collection of class ids, aligned with ``texts``.
        tokenizer: Object providing ``encode_plus``.
        max_len: Value passed to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for item ``idx``."""
        text = str(self.texts[idx])
        encoded = self.tokenizer.encode_plus(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # Flatten each encoded tensor to 1-D.
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

In [None]:
# Draw the mBERT subset, then carve it into train / validation / test parts
df_bert = df.sample(n=SAMPLE_SIZE, random_state=RANDOM_SEED)
X_b = df_bert['clean_text'].values
y_b = df_bert['label'].values

# First hold out 20% of the sample as the test split (stratified by label) ...
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    X_b,
    y_b,
    test_size=0.2,
    random_state=RANDOM_SEED,
    stratify=y_b,
)
# ... then take 10% of the remaining rows as the validation split.
X_train_b, X_val_b, y_train_b, y_val_b = train_test_split(
    X_train_b,
    y_train_b,
    test_size=0.1,
    random_state=RANDOM_SEED,
    stratify=y_train_b,
)

print(f"Train: {len(X_train_b)}, Val: {len(X_val_b)}, Test: {len(X_test_b)}")

In [None]:
# Wrap each split in a NepaliDataset, then in a DataLoader
train_ds, val_ds, test_ds = [
    NepaliDataset(texts, labels, tokenizer, MAX_LENGTH)
    for texts, labels in (
        (X_train_b, y_train_b),
        (X_val_b, y_val_b),
        (X_test_b, y_test_b),
    )
]

# Only the training loader shuffles its samples.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader, test_loader = [
    DataLoader(split_ds, batch_size=BATCH_SIZE) for split_ds in (val_ds, test_ds)
]

print(f"Batches - Train: {len(train_loader)}, Val: {len(val_loader)}, Test: {len(test_loader)}")

In [None]:
# Load mBERT model
print("Loading mBERT model...")
# The classification head is sized to the number of encoded categories, and the
# model is moved to the selected device.
bert_model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased', num_labels=num_classes
).to(device)
print(f"✅ Model loaded on {device}")

In [None]:
# Optimizer and scheduler
# Use PyTorch's AdamW: the AdamW class exported by transformers is deprecated in
# favour of torch.optim.AdamW. eps and weight_decay are pinned to the defaults of
# the transformers implementation so the optimisation itself is unchanged.
# NOTE(review): the `AdamW` name in the transformers import at the top of the
# notebook is unused after this change and can fail to import on transformers
# releases that dropped it - remove it from that import line.
optimizer = torch.optim.AdamW(
    bert_model.parameters(), lr=LEARNING_RATE, eps=1e-6, weight_decay=0.0
)

# Linear decay to zero over all optimisation steps, with no warm-up phase.
total_training_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_training_steps
)

In [None]:
# Training functions
def train_epoch(model, loader, opt, sched):
    """Run one training pass over ``loader``.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and ``labels``;
            its output must expose ``loss`` and ``logits``.
        loader: Iterable of batches, each a dict with those three tensors.
        opt: Optimizer, stepped once per batch.
        sched: Learning-rate scheduler, stepped once per batch.

    Returns:
        Tuple ``(mean_loss, accuracy)``: the summed batch losses divided by the
        number of batches, and correct predictions divided by samples seen.
    """
    model.train()
    running_loss = 0
    n_correct = 0
    n_seen = 0
    for batch in tqdm(loader, desc='Training'):
        opt.zero_grad()
        labels = batch['labels'].to(device)
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=labels,
        )
        loss = outputs.loss
        loss.backward()
        # Clip the gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()
        sched.step()

        running_loss += loss.item()
        predictions = outputs.logits.argmax(dim=1)
        n_correct += (predictions == labels).sum().item()
        n_seen += labels.size(0)
    return running_loss / len(loader), n_correct / n_seen

@torch.no_grad()
def evaluate(model, loader):
    """Score ``model`` on ``loader`` without tracking gradients.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and ``labels``;
            its output must expose ``loss`` and ``logits``.
        loader: Iterable of batches, each a dict with those three tensors.

    Returns:
        Tuple ``(mean_loss, accuracy, all_preds, all_labels)``: the summed batch
        losses divided by the number of batches, correct predictions divided by
        samples seen, and the flat lists of predicted and true class ids.
    """
    model.eval()
    running_loss = 0
    n_correct = 0
    n_seen = 0
    all_preds = []
    all_labels = []
    for batch in tqdm(loader, desc='Evaluating'):
        labels = batch['labels'].to(device)
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=labels,
        )
        running_loss += outputs.loss.item()
        predictions = outputs.logits.argmax(dim=1)
        n_correct += (predictions == labels).sum().item()
        n_seen += labels.size(0)
        all_preds.extend(predictions.cpu().numpy())
        all_labels.extend(batch['labels'].numpy())
    return running_loss / len(loader), n_correct / n_seen, all_preds, all_labels

In [None]:
# Train mBERT
print("\n" + "="*60)
print("TRAINING mBERT")
print("="*60)

# Start below any reachable accuracy so the first epoch always writes a
# checkpoint; starting at 0 would leave no 'best_mbert.pt' on disk if the
# validation accuracy were 0 in every epoch.
best_val_acc = float('-inf')
for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch+1}/{EPOCHS}")
    train_loss, train_acc = train_epoch(bert_model, train_loader, optimizer, scheduler)
    val_loss, val_acc, _, _ = evaluate(bert_model, val_loader)
    print(f"  Train - Loss: {train_loss:.4f}, Acc: {train_acc:.4f}")
    print(f"  Val   - Loss: {val_loss:.4f}, Acc: {val_acc:.4f}")
    # Keep the weights of the epoch with the highest validation accuracy.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(bert_model.state_dict(), 'best_mbert.pt')
        print("  ✓ Saved best model!")

In [None]:
# Test mBERT
# map_location remaps the saved tensors onto the current device, so a checkpoint
# written in a GPU session can still be restored in a CPU-only session.
bert_model.load_state_dict(torch.load('best_mbert.pt', map_location=device))
test_loss, test_acc, y_pred_bert, y_true_bert = evaluate(bert_model, test_loader)
bert_f1 = f1_score(y_true_bert, y_pred_bert, average='weighted')

print(f"\n{'='*60}")
print("mBERT TEST RESULTS")
print(f"{'='*60}")
print(f"Accuracy: {test_acc:.4f}")
print(f"F1 Score: {bert_f1:.4f}")

# Metric-only entry: unlike the classical models it stores no 'model' object.
results['mBERT'] = {'accuracy': test_acc, 'f1_score': bert_f1}

## 9. Final Comparison

In [None]:
# Table of accuracy and F1 for every evaluated model, best accuracy first
print("\n" + "="*60)
print("FINAL COMPARISON")
print("="*60)

comparison_rows = {}
for model_name, metrics in results.items():
    if 'accuracy' not in metrics:
        continue  # skip entries without recorded metrics
    comparison_rows[model_name] = {
        'Accuracy': metrics['accuracy'],
        'F1': metrics['f1_score'],
    }

final_df = pd.DataFrame(comparison_rows).T.sort_values('Accuracy', ascending=False)

print(final_df)

In [None]:
# Grouped bar chart comparing accuracy and F1 across all models in final_df
plt.figure(figsize=(12, 6))
x = np.arange(len(final_df))
width = 0.35
half = width / 2

# (column, legend label, colour, horizontal offset of the bar within its group)
bar_specs = [
    ('Accuracy', 'Accuracy', '#27ae60', -half),
    ('F1', 'F1 Score', '#3498db', half),
]
for column, legend_label, colour, offset in bar_specs:
    plt.bar(x + offset, final_df[column], width, label=legend_label, color=colour)

plt.ylabel('Score')
plt.title('All Models Comparison - Nepali News Classification', fontweight='bold')
plt.xticks(x, final_df.index, rotation=15)
plt.legend()
plt.ylim(0, 1.1)

# Print each score slightly above its bar
for column, _legend_label, _colour, offset in bar_specs:
    for position, score in enumerate(final_df[column]):
        plt.text(position + offset, score + 0.02, f'{score:.3f}', ha='center', fontsize=9)

plt.tight_layout()
plt.show()

## 10. Prediction Function

In [None]:
def predict(text, use_bert=False):
    """Predict the news category of a Nepali text.

    The text is cleaned with ``preprocess_nepali_text`` first. By default the
    TF-IDF features and ``best_model`` are used; with ``use_bert=True`` the
    cleaned text is tokenized and classified by ``bert_model``.

    Args:
        text: Raw input text.
        use_bert: Select the mBERT model instead of the classical model.

    Returns:
        The category name decoded by ``label_encoder``.
    """
    clean = preprocess_nepali_text(text)

    if not use_bert:
        features = tfidf.transform([clean])
        class_id = best_model.predict(features)[0]
        return label_encoder.inverse_transform([class_id])[0]

    encoded = tokenizer.encode_plus(
        clean,
        max_length=MAX_LENGTH,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    bert_model.eval()
    with torch.no_grad():
        output = bert_model(
            input_ids=encoded['input_ids'].to(device),
            attention_mask=encoded['attention_mask'].to(device),
        )
        class_id = torch.argmax(output.logits, dim=1).cpu().item()
    return label_encoder.inverse_transform([class_id])[0]

# Test
# Demo sentences in Devanagari. Mis-encoded literals contain no Devanagari
# characters, so preprocessing would reduce them to empty strings.
test_texts = [
    # "The Prime Minister announced a new policy in parliament."
    "प्रधानमन्त्रीले संसदमा नयाँ नीति घोषणा गरे।",
    # "The Nepali cricket team achieved victory in the World Cup."
    "नेपाली क्रिकेट टोलीले विश्वकपमा जित हासिल गर्यो।",
    # "The share market saw a notable decline today."
    "शेयर बजारमा आज उल्लेख्य गिरावट आयो।"
]

print("\nPredictions:")
for text in test_texts:
    pred = predict(text)
    print(f"  '{text[:50]}...' → {pred}")

In [None]:
# Closing summary: dataset size, class count and the headline accuracies
print("\n" + "="*60)
print("✅ NOTEBOOK COMPLETE!")
print("="*60)
print(f"\nDataset: {len(df)} Nepali news articles")
print(f"Categories: {num_classes}")
print(f"Best ML: {best_name} ({results[best_name]['accuracy']:.2%})")
print(f"mBERT: {test_acc:.2%}")
# The closing line is the Nepal flag followed by "Thank you!" in Nepali.
print("\n🇳🇵 धन्यवाद!")