In [9]:
from datasets import load_dataset

dataset = load_dataset("ag_news")
print(dataset["train"][0])

{'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", 'label': 2}


In [10]:
def map_label_to_category(label):
    # Original ag_news labels
    # 0: World
    # 1: Sports
    # 2: Business
    # 3: Sci/Tech
    if label == 0:
        return "Politics"       # World news → Politics / General
    elif label == 1:
        return "Sports"
    elif label == 2:
        return "Business"
    elif label == 3:
        return "Health"         # Sci/Tech → Health / Lifestyle (approximation)


In [11]:
def add_category_column(example):
    example["category"] = map_label_to_category(example["label"])
    return example

dataset = dataset.map(add_category_column)


Map: 100%|██████████| 120000/120000 [00:12<00:00, 9394.89 examples/s] 
Map: 100%|██████████| 7600/7600 [00:00<00:00, 9755.81 examples/s] 


In [12]:
from collections import Counter

categories = [x["category"] for x in dataset["train"]]
print(Counter(categories))


Counter({'Business': 30000, 'Health': 30000, 'Sports': 30000, 'Politics': 30000})


In [13]:
from datasets import load_dataset
from collections import Counter
import pandas as pd

# Step 1: Load dataset
dataset = load_dataset("ag_news")

# Step 2: Map labels to Nigerian-like categories
def map_label_to_category(label):
    if label == 0:
        return "Politics"   # World → Politics
    elif label == 1:
        return "Sports"
    elif label == 2:
        return "Business"
    elif label == 3:
        return "Health"     # Sci/Tech → Health (approximate)

# Step 3: Add new category column
def add_category_column(example):
    example["category"] = map_label_to_category(example["label"])
    return example

dataset = dataset.map(add_category_column)

# Step 4: Check category distribution
categories = [x["category"] for x in dataset["train"]]
print(Counter(categories))

# Optional: Save to CSV for later ML work
df = pd.DataFrame(dataset["train"])
df.to_csv("nigerian_news_dataset.csv", index=False)
print("✅ Saved processed dataset to nigerian_news_dataset.csv")


Map: 100%|██████████| 120000/120000 [00:14<00:00, 8523.24 examples/s] 
Map: 100%|██████████| 7600/7600 [00:00<00:00, 9313.76 examples/s] 


Counter({'Business': 30000, 'Health': 30000, 'Sports': 30000, 'Politics': 30000})
✅ Saved processed dataset to nigerian_news_dataset.csv


In [14]:
# Balance dataset: 200 per category
balanced_df = df.groupby("category").apply(lambda x: x.sample(200, random_state=42)).reset_index(drop=True)

print(balanced_df["category"].value_counts())
balanced_df.to_csv("nigerian_news_balanced.csv", index=False)
print("✅ Saved balanced dataset to nigerian_news_balanced.csv")


category
Business    200
Health      200
Politics    200
Sports      200
Name: count, dtype: int64
✅ Saved balanced dataset to nigerian_news_balanced.csv


In [15]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(balanced_df, test_size=0.2, random_state=42, stratify=balanced_df["category"])

print("Train size:", len(train_df))
print("Test size:", len(test_df))


Train size: 640
Test size: 160


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

from joblib import Memory

# Create a memory cache folder
memory = Memory(location="./cachedir", verbose=0)

# Build pipeline
from sklearn.linear_model import LogisticRegression
model = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words="english")),
    ("clf", LogisticRegression(max_iter=500, random_state=42))
], memory=memory)



# Train
model.fit(train_df["text"], train_df["category"])

# Test
y_pred = model.predict(test_df["text"])


print("Accuracy:", accuracy_score(test_df["category"], y_pred))
print(classification_report(test_df["category"], y_pred))


Accuracy: 0.825
              precision    recall  f1-score   support

    Business       0.74      0.70      0.72        40
      Health       0.71      0.75      0.73        40
    Politics       0.90      0.88      0.89        40
      Sports       0.95      0.97      0.96        40

    accuracy                           0.82       160
   macro avg       0.82      0.83      0.82       160
weighted avg       0.82      0.82      0.82       160



In [27]:
#from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.linear_model import LogisticRegression
#from sklearn.pipeline import Pipeline
import joblib

news_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), stop_words="english")),
    ("clf", LogisticRegression(random_state=42, max_iter=1000))
])

news_pipeline.fit(train_df["text"], train_df["category"])

joblib.dump(news_pipeline, "news_classifier.pkl")


['news_classifier.pkl']

In [21]:
new_article = """
Nigerian President addresses the nation on upcoming elections and political reforms.
"""
predicted_category = news_pipeline.predict([new_article])[0]
print("Predicted category:", predicted_category)

Predicted category: Politics


In [29]:
import pandas as pd

# Load dataset
df = pd.read_csv("nigerian_news_scrapedv2.csv")

# Take a random sample (5–20 rows, configurable)
sample_df = df.sample(n=10, random_state=42)  # change n as needed

# Pick the column with text (e.g., "title" or "text")
X_new = sample_df["text"].fillna("").tolist()

pipeline = joblib.load("news_classifier.pkl")
# Predict categories using your trained pipeline
predictions = pipeline.predict(X_new)

# Build results DataFrame
results = pd.DataFrame({
    "text": X_new,
    "label": predictions
})

print(results)

# Save to CSV
results.to_csv("classified_sample.csv", index=False)
print("✅ Saved classified_sample.csv")


                                                text     label
0  For over two decades, non-state actors have wr...  Politics
1  The coalition-backed African Democratic Congre...  Politics
2  I have been a prosecutor for a while now and I...    Health
3  The News You Need, Delivered To You.\n\nSubscr...    Health
4  \n\nAryna Sabalenka sealed back-to-back titles...    Sports
5  Haruna Usman\n\nThe Global Affairs Canada Init...    Health
6  Manchester United Women were left scrambling j...    Sports
7  Kaduna State Governor Uba Sani has pledged to ...  Politics
8  The Minister of State for Agriculture and Food...  Politics
9     Discover more insights and updates on National  Politics
✅ Saved classified_sample.csv
