# Mini AI Pipeline Notebook
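This notebook classifies AG News articles into four topics (World, Sports, Business, Sci/Tech) and compares two methods: a keyword-matching baseline and an AI pipeline (sentence embeddings + logistic regression).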

In [8]:
# Cell 1: Setup & imports
import random
import numpy as np
import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report

import re
import joblib, os, json
import matplotlib.pyplot as plt
import seaborn as sns

# One seed for the whole notebook; it is also passed explicitly to the
# sampling helper and the classifier further down.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)     # Python's built-in RNG
np.random.seed(RANDOM_SEED)  # NumPy's global RNG

## Load and sample dataset

In [9]:
# Load AG News through the `datasets` library; the 'train' and 'test' splits
# of the returned object are sampled below.
dataset = load_dataset('ag_news')

def sample_balanced(ds, n_per_class=250, seed=RANDOM_SEED):
    """Draw the same number of random rows from every class, then shuffle.

    Parameters
    ----------
    ds
        Any object ``pd.DataFrame`` can consume that yields a ``'label'`` column.
    n_per_class : int
        Number of rows sampled for each distinct label value.
    seed : int
        ``random_state`` used for every per-class draw and for the final shuffle.

    Returns
    -------
    pd.DataFrame
        ``n_per_class`` rows per label, in shuffled order, with a fresh 0..N-1 index.
    """
    frame = pd.DataFrame(ds)
    class_ids = sorted(frame['label'].unique())

    # Sample each class independently so the result is exactly balanced.
    per_class = [
        frame.loc[frame['label'] == class_id].sample(n=n_per_class, random_state=seed)
        for class_id in class_ids
    ]

    # The concatenation is grouped by class; shuffle so row order carries no label signal.
    combined = pd.concat(per_class)
    shuffled = combined.sample(frac=1, random_state=seed)
    return shuffled.reset_index(drop=True)

# Integer class id -> human-readable topic name.
label_map = {
    0: 'World',
    1: 'Sports',
    2: 'Business',
    3: 'Sci/Tech',
}

# Balanced subsamples: 250 articles per class for training, 100 per class for testing.
train_df = sample_balanced(dataset['train'], n_per_class=250)
test_df = sample_balanced(dataset['test'], n_per_class=100)

# Keep the numeric label and add a readable copy next to it.
train_df['label_text'] = train_df['label'].map(label_map)
test_df['label_text'] = test_df['label'].map(label_map)

train_df.head()

Unnamed: 0,text,label,label_text
0,Coke is it: BHP coal to double BHP Billiton pl...,2,Business
1,BIG HIKE IN ENERGY BILLS The company says the ...,2,Business
2,Officials Blame Contractors in Tunnel Leak The...,2,Business
3,Before the Bell: Krispy Kreme Down 11 Pct Shar...,2,Business
4,Medical Examiner Finds No Injuries on Thanou-S...,1,Sports


## Baseline classifier

In [10]:
# Keyword lists for the rule-based baseline, one list per class name.
# Matching is case-insensitive (the predictor lower-cases both the text and
# each keyword), so entries may be written in their natural casing.
keywords = {
    'World': ['world','country','president','government','war','election'],
    'Sports': ['win','match','goal','player','season'],
    'Business': ['company','market','stocks','revenue','economy'],
    'Sci/Tech': ['technology','science','software','AI','robot','space']
}

def baseline_predict_one(t):
    """Classify one text by counting whole-word keyword hits per class.

    Each keyword that occurs at least once in the text (as a whole word,
    ignoring case) adds one point to its class.

    Parameters
    ----------
    t : str
        The text to classify.

    Returns
    -------
    str
        The class with the most points. Ties go to the alphabetically first
        class name; if no keyword matches at all, ``'World'`` is returned.
    """
    t = t.lower()
    scores = {
        c: sum(
            # The keyword must be lower-cased as well: the text already is, so an
            # upper-case entry such as 'AI' could otherwise never match.
            1 for w in words
            if re.search(r'\b' + re.escape(w.lower()) + r'\b', t)
        )
        for c, words in keywords.items()
    }
    m = max(scores.values())
    if m == 0:
        return 'World'
    # min() over the tied class names == alphabetically first.
    return min(c for c, v in scores.items() if v == m)

# Predict a class name for every test article with the keyword baseline,
# then report the fraction of predictions that equal the true class name.
test_df['pred_baseline'] = test_df['text'].map(baseline_predict_one)
baseline_acc = test_df['pred_baseline'].eq(test_df['label_text']).mean()
print('Baseline accuracy:', baseline_acc)

Baseline accuracy: 0.47


## AI Pipeline: Embedding + Logistic Regression

In [11]:
# Turn every article into a sentence embedding; these vectors are the features.
embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
X_train = embedder.encode(train_df['text'].tolist())
X_test = embedder.encode(test_df['text'].tolist())

# Targets are the integer class ids (not the readable names).
y_train = train_df['label'].values
y_test = test_df['label'].values

# `multi_class` is intentionally not passed: it is deprecated since
# scikit-learn 1.5 and slated for removal. With more than two classes and the
# 'saga' solver the default (scikit-learn >= 0.22) is already a multinomial
# model, so the fitted classifier is the same.
clf = LogisticRegression(max_iter=2000, solver='saga', random_state=RANDOM_SEED)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
ai_acc = accuracy_score(y_test, y_pred)
print('AI Pipeline accuracy:', ai_acc)

AI Pipeline accuracy: 0.88




## Compute Precision, Recall, F1

In [12]:
# Macro-averaged precision / recall / F1 for each method.
# The fourth returned value (support) is not needed and is discarded.

# Baseline: compared on class names.
precision_base, recall_base, f1_base, _ = precision_recall_fscore_support(
    y_true=test_df['label_text'],
    y_pred=test_df['pred_baseline'],
    average='macro',
)

# AI pipeline: compared on integer class ids.
precision_ai, recall_ai, f1_ai, _ = precision_recall_fscore_support(
    y_true=y_test,
    y_pred=y_pred,
    average='macro',
)

## Display results table

In [13]:
# Tab-separated summary: header first, then one identically formatted row per method.
print("Method\t\tAccuracy\tPrecision\tRecall\tF1")
print("\n".join(
    f"{method}\t{acc:.2f}\t\t{prec:.2f}\t\t{rec:.2f}\t{f1:.2f}"
    for method, acc, prec, rec, f1 in (
        ("Baseline", baseline_acc, precision_base, recall_base, f1_base),
        ("AI Pipeline", ai_acc, precision_ai, recall_ai, f1_ai),
    )
))

Method		Accuracy	Precision	Recall	F1
Baseline	0.47		0.73		0.47	0.45
AI Pipeline	0.88		0.88		0.88	0.88
