In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules on disk are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

Cell 1 — Setup

In [2]:
import sys
from pathlib import Path
import pandas as pd

# Robust project root resolver
def find_project_root(start: Path) -> Path:
    """Walk upward from ``start`` until the project root is found.

    A directory counts as the project root when it contains both the file
    ``configs/base.yaml`` and an entry named ``src``.

    Args:
        start: Directory at which the search begins; it is resolved to an
            absolute path first.

    Returns:
        The nearest directory (``start`` itself or one of its ancestors)
        that satisfies the root criteria.

    Raises:
        FileNotFoundError: If neither ``start`` nor any ancestor qualifies.
    """
    origin = start.resolve()
    # Check the starting directory first, then each ancestor, nearest first.
    for candidate in (origin, *origin.parents):
        has_base_config = (candidate / 'configs' / 'base.yaml').exists()
        if has_base_config and (candidate / 'src').exists():
            return candidate
    raise FileNotFoundError("Could not locate project root")

# Locate the repository root from the notebook's working directory and put it
# at the front of sys.path (only once) so the `src.*` imports below resolve.
PROJECT_ROOT = find_project_root(Path.cwd())
if not any(entry == str(PROJECT_ROOT) for entry in sys.path):
    sys.path.insert(0, str(PROJECT_ROOT))

from src.data.load import load_yaml
from src.models.classifier import load_classifier
from src.models.summarizer import build_summarizer
from src.models.evaluation import evaluate_summaries

# Directory holding experiment artifacts (read and written by later cells).
RESULTS_DIR = PROJECT_ROOT.joinpath('experiments', 'results')

2025-12-18 21:52:13.487662: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-18 21:52:13.557902: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-12-18 21:52:15.110623: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


Cell 2 — Load model + summarizer

In [3]:
# 1. Read the ID of the best experiment from the first line of the results
#    file. The line is split on ': ' and the second field is taken as the ID,
#    i.e. it is expected to look like "<label>: <experiment_id>".
best_run_path = RESULTS_DIR / 'best_model_run.txt'
with open(best_run_path, 'r', encoding='utf-8') as f:
    first_line = f.readline()

# Validate before indexing: an empty or malformed file would otherwise raise a
# bare IndexError, and a blank ID would silently resolve to "configs/.yaml".
fields = first_line.split(': ')
if len(fields) < 2 or not fields[1].strip():
    raise ValueError(
        f"Expected '<label>: <experiment_id>' on the first line of "
        f"{best_run_path}, got {first_line!r}"
    )
BEST_EXP_ID = fields[1].strip()

print(f"Using Best Classifier from Experiment: {BEST_EXP_ID}")

# 2. Load the corresponding configuration and model
cfg_path = PROJECT_ROOT / 'configs' / f"{BEST_EXP_ID}.yaml"
cfg = load_yaml(cfg_path)
clf = load_classifier(cfg, root=PROJECT_ROOT)

# 3. Build the summarizer from the same configuration
summarizer = build_summarizer(cfg)

# Display the loaded classifier as the cell output.
clf

Using Best Classifier from Experiment: tfidf_svm_bigrams


Device set to use cpu


0,1,2
,steps,"[('vect', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,functools.par...tize': False})
,tokenizer,
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,2.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,200


Cell 3 — User input (headline + description)

In [4]:
# Example article for the single-item demo: a headline plus a multi-line body.
headline = 'Pakistan faces rising inflation amid global uncertainty'
description = """
Pakistan’s inflation rate continued to rise on Tuesday, driven by higher fuel costs
and imported commodity prices. Analysts said the pressure may persist as global markets
remain volatile. Government officials announced new measures aimed at stabilizing prices
and improving supply chains, but critics questioned whether the steps will be enough.
"""

# The classifier receives headline and body as one space-joined string;
# strip() drops the trailing newline left by the triple-quoted literal.
text = ' '.join((headline, description)).strip()
pred = clf.predict([text])[0]
pred


'Business'

Cell 4 — “Improved version” (current summarizer output)

In [5]:
# Rewrite the body text only (the headline is not passed), handing the
# predicted category to the project summarizer.
# NOTE(review): the recorded output of this cell is longer than the input and
# ends with a sentence that does not appear in `description`; the length
# warning in the output suggests the summarizer's max_length is too large for
# short inputs — TODO confirm in the summarizer configuration.
improved = summarizer.summarize(description, category=pred)
improved


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Your max_length is set to 160, but your input_length is only 73. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=36)


'Pakistan’s inflation rate continued to rise on Tuesday, driven by higher fuel costs. Analysts said the pressure may persist as global markets remain volatile. Government officials announced new measures aimed at stabilizing prices and improving supply chains. But critics questioned whether the steps will be enough. The inflation rate is expected to remain high for the foreseeable future.'

Cell 5 — Readability comparison (good material for the report)

In [6]:
# Evaluate the single original/rewrite pair with ROUGE disabled and
# readability requested.
# NOTE(review): although compute_readability=True is passed, the recorded
# output contains only `num_samples` and `avg_length_ratio`; presumably the
# readability scores were skipped (e.g. an optional dependency is missing) —
# verify against evaluate_summaries before using this in the report.
metrics = evaluate_summaries(
    originals=[description],
    summaries=[improved],
    use_rouge=False,
    compute_readability=True,
)

metrics


{'num_samples': 1, 'avg_length_ratio': 1.16}

Cell 6 — Batch demo on 30 random samples (save outputs)

In [7]:
from src.data.load import load_all_sources, add_broad_category

# Tunable limits for the batch demo.
N_DEMO_SAMPLES = 30       # number of random rows to rewrite
MAX_INPUT_CHARS = 4000    # hard cap on body length to reduce extreme cases
MAX_DISPLAY_CHARS = 600   # cap on text stored in the output table


def _as_text(value) -> str:
    """Return ``value`` unchanged if it is a string, otherwise ``''``.

    Missing cells in records produced by ``DataFrame.to_dict('records')`` can
    be ``None`` or float NaN. NaN is truthy, so ``value or ''`` does not
    neutralise it and later slicing/concatenation would raise a TypeError.
    """
    return value if isinstance(value, str) else ''


df = load_all_sources(cfg, root=PROJECT_ROOT)
df = add_broad_category(df, cfg, root=PROJECT_ROOT)

# Never request more rows than exist; DataFrame.sample raises otherwise.
sample = df.sample(
    min(N_DEMO_SAMPLES, len(df)),
    random_state=cfg['project']['random_seed'],
).reset_index(drop=True)

rows = []
for r in sample.to_dict('records'):
    h = _as_text(r.get('headline'))
    d = _as_text(r.get('description'))[:MAX_INPUT_CHARS]

    # Classify on headline + body, matching the single-item demo above.
    txt = (h + ' ' + d).strip()
    cat = clf.predict([txt])[0]

    # Summarize the body, falling back to the headline when the body is empty.
    source_text = d or h
    try:
        out = summarizer.summarize(source_text, category=cat)
    except Exception as exc:
        # Deliberate best-effort: keep the demo running and show the
        # (truncated) original text for the row that failed.
        out = source_text[:MAX_DISPLAY_CHARS]
        print('Summarizer failed for one row:', exc)

    rows.append({
        'headline': h,
        'true_cat': r.get('broad_category', ''),
        'pred_cat': cat,
        'original': source_text[:MAX_DISPLAY_CHARS],
        'improved': (out or '')[:MAX_DISPLAY_CHARS],
    })

out_df = pd.DataFrame(rows)
out_df.head(5)



INFO:src.data.load:Loading source pakistan_today from /home/spark/NUST/Semester 5/Data Mining/Project/data/raw/pakistan_today(full-data).csv (encoding=utf-8)
INFO:src.data.load:Loading source tribune from /home/spark/NUST/Semester 5/Data Mining/Project/data/raw/tribune(full-data).csv (encoding=latin1)
INFO:src.data.load:Loading source dawn from /home/spark/NUST/Semester 5/Data Mining/Project/data/raw/dawn (full-data).csv (encoding=latin1)
INFO:src.data.load:Loading source daily_times from /home/spark/NUST/Semester 5/Data Mining/Project/data/raw/daily_times(full-data).csv (encoding=utf-8)
INFO:src.data.load:Loading preprocessed business_reorder from /home/spark/NUST/Semester 5/Data Mining/Project/data/interim/business_reorder_clean.parquet
INFO:src.data.load:Filtered invalid sources: (625905, 7) -> (624642, 7)
INFO:src.data.load:Combined dataset shape: (624642, 7)
INFO:src.data.load:Sampling up to 10000 rows per source (__file__ column).
  .apply(lambda g: g.sample(min(len(g), per_sourc

Unnamed: 0,headline,true_cat,pred_cat,original,improved
0,Private firms setting up LNG terminals seek wa...,Business,Business,Have sought guarantees and exemption from regu...,The private sector is engaged in setting up li...
1,President Alvi tests positive for Covid-19 for...,Pakistan,Pakistan,President Arif Alvi said on Thursday that he h...,President Arif Alvi says he has tested positiv...
2,OPEC and allies likely to cut production if US...,Business,Business,"The group of oil producers known as OPEC+, inc...","OPEC and allies, including Russia, want others..."
3,Pacific leaders struggle to keep focus on clim...,World,World,US-China rivalry and an inter-island rift over...,Pacific Islands Forum is the region’s most imp...
4,Police register fraud case against 2-year-old ...,Pakistan,Pakistan,KARACHI: In an odd attempt the Karachi Police ...,Karachi Police registered a fraud case against...


Cell 7 — Save for report appendix

In [9]:
# Reuse RESULTS_DIR from the setup cell instead of rebuilding the same path,
# so the save location cannot drift from where the best-model file is read.
out_dir = RESULTS_DIR
out_dir.mkdir(parents=True, exist_ok=True)

# Build the target path once so the file written and the path printed agree.
out_path = out_dir / 'rewrite_demo_samples.csv'
out_df.to_csv(out_path, index=False)
print('Saved:', out_path)


Saved: /home/spark/NUST/Semester 5/Data Mining/Project/experiments/results/rewrite_demo_samples.csv
