In [12]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Cell 1 — Setup

In [13]:
import sys
from pathlib import Path
import pandas as pd

# Treat the parent of the current working directory as the project root and
# register it on sys.path (once) so the `src.*` imports below can resolve.
# NOTE(review): this depends on the kernel's working directory being exactly
# one level below the project root — confirm for your launch setup.
PROJECT_ROOT = Path.cwd().resolve().parents[0]
_root_entry = str(PROJECT_ROOT)
if _root_entry not in sys.path:
    sys.path.append(_root_entry)

from src.data.load import load_yaml
from src.models.classifier import load_classifier
from src.models.summarizer import build_summarizer
from src.models.evaluation import evaluate_summaries

Cell 2 — Load model + summarizer

In [17]:
# Read the base configuration, then build the classifier and the summarizer
# from it (in that order).
config_path = PROJECT_ROOT / 'configs' / 'base.yaml'
cfg = load_yaml(config_path)

clf = load_classifier(cfg, root=PROJECT_ROOT)
summarizer = build_summarizer(cfg)

print('Loaded model + summarizer')
# Echo the loaded configuration as the cell output.
cfg

Device set to use cpu


Loaded model + summarizer


{'project': {'name': 'news_improvement', 'random_seed': 42},
 'data': {'raw_dir': 'data/raw',
  'processed_dir': 'data/processed',
  'interim_dir': 'data/interim',
  'sources': {'pakistan_today': {'filename': 'pakistan_today(full-data).csv',
    'encoding': 'utf-8'},
   'tribune': {'filename': 'tribune(full-data).csv', 'encoding': 'latin1'},
   'dawn': {'filename': 'dawn (full-data).csv', 'encoding': 'latin1'},
   'daily_times': {'filename': 'daily_times(full-data).csv',
    'encoding': 'utf-8'},
   'business_reorder': {'filename': 'business_recorder(2020-2023).csv',
    'encoding': 'latin1'}},
  'use_sample': True,
  'sample': {'per_source': 10000},
  'large_file': {'business_reorder_chunksize': 20000}},
 'categories': {'mapping_file': 'configs/category_mapping_v1.yaml',
  'unknown_to': 'Other',
  'min_samples_per_class': 1000},
 'preprocessing': {'text_cleaning': {'lower': True,
   'remove_punctuation': True,
   'remove_numbers': True,
   'remove_stopwords': True,
   'lemmatize': Fal

Cell 3 — User input (headline + description)

In [15]:
# Hand-written sample article used for the single-item walkthrough.
headline = 'Pakistan faces rising inflation amid global uncertainty'
description = """
Pakistan’s inflation rate continued to rise on Tuesday, driven by higher fuel costs
and imported commodity prices. Analysts said the pressure may persist as global markets
remain volatile. Government officials announced new measures aimed at stabilizing prices
and improving supply chains, but critics questioned whether the steps will be enough.
"""

# The classifier input is headline and body joined by one space, with the
# outer whitespace trimmed. predict() is called with a one-element list, so
# take the first (only) label.
text = f'{headline} {description}'.strip()
labels = clf.predict([text])
pred = labels[0]
pred


'Pakistan'

Cell 4 — “Improved version” (current summarizer output)

In [16]:
# Summarize the sample description, passing the predicted category through
# to the summarizer.
# NOTE(review): in the recorded run the result is longer than the input and
# ends with a sentence that does not appear in the source text — check the
# summarizer's length settings before presenting this as an improvement.
improved = summarizer.summarize(description, category=pred)
# Show the generated text as the cell output.
improved


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Your max_length is set to 160, but your input_length is only 73. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=36)


'Pakistan’s inflation rate continued to rise on Tuesday, driven by higher fuel costs. Analysts said the pressure may persist as global markets remain volatile. Government officials announced new measures aimed at stabilizing prices and improving supply chains. But critics questioned whether the steps will be enough. The inflation rate is expected to remain high for the foreseeable future.'

Cell 5 — Readability comparison (for inclusion in the report)

In [18]:
# Score the single (original, summary) pair; ROUGE is switched off and the
# readability option is switched on.
# NOTE(review): the recorded output only contains 'num_samples' and
# 'avg_length_ratio' — no readability figures — so verify that
# compute_readability actually takes effect in evaluate_summaries.
originals = [description]
summaries = [improved]
metrics = evaluate_summaries(
    originals=originals,
    summaries=summaries,
    use_rouge=False,
    compute_readability=True,
)

metrics


{'num_samples': 1, 'avg_length_ratio': 1.16}

Cell 6 — Batch demo on 30 random samples (outputs are saved in Cell 7)

In [19]:
from src.data.load import load_all_sources, add_broad_category

# Tunables for the batch demo.
N_DEMO_SAMPLES = 30      # number of rows drawn for the demo table
MAX_INPUT_CHARS = 4000   # hard cap on description length to reduce extreme cases
MAX_SAVED_CHARS = 600    # truncation applied to text stored in the output table


def _as_text(value):
    """Return ``value`` as a string, mapping missing values to ''.

    pandas represents missing cells as NaN, which is truthy, so the usual
    ``value or ''`` guard lets NaN through and later slicing/concatenation
    raises TypeError. This helper handles None and NaN explicitly.
    """
    if isinstance(value, str):
        return value
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return ''
    return str(value)


df = load_all_sources(cfg, root=PROJECT_ROOT)
df = add_broad_category(df, cfg, root=PROJECT_ROOT)

# Never request more rows than exist; DataFrame.sample raises otherwise.
sample = df.sample(
    min(N_DEMO_SAMPLES, len(df)),
    random_state=cfg['project']['random_seed'],
).reset_index(drop=True)

rows = []
for record in sample.to_dict('records'):
    h = _as_text(record.get('headline'))
    d = _as_text(record.get('description'))[:MAX_INPUT_CHARS]

    # Summarize the description when there is one, otherwise the headline.
    source_text = d or h

    txt = (h + ' ' + d).strip()
    cat = clf.predict([txt])[0]

    try:
        out = summarizer.summarize(source_text, category=cat)
    except Exception as exc:
        # Best-effort demo: keep the row, fall back to the truncated original
        # text, and report the failure instead of aborting the whole batch.
        out = source_text[:MAX_SAVED_CHARS]
        print('Summarizer failed for one row:', exc)

    rows.append({
        'headline': h,
        'true_cat': record.get('broad_category', ''),
        'pred_cat': cat,
        'original': source_text[:MAX_SAVED_CHARS],
        'improved': (out or '')[:MAX_SAVED_CHARS],
    })

# Build the frame once from the collected records, then preview it.
out_df = pd.DataFrame(rows)
out_df.head(5)



INFO:src.data.load:Loading source pakistan_today from /home/spark/NUST/Semester 5/Data Mining/Project/data/raw/pakistan_today(full-data).csv (encoding=utf-8)
INFO:src.data.load:Loading source tribune from /home/spark/NUST/Semester 5/Data Mining/Project/data/raw/tribune(full-data).csv (encoding=latin1)
INFO:src.data.load:Loading source dawn from /home/spark/NUST/Semester 5/Data Mining/Project/data/raw/dawn (full-data).csv (encoding=latin1)
INFO:src.data.load:Loading source daily_times from /home/spark/NUST/Semester 5/Data Mining/Project/data/raw/daily_times(full-data).csv (encoding=utf-8)
INFO:src.data.load:Loading preprocessed business_reorder from /home/spark/NUST/Semester 5/Data Mining/Project/data/interim/business_reorder_clean.parquet
INFO:src.data.load:Filtered invalid sources: (625905, 7) -> (624642, 7)
INFO:src.data.load:Combined dataset shape: (624642, 7)
INFO:src.data.load:Sampling up to 10000 rows per source (__file__ column).
  .apply(lambda g: g.sample(min(len(g), per_sourc

Unnamed: 0,headline,true_cat,pred_cat,original,improved
0,Private firms setting up LNG terminals seek wa...,Business,Business,Have sought guarantees and exemption from regu...,The private sector is engaged in setting up li...
1,President Alvi tests positive for Covid-19 for...,Pakistan,Pakistan,President Arif Alvi said on Thursday that he h...,President Arif Alvi says he has tested positiv...
2,OPEC and allies likely to cut production if US...,Business,Business,"The group of oil producers known as OPEC+, inc...","OPEC and allies, including Russia, want others..."
3,Pacific leaders struggle to keep focus on clim...,World,World,US-China rivalry and an inter-island rift over...,Pacific Islands Forum is the region’s most imp...
4,Police register fraud case against 2-year-old ...,Pakistan,Pakistan,KARACHI: In an odd attempt the Karachi Police ...,Karachi Police registered a fraud case against...


Cell 7 — Save for report appendix

In [20]:
# Write the demo table under experiments/results, creating the folder
# (and any missing parents) first.
out_dir = PROJECT_ROOT / 'experiments' / 'results'
out_dir.mkdir(parents=True, exist_ok=True)

# Build the target path once and reuse it for the write and the message.
csv_path = out_dir / 'rewrite_demo_samples.csv'
out_df.to_csv(csv_path, index=False)
print('Saved:', csv_path)


Saved: /home/spark/NUST/Semester 5/Data Mining/Project/experiments/results/rewrite_demo_samples.csv
