<a href="https://colab.research.google.com/github/Srivinay2005/Text-Summarization-Geo-Tagged-QA-Pipeline/blob/main/pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# STEP 1: Install dependencies
!pip install transformers torch spacy geopy pandas tqdm rouge-score
!python -m spacy download en_core_web_sm

In [34]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
# STEP 2: Import libraries
import pandas as pd
from transformers import pipeline
import spacy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from tqdm import tqdm
import json

In [2]:
# STEP 3: Load dataset (upload train.csv from Kaggle)
from google.colab import files
df = pd.read_csv("/content/drive/MyDrive/train.csv")
df = df.head(5000).reset_index(drop=True)
df.head(2)

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...


In [3]:
import torch
print("CUDA Available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU Name:", torch.cuda.get_device_name(0))

CUDA Available: True
GPU Name: Tesla T4


In [9]:
# STEP 4: Summarization pipeline (DistilBART, GPU)
from transformers import pipeline

summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",  # smaller & faster summarizer
    device=0  # GPU mode
)

auto_summaries = []

# Summarize with checkpoint saving
for i, text in enumerate(tqdm(df["article"].astype(str).tolist(), desc="Summarizing")):
    if not text.strip():
        auto_summaries.append("")
        continue

    try:
        summary = summarizer(
            text,
            max_length=130,
            min_length=30,
            truncation=True
        )[0]['summary_text']
    except Exception as e:
        print(f"Skipped row {i} due to:", e)
        summary = ""

    auto_summaries.append(summary)

    # Save progress every 500 rows
    if (i + 1) % 500 == 0:
        df.loc[:i, "auto_summary"] = auto_summaries
        df[:i+1].to_csv(f"/content/drive/MyDrive/train_summarized_checkpoint_{i+1}.csv", index=False)
        print(f"Checkpoint saved at {i+1} records")

# Final save
df["auto_summary"] = auto_summaries
df.to_csv("/content/drive/MyDrive/train_summarized_2000.csv", index=False)
print(" Final file saved: train_summarized_2000.csv")


Device set to use cuda:0
Summarizing:  10%|█         | 500/5000 [08:12<1:13:56,  1.01it/s]

Checkpoint saved at 500 records


Summarizing:  14%|█▍        | 717/5000 [11:45<1:04:52,  1.10it/s]Your max_length is set to 130, but your input_length is only 71. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=35)
Summarizing:  20%|██        | 1000/5000 [16:08<54:15,  1.23it/s]

Checkpoint saved at 1000 records


Summarizing:  30%|███       | 1500/5000 [24:14<1:08:32,  1.18s/it]

Checkpoint saved at 1500 records


Summarizing:  38%|███▊      | 1875/5000 [30:27<45:07,  1.15it/s]Your max_length is set to 130, but your input_length is only 84. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=42)
Summarizing:  40%|████      | 2000/5000 [32:27<50:52,  1.02s/it]

Checkpoint saved at 2000 records


Summarizing:  45%|████▍     | 2234/5000 [36:12<40:23,  1.14it/s]Your max_length is set to 130, but your input_length is only 126. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=63)
Summarizing:  50%|█████     | 2500/5000 [40:23<40:49,  1.02it/s]

Checkpoint saved at 2500 records


Summarizing:  60%|██████    | 3000/5000 [48:22<36:08,  1.08s/it]

Checkpoint saved at 3000 records


Summarizing:  70%|███████   | 3500/5000 [56:20<29:18,  1.17s/it]

Checkpoint saved at 3500 records


Summarizing:  80%|███████▉  | 3977/5000 [1:04:09<17:18,  1.02s/it]Your max_length is set to 130, but your input_length is only 65. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=32)
Summarizing:  80%|████████  | 4000/5000 [1:04:34<20:21,  1.22s/it]

Checkpoint saved at 4000 records


Summarizing:  86%|████████▌ | 4282/5000 [1:09:05<12:30,  1.04s/it]Your max_length is set to 130, but your input_length is only 111. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=55)
Summarizing:  86%|████████▌ | 4306/5000 [1:09:28<10:35,  1.09it/s]Your max_length is set to 130, but your input_length is only 116. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
Summarizing:  89%|████████▉ | 4441/5000 [1:11:42<10:37,  1.14s/it]Your max_length is set to 130, but your input_length is only 73. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=36)
Summarizing:  90%|█████████ | 4500/5000 [1:12:38<08:1

Checkpoint saved at 4500 records


Summarizing:  91%|█████████▏| 4573/5000 [1:13:47<05:27,  1.30it/s]Your max_length is set to 130, but your input_length is only 79. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=39)
Summarizing: 100%|██████████| 5000/5000 [1:20:33<00:00,  1.03it/s]

Checkpoint saved at 5000 records





✅ Final file saved: train_summarized_2000.csv


In [4]:
df = df.head(1000).reset_index(drop=True)

In [5]:
# STEP 5: Geo-extraction + Geocoding (optimized)
import spacy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from tqdm import tqdm
from time import sleep
import random

# Load SpaCy
nlp = spacy.load("en_core_web_sm")

# Setup Nominatim
geolocator = Nominatim(user_agent="geo_news", timeout=10)
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# Cache to avoid repeated lookups
geo_cache = {}

# Retry wrapper with exponential backoff
def safe_geocode(loc):
    if loc in geo_cache:
        return geo_cache[loc]
    for attempt in range(3):
        try:
            g = geocode(loc)
            geo_cache[loc] = g
            return g
        except Exception as e:
            print(f" Geocode failed for {loc}, attempt {attempt+1}: {e}")
            sleep(2 ** attempt + random.random())
    geo_cache[loc] = None
    return None

# Storage lists
geo_mentions = []
geo_coords = []
primary_lat = []
primary_lon = []

for i, text in enumerate(tqdm(df["article"].astype(str).tolist(), desc="Geoparsing")):
    doc = nlp(text)
    locs = [ent.text for ent in doc.ents if ent.label_ in ("GPE","LOC")]
    locs = list(dict.fromkeys(locs))  # deduplicate

    coords = []
    if locs:  # only geocode first location
        first_loc = locs[0]
        g = safe_geocode(first_loc)
        if g:
            coords = [[g.latitude, g.longitude]]

    geo_mentions.append(locs)
    geo_coords.append(coords)
    if coords:
        primary_lat.append(coords[0][0])
        primary_lon.append(coords[0][1])
    else:
        primary_lat.append(None)
        primary_lon.append(None)

    #  Checkpoint save
    if (i + 1) % 500 == 0:
        checkpoint_df = df.iloc[:i+1].copy()
        checkpoint_df["geo_mentions"] = geo_mentions
        checkpoint_df["geo_coordinates"] = geo_coords
        checkpoint_df["primary_lat"] = primary_lat
        checkpoint_df["primary_lon"] = primary_lon
        checkpoint_df.to_csv(f"/content/drive/MyDrive/final_output_checkpoint_{i+1}.csv", index=False)
        print(f" Checkpoint saved at {i+1} records")

# Final save
final_df = df.copy()
final_df["geo_mentions"] = geo_mentions
final_df["geo_coordinates"] = geo_coords
final_df["primary_lat"] = primary_lat
final_df["primary_lon"] = primary_lon
final_df.to_csv("/content/drive/MyDrive/final_output.csv", index=False)
print(" Final output saved at /content/drive/MyDrive/final_output.csv")

Geoparsing:  50%|█████     | 500/1000 [05:36<06:43,  1.24it/s]

✅ Checkpoint saved at 500 records


Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/geopy/geocoders/base.py", line 368, in _call_geocoder
    result = self.adapter.get_json(url, timeout=timeout, headers=req_headers)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/geopy/adapters.py", line 472, in get_json
    resp = self._request(url, timeout=timeout, headers=headers)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/geopy/adapters.py", line 500, in _request
    raise AdapterHTTPError(
geopy.adapters.AdapterHTTPError: Non-successful status code 503

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/geopy/extra/rate_limiter.py", line 136, in _retries_gen
    yield i  # Run the function.
    ^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/geop

✅ Checkpoint saved at 1000 records
🎉 Final output saved at /content/drive/MyDrive/final_output.csv





In [6]:
final_df.head(5)
final_df[["article", "geo_mentions", "geo_coordinates", "primary_lat", "primary_lon"]].sample(10)


Unnamed: 0,article,geo_mentions,geo_coordinates,primary_lat,primary_lon
917,(CNN) -- An Egyptian court Monday ordered a ba...,"[Washington, Egypt, U.S., New York, Cairo]","[[38.8950368, -77.0365427]]",38.895037,-77.036543
858,By . Daily Mail Reporter . David Cameron has i...,"[Woolwich, London, UK, Rigby, Birmingham, Sout...","[[51.4826696, 0.0623335]]",51.48267,0.062334
270,"(CNN) -- These days, no fashion house portfoli...","[Miami, Lagerfeld, Macau, Milan]","[[25.7741728, -80.19362]]",25.774173,-80.19362
572,"By . Becky Evans . PUBLISHED: . 11:08 EST, 5 D...","[Edinburgh, Mortonhall]","[[55.9533456, -3.1883749]]",55.953346,-3.188375
984,Explorers have recovered Apollo 11 engines fro...,"[the Atlantic Ocean, Florida, Earth, the Pacif...","[[39.2796505, -74.567832]]",39.279651,-74.567832
120,(CNN) -- Tour de France race director Christia...,"[Porto-Vecchio, Calvi, Ajaccio, Liege]","[[41.5911382, 9.2794469]]",41.591138,9.279447
504,(CNN) -- Actor Michael Jai White thinks the ne...,"[Hollywood, Washington]","[[34.0980031, -118.329523]]",34.098003,-118.329523
731,It smashed all book-selling records at the sta...,[],[],,
396,"By . Gerri Peev . PUBLISHED: . 17:47 EST, 24 N...","[England, Scotland, London, West Yorkshire, Du...","[[52.5310214, -1.2649062]]",52.531021,-1.264906
401,A teenager has been suspended from school afte...,[],[],,


In [None]:
!pip install rouge-score bert-score

In [24]:
import pandas as pd
geo_df = pd.read_csv("/content/drive/MyDrive/final_output.csv")
summ_df = pd.read_csv("/content/drive/MyDrive/train_summarized_2000.csv")
geo_df = geo_df.head(1000)
summ_df = summ_df.head(1000)
merged_df = pd.merge(geo_df, summ_df, on="article", how="inner")
print(merged_df.columns)
print(merged_df.head(2))
merged_df.to_csv("/content/drive/MyDrive/final_merged_1000.csv", index=False)
print("Merged CSV with first 1000 records saved!")


Index(['id_x', 'article', 'highlights_x', 'geo_mentions', 'geo_coordinates',
       'primary_lat', 'primary_lon', 'id_y', 'highlights_y', 'auto_summary'],
      dtype='object')
                                       id_x  \
0  0001d1afc246a7964130f43ae940af6bc6c57f01   
1  0002095e55fcbd3a2f366d9bf92a95433dc305ef   

                                             article  \
0  By . Associated Press . PUBLISHED: . 14:11 EST...   
1  (CNN) -- Ralph Mata was an internal affairs li...   

                                        highlights_x  \
0  Bishop John Folda, of North Dakota, is taking ...   
1  Criminal complaint: Cop used his role to help ...   

                                        geo_mentions  \
0  ['North Dakota', 'Fargo', 'Grand Forks', 'Italy']   
1  ['U.S.', 'New Jersey', 'Miami', 'the Dominican...   

               geo_coordinates  primary_lat  primary_lon  \
0  [[47.6201461, -100.540737]]    47.620146  -100.540737   
1  [[39.7837304, -100.445882]]    39.783730  -100.4458

In [27]:
# Drop unnecessary columns
merged_df = merged_df.drop(columns=["id_y", "highlights_y"])
merged_df = merged_df.rename(columns={
    "id_x": "id",
    "highlights_x": "highlights"
})
print(merged_df.columns)
print(merged_df.head(2))

merged_df.to_csv("/content/drive/MyDrive/Geo-Tagged_QA_Pipeline.csv", index=False)
print("CSV saved without id_y and highlights_y")


Index(['id', 'article', 'highlights', 'geo_mentions', 'geo_coordinates',
       'primary_lat', 'primary_lon', 'auto_summary'],
      dtype='object')
                                         id  \
0  0001d1afc246a7964130f43ae940af6bc6c57f01   
1  0002095e55fcbd3a2f366d9bf92a95433dc305ef   

                                             article  \
0  By . Associated Press . PUBLISHED: . 14:11 EST...   
1  (CNN) -- Ralph Mata was an internal affairs li...   

                                          highlights  \
0  Bishop John Folda, of North Dakota, is taking ...   
1  Criminal complaint: Cop used his role to help ...   

                                        geo_mentions  \
0  ['North Dakota', 'Fargo', 'Grand Forks', 'Italy']   
1  ['U.S.', 'New Jersey', 'Miami', 'the Dominican...   

               geo_coordinates  primary_lat  primary_lon  \
0  [[47.6201461, -100.540737]]    47.620146  -100.540737   
1  [[39.7837304, -100.445882]]    39.783730  -100.445882   

                     

In [29]:
from rouge_score import rouge_scorer
from bert_score import score
from tqdm import tqdm

# Columns
reference_col = 'highlights'
generated_col = 'auto_summary'

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

rouge1_list = []
rouge2_list = []
rougeL_list = []

for ref, pred in tqdm(zip(merged_df[reference_col], merged_df[generated_col]), total=len(merged_df), desc="ROUGE Evaluation"):
    ref, pred = str(ref), str(pred)
    scores = scorer.score(ref, pred)
    rouge1_list.append(scores['rouge1'].fmeasure)
    rouge2_list.append(scores['rouge2'].fmeasure)
    rougeL_list.append(scores['rougeL'].fmeasure)

# Add to DataFrame
merged_df['rouge1'] = rouge1_list
merged_df['rouge2'] = rouge2_list
merged_df['rougeL'] = rougeL_list

P, R, F1 = score(
    cands=list(merged_df[generated_col]),
    refs=list(merged_df[reference_col]),
    lang='en',
    model_type='distilbert-base-uncased',
    batch_size=64
)

merged_df['bert_f1'] = F1.tolist()

merged_df.to_csv("/content/drive/MyDrive/Output_With_metrics.csv", index=False)
print(" CSV saved with ROUGE & BERTScore metrics")


ROUGE Evaluation: 100%|██████████| 1000/1000 [00:07<00:00, 127.88it/s]


 CSV saved with ROUGE & BERTScore metrics


In [30]:
# Compute average scores
avg_rouge1 = merged_df['rouge1'].mean()
avg_rouge2 = merged_df['rouge2'].mean()
avg_rougeL = merged_df['rougeL'].mean()
avg_bert   = merged_df['bert_f1'].mean()

# Print nicely
print("Average Evaluation Scores (1000 samples)")
print(f"ROUGE-1 : {avg_rouge1:.4f}")
print(f"ROUGE-2 : {avg_rouge2:.4f}")
print(f"ROUGE-L : {avg_rougeL:.4f}")
print(f"BERTScore F1 : {avg_bert:.4f}")


Average Evaluation Scores (1000 samples)
ROUGE-1 : 0.4289
ROUGE-2 : 0.2087
ROUGE-L : 0.3015
BERTScore F1 : 0.8247
