In [None]:
import re
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import json
import time
from urllib.parse import quote, urlparse
import pandas as pd
from random import choice, randint, sample
import concurrent.futures as cf
import os

In [None]:
# Number of worker threads handed to the thread pools used in this notebook.
MAX_WORKERS = 10
# Number of rows scraped (and checkpointed to disk) per batch in process_batches.
BATCH_SIZE = 100

# Pool of browser User-Agent strings; one is picked at random for each request / browser session.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
    "Mozilla/5.0 (Linux; Android 10; SM-G975F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/117.0",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/117.0",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (Linux; Android 11; Pixel 5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 9; SM-G960F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.93 Mobile Safari/537.36"
]

PROXIES = []
# A trial version of proxyscrape was used, providing 100 proxies saved in the text file below.
# Blank lines (e.g. a trailing newline at the end of the file) are skipped so that an
# empty string can never be picked as a proxy address.
with open('proxyscrape_premium_http_proxies.txt', 'r') as f:
    PROXIES = [line.strip() for line in f if line.strip()]

# print(PROXIES)

In [None]:
def get_decoding_params(gn_art_id, proxy_list, max_retries=5):
    """Fetch the signature and timestamp attached to a Google News article page.

    Requests https://news.google.com/articles/<gn_art_id> through a randomly
    chosen proxy and User-Agent, then reads the ``data-n-a-sg`` and
    ``data-n-a-ts`` attributes of the first ``c-wiz > div`` element.

    Args:
        gn_art_id: Article id, used as the last path segment of the URL.
        proxy_list: Non-empty sequence of proxy addresses; one is drawn at
            random for every attempt (an empty sequence makes ``choice`` raise
            ``IndexError``).
        max_retries: Maximum number of attempts, including those answered 429.

    Returns:
        dict with keys ``"signature"``, ``"timestamp"`` and ``"gn_art_id"``.
        The first two are None when the element lacks the attribute.

    Raises:
        Exception: when no attempt succeeded.
    """
    url = f"https://news.google.com/articles/{gn_art_id}"

    # Create a new session for each call, so worker threads never share one
    with requests.Session() as session:
        for attempt in range(max_retries):
            headers = {"User-Agent": choice(USER_AGENTS)}
            proxy_addr = choice(proxy_list)
            proxies = {"http": proxy_addr, "https": proxy_addr}

            try:
                response = session.get(url, headers=headers, proxies=proxies, timeout=10)
                if response.status_code == 429:
                    # Retry-After may be absent, or an HTTP-date instead of a
                    # number of seconds; fall back to a random back-off then.
                    try:
                        retry_after = max(int(response.headers.get("Retry-After", "")), 0)
                    except (TypeError, ValueError):
                        retry_after = randint(30, 60)
                    time.sleep(retry_after)
                    continue
                response.raise_for_status()

                soup = BeautifulSoup(response.text, "lxml")
                div = soup.select_one("c-wiz > div")
                if div is None:
                    # Raised inside the try so it is logged and retried like any other failure
                    raise ValueError("no 'c-wiz > div' element found in the article page")
                return {
                    "signature": div.get("data-n-a-sg"),
                    "timestamp": div.get("data-n-a-ts"),
                    "gn_art_id": gn_art_id,
                }

            except Exception as e:
                print(f"Attempt {attempt+1} failed: {e}")
                # No point in waiting when there is no further attempt
                if attempt + 1 < max_retries:
                    time.sleep(randint(2, 5))

        raise Exception(f"Failed after {max_retries} retries")


def decode_urls(art, proxies=None, timeout=30):
    """Resolve one Google News article to the value returned by the batchexecute endpoint.

    Args:
        art: dict with the keys ``"gn_art_id"``, ``"timestamp"`` and
            ``"signature"``. The timestamp is inserted into the request
            unquoted, so it must not be None.
        proxies: Optional proxies mapping forwarded to ``requests.post``.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error, so a dead proxy cannot block the caller forever.

    Returns:
        The second element of the first decodable list payload in the
        response (the notebook stores it as the article's final URL), or None
        when no entry of the response carries such a payload.

    Raises:
        requests.RequestException: on connection problems, timeouts or a
            non-2xx status.
        IndexError / ValueError: when the response body does not have the
            expected "prefix, blank line, JSON" layout.
    """
    articles_reqs = [
        [
            "Fbv4je",
            f'["garturlreq",[["X","X",["X","X"],null,null,1,1,"US:en",null,1,null,null,null,null,null,0,1],"X","X",1,[1,1,1],1,1,null,0,0,null,0],"{art["gn_art_id"]}",{art["timestamp"]},"{art["signature"]}"]',
        ]
    ]

    payload = f"f.req={quote(json.dumps([articles_reqs]))}"
    headers = {"content-type": "application/x-www-form-urlencoded;charset=UTF-8"}
    response = requests.post(
        url="https://news.google.com/_/DotsSplashUi/data/batchexecute",
        headers=headers,
        data=payload,
        proxies=proxies,
        timeout=timeout,
    )

    response.raise_for_status()

    # The JSON document is the part of the body after the first blank line
    loaded = json.loads(response.text.split("\n\n")[1])
    for res in loaded:
        carries_payload = (
            isinstance(res, list)
            and len(res) > 2
            and isinstance(res[2], str)
            and res[2].startswith("[")
        )
        if not carries_payload:
            continue

        try:
            decoded = json.loads(res[2])
        except ValueError as e:
            print(f"Error decoding res[2]: {e}")
            continue

        if isinstance(decoded, list) and len(decoded) > 1:
            return decoded[1]

    # Reached only after every entry has been inspected
    return None

In [None]:
# df = pd.read_csv("articles.csv")
# Load the articles table from CSV.
df = pd.read_csv("articles (3).csv")

# df = df[:200]
# Show the number of rows per value of the 'source' column.
df['source'].value_counts()

source
Times of India        834
The Indian Express    612
The Hindu             489
India Today           466
Hindustan Times       337
The Economic Times    273
NDTV                  258
ThePrint               99
Tribune India          78
Moneycontrol           72
Firstpost              71
Scroll.in              63
OpIndia                59
Jagran Josh            51
News18                 46
Times Now              45
DD News                33
Name: count, dtype: int64

In [None]:
# Extract the Google News RSS link from the summary column: the first href="..." value.
_HREF_PATTERN = re.compile(r'href="([^"]+)"')


def _first_href(summary):
    """Return the first href="..." value found in `summary`, or None when there is none."""
    # A missing cell is read by pandas as NaN (a float), which re.search would reject
    if not isinstance(summary, str):
        return None
    match = _HREF_PATTERN.search(summary)
    return match.group(1) if match else None


df['news_url'] = df['summary'].apply(_first_href)

# Reassign instead of mutating in place
df = df.drop(columns=["summary", "Unnamed: 0"])

encoded_urls = df['news_url'].tolist()

In [None]:
def fetch_single_url(url, proxy_list):
    """Fetch the decoding parameters for a single Google News article URL.

    The article id is the last "/"-separated segment of the URL's path; it is
    handed to get_decoding_params together with `proxy_list`.
    """
    path_segments = urlparse(url).path.split("/")
    article_id = path_segments[-1]
    return get_decoding_params(article_id, proxy_list)


def process_in_parallel(urls, proxy_list, max_workers=10):
    """Run fetch_single_url over `urls` on a thread pool, keeping the input order.

    Up to `max_workers` URLs are processed at the same time. The returned list
    has one slot per input URL, in the same order; a slot stays None when the
    fetch for that URL raised (the error is printed, not re-raised).
    """
    results = [None] * len(urls)

    with cf.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Remember which input position every submitted job belongs to
        index_of = {}
        for position, url in enumerate(urls):
            job = executor.submit(fetch_single_url, url, proxy_list)
            index_of[job] = position

        # Jobs finish in arbitrary order; the mapping puts each result in its slot
        for finished in cf.as_completed(index_of):
            idx = index_of[finished]
            try:
                outcome = finished.result()
            except Exception as e:
                print(f"Error processing URL at index {idx}: {e}")
            else:
                results[idx] = outcome

    return results

In [None]:
# get_decoding_params(urlparse(encoded_urls[0]).path.split("/")[-1], PROXIES)

In [None]:
# 2. Decode the Google News URLs to get the final article URLs
# articles_params = [get_decoding_params(urlparse(url).path.split("/")[-1]) for url in encoded_urls]

# Fetch the decoding parameters for every URL on a thread pool. The list is aligned
# with encoded_urls; process_in_parallel leaves None where a fetch failed.
articles_params = process_in_parallel(encoded_urls, PROXIES, max_workers=MAX_WORKERS)


Attempt 1 failed: HTTPSConnectionPool(host='news.google.com', port=443): Max retries exceeded with url: /articles/CBMijAJBVV95cUxQdkNRb0dpSkZveUpmX1QyR29OTHRwbkJWNWNQcWNTdTVoS1djWV9wU09YZEk2T19HYnZTWFVfRzJld0tjS0ZLb1dFZlBEME4ybmtaNEp6TXdWOHZMSldpdmJBak9tV3I5aG93Z0Y2WXpIV1ctdmZoY2djeWM1YzVuT0JlRl9RbG9Ga0hYdnVQdTFRN1VaalpKbnNSOGd2OTZkZkpEWlZoY214ZUlURmZMZjV6UzY3NnRMbmZENUZ1Q2tjNXFKSkJGSnBvOElHcm53OHU4R2RMNGQyVE9CV2tlV284NHN5a1g2aVp6TmJuY1NrXzV0Zkh0Sm9QWjN6bmhtZ3pRRmd2eGs3bW1K0gGSAkFVX3lxTFA2YzlFcUdiUmZvc093SHRjNUlxSjU4TVp2dVQ1Ym93U2ZrejFqTm5acHJrNUlJUkZEOExFRExtR0FqQ2Rhb1NMN3l3YTdEWjZCT1VBUURMT3VQUFRrNUdhbi1sYi10ZkZrMGcwQnV4VHVXSlYySHBSZFFSWmZWNzZTMDFYSlhzY1NJYVBzN2hOWmZxYkxTTEQ5UjMxUnMtY05OR2FTdVA1bkNKd2ZMS3lCYzlkTnFFMXY0SFJQem5BbEZsR0hDZ0QwVld6OV9IMEtiM0luY1pjamVOU0ZJM0ROcGQ2RlpxR1Y3UVNsNkktcGJVZVdHOXVuN2IwWUJQRXNzUHY0SU1CRGE0QUM5NG9qMnc (Caused by ProxyError('Unable to connect to proxy', RemoteDisconnected('Remote end closed connection without response')))


In [None]:
# Decoding the URLs.
# decoded_urls gets exactly one entry per element of articles_params (None when the
# article could not be decoded), so it can later be assigned as a DataFrame column.
decoded_urls = []
for i, art in enumerate(articles_params):

    # process_in_parallel leaves None for articles whose parameter fetch failed
    if art is None:
        decoded_urls.append(None)
        print(f"Skipping article {i}: decoding parameters could not be fetched.")
        continue

    if art["signature"] is None:
        decoded_urls.append(None)
        print(f"Skipping article {i} due to None signature.")
        continue

    proxy = choice(PROXIES)
    proxies = {"http": proxy, "https": proxy}
    try:
        decoded_urls.append(decode_urls(art, proxies=proxies))
    except Exception as e:
        # One bad proxy or response must not discard the URLs decoded so far
        decoded_urls.append(None)
        print(f"Error decoding article {i}: {e}")

print(decoded_urls)

Skipping article 6 due to None signature.
Skipping article 11 due to None signature.
Skipping article 12 due to None signature.
Skipping article 17 due to None signature.
Skipping article 20 due to None signature.
Skipping article 25 due to None signature.
Skipping article 26 due to None signature.
Skipping article 31 due to None signature.
Skipping article 39 due to None signature.
Skipping article 43 due to None signature.
Skipping article 44 due to None signature.
Skipping article 47 due to None signature.
Skipping article 48 due to None signature.
Skipping article 49 due to None signature.
Skipping article 54 due to None signature.
Skipping article 55 due to None signature.
Skipping article 58 due to None signature.
Skipping article 59 due to None signature.
Skipping article 66 due to None signature.
Skipping article 69 due to None signature.
Skipping article 71 due to None signature.
Skipping article 72 due to None signature.
Skipping article 77 due to None signature.
Skipping art

In [None]:
# with open("decoded_urls.txt", "w", encoding="utf-8") as f:
#     for url in decoded_urls:
#         if (url is None) :
#             f.write("None\n")
#             continue
#         f.write(url + "\n")

In [None]:
# Undecoded articles (None) become empty strings so they can be filtered out by value later.
decoded_urls = ["" if url is None else url for url in decoded_urls]

df["final_url"] = decoded_urls


In [None]:
# Display the frame, now including the final_url column.
df

Unnamed: 0,title,published,source,news_url,final_url
0,India Pakistan Live Updates: 23 minutes were e...,"Fri, 16 May 2025 07:30:21 GMT",The Economic Times,https://news.google.com/rss/articles/CBMi8AJBV...,https://m.economictimes.com/news/newsblogs/ind...
1,India Pakistan news LIVE: ‘It’s clear who want...,"Thu, 15 May 2025 17:22:23 GMT",Hindustan Times,https://news.google.com/rss/articles/CBMigwJBV...,https://www.hindustantimes.com/india-news/indi...
2,India Pakistan News Highlights: Jammu and Kash...,"Wed, 14 May 2025 00:35:29 GMT",Times of India,https://news.google.com/rss/articles/CBMimAJBV...,https://timesofindia.indiatimes.com/india/indi...
3,"Trump govt takes a step back, encourages direc...","Fri, 16 May 2025 01:42:02 GMT",The Economic Times,https://news.google.com/rss/articles/CBMi8wFBV...,https://m.economictimes.com/news/india/trump-g...
4,India Pakistan News Live: J&K locals start ret...,"Fri, 16 May 2025 05:49:38 GMT",Hindustan Times,https://news.google.com/rss/articles/CBMiiAJBV...,https://www.hindustantimes.com/india-news/indi...
...,...,...,...,...,...
3881,"""Every Soldier Takes Oath..."": General Who's F...","Wed, 11 May 2022 07:00:00 GMT",NDTV,https://news.google.com/rss/articles/CBMirAFBV...,https://www.ndtv.com/india-news/general-credit...
3882,Lahore High Court strikes down Pakistan’s ‘sed...,"Thu, 30 Mar 2023 07:00:00 GMT",The Indian Express,https://news.google.com/rss/articles/CBMiuwFBV...,https://indianexpress.com/article/explained/ex...
3883,Supreme Court’s verdict on sedition is a small...,"Fri, 13 May 2022 07:00:00 GMT",The Indian Express,https://news.google.com/rss/articles/CBMioAFBV...,https://indianexpress.com/article/opinion/colu...
3884,The reasons Law Commission gave while recommen...,"Sun, 04 Jun 2023 07:00:00 GMT",The Indian Express,https://news.google.com/rss/articles/CBMipwFBV...,


In [None]:
# Keep only the rows whose final_url is not the empty-string placeholder.
df = df[df["final_url"] != ""].reset_index(drop=True)
# Remove everything from the last hyphen to the end of the title.
# NOTE(review): presumably this strips a trailing " - <publisher>" suffix; the pattern
# does not require spaces around the hyphen, so a title without such a suffix would
# lose its last hyphenated part - confirm against the data.
df['title'] = df['title'].str.replace(r'\s*-\s*[^-]+$', '', regex=True)

if 'content' not in df.columns:
    df['content'] = ""

# Last expression of the cell, so the per-source counts are actually displayed
df['source'].value_counts()


source
Times of India        586
The Indian Express    427
India Today           340
The Hindu             328
Hindustan Times       222
The Economic Times    199
NDTV                  175
ThePrint               75
Tribune India          49
Firstpost              46
Moneycontrol           46
Scroll.in              42
OpIndia                42
Jagran Josh            39
Times Now              35
News18                 29
DD News                25
Name: count, dtype: int64

In [None]:
# Save the frame to df.csv without the index; later cells read it back from this file.
df.to_csv("df.csv", index=False)

In [None]:
def get_details(url, publisher, old_title):
    """Load an article page in headless Chrome and extract its title and body text.

    The body is taken from the page's ``application/ld+json`` script tags: the
    first non-empty ``"articleBody"`` string found anywhere inside them,
    including inside lists or nested objects such as ``"@graph"``.

    Args:
        url: Address of the article page.
        publisher: Not used by this function; kept so existing callers keep working.
        old_title: Title to fall back to when the page has no usable <title>.

    Returns:
        ``(title, content)``. ``content`` is "" when no article body was found.
        On any error, including a failure to start the browser, the error is
        printed and ``(old_title, "")`` is returned.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--no-sandbox")

    chrome_options.add_argument(f'user-agent={choice(USER_AGENTS)}')
    chrome_options.add_argument(f'--proxy-server=http://{choice(PROXIES)}')

    session = None
    try:
        # Started inside the try so a start-up failure follows the same
        # "print and fall back" path as a page-load failure
        session = webdriver.Chrome(options=chrome_options)
        session.get(url)
        time.sleep(3)  # Time for JS to load

        soup = BeautifulSoup(session.page_source, "html.parser")

        # Extract title
        page_title = soup.title.string.strip() if soup.title and soup.title.string else ""

        # Extract article body
        article_body = ""
        for tag in soup.find_all("script", {"type": "application/ld+json"}):
            raw_json = tag.string
            if not raw_json:
                continue
            try:
                data = json.loads(raw_json)
            except ValueError:
                continue    # Some script tags may not be valid JSON
            article_body = _find_article_body(data)
            if article_body:
                break

        final_title = page_title if page_title else old_title
        return final_title, article_body

    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return old_title, ""
    finally:
        if session is not None:
            session.quit()


def _find_article_body(node):
    """Return the first non-blank "articleBody" string inside parsed JSON-LD, or ""."""
    if isinstance(node, dict):
        body = node.get("articleBody")
        if isinstance(body, str) and body.strip():
            return body
        children = node.values()
    elif isinstance(node, list):
        children = node
    else:
        return ""

    for child in children:
        found = _find_article_body(child)
        if found:
            return found
    return ""

In [None]:
# Reload the saved frame from disk.
df = pd.read_csv("df.csv")

# Make sure there is a column to receive the scraped article text.
if 'content' not in df:
    df['content'] = ""


In [None]:
# get_details("https://www.indiatoday.in/india-today-insight/story/why-billionaire-george-soros-is-being-linked-to-sonia-rahul-and-modis-economic-advisor-2648700-2024-12-12", "Why billionaire George Soros is being linked to Sonia-Rahul and Modi's economic advisor", "India Today")
# get_details("https://indianexpress.com/article/political-pulse/populist-schemes-elections-pm-modi-congress-9653318/", "PM-Modi-led govt ended policy paralysis of UPA regime: Amit Shah", "The Indian Express")

In [None]:
def process_row(idx_row):
    """Scrape one article and tag the result with the row's position.

    `idx_row` is an ``(index, row)`` pair whose row exposes the attributes
    ``final_url``, ``source`` and ``title``. Returns ``(index, title, content)``.
    """
    position, record = idx_row
    title, content = get_details(record.final_url, record.source, record.title)
    return position, title, content


def save_progress(last_index):
    """Write `last_index` to progress.txt, replacing whatever the file held before."""
    with open("progress.txt", "w") as progress_file:
        print(last_index, end="", file=progress_file)

def load_progress():
    """Return the row index stored in progress.txt, or 0 when nothing usable is stored.

    0 is returned when the file does not exist, is empty, or does not contain
    an integer (for example after an interrupted write). Negative values are
    clamped to 0.
    """
    if not os.path.exists("progress.txt"):
        return 0

    with open("progress.txt", "r") as f:
        raw = f.read().strip()

    try:
        return max(int(raw), 0)
    except ValueError:
        print(f"Ignoring unreadable progress.txt contents {raw!r}; starting from 0")
        return 0

def process_batches(df, batch_size, max_workers, start_index=0):
    """Scrape the articles of `df` batch by batch, checkpointing after every batch.

    For each batch, process_row is run for every row on a thread pool. The
    scraped title replaces the row's title when it is a non-blank string, the
    scraped content is stored in the 'content' column, the batch is written to
    test_df.csv and the end position is recorded with save_progress.

    Args:
        df: Frame with at least the columns 'title' and 'content', plus the
            columns process_row reads.
        batch_size: Number of rows per batch.
        max_workers: Thread-pool size used within a batch.
        start_index: Positional row index to start from (used to resume).

    A batch starting at row 0 overwrites test_df.csv; later batches are
    appended. The header is written whenever the file is created, so resuming
    after the file was removed no longer yields a header-less CSV.
    """
    total_rows = len(df)
    for start in range(start_index, total_rows, batch_size):
        end = min(start + batch_size, total_rows)
        # Object dtype lets string results be stored even when a column was
        # read back from CSV as all-NaN floats
        batch_df = df.iloc[start:end].copy().astype({'title': object, 'content': object})
        title_col = batch_df.columns.get_loc('title')
        content_col = batch_df.columns.get_loc('content')

        # Default outcome for a row whose worker raised: keep the title, empty content
        results = [(None, "")] * len(batch_df)

        #for parallel processing (concurrent connections) to fetch article data in less time
        with cf.ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {executor.submit(process_row, (i, row)): i for i, row in enumerate(batch_df.itertuples())}
            for future in cf.as_completed(futures):
                i = futures[future]
                try:
                    _, new_title, new_content = future.result()
                except Exception as e:
                    # One failing row must not discard the rest of the batch
                    print(f"Error processing row {start + i}: {e}")
                    continue
                results[i] = (new_title, new_content)

        for i, (new_title, new_content) in enumerate(results):
            # The title can be NaN (a float) when it was missing in the CSV
            if isinstance(new_title, str) and new_title.strip():
                batch_df.iat[i, title_col] = new_title
            batch_df.iat[i, content_col] = new_content

        # Saved each batch to CSV after processing
        first_batch = start == 0
        write_header = first_batch or not os.path.exists("test_df.csv")
        batch_df.to_csv(
            "test_df.csv",
            index=False,
            mode='w' if first_batch else 'a',
            header=write_header,
        )

        save_progress(end)
        print(f"Processed and saved rows {start} to {end - 1}")

    print("All batches processed.")



In [None]:
# Read the saved position; load_progress returns 0 when progress.txt does not exist.
last_index = load_progress()
if os.path.exists("progress.txt"):
    print(f"Resuming from index: {last_index}")

# Scrape the remaining rows in batches, starting at the saved position.
process_batches(df, BATCH_SIZE, MAX_WORKERS, start_index=last_index)

Resuming from index: 2400
Processed and saved rows 2400 to 2499
Processed and saved rows 2500 to 2599
Processed and saved rows 2600 to 2699
Processed and saved rows 2700 to 2704
All batches processed.


In [None]:
# df = df[:200]   #testing with 200 rows


# # Use ThreadPoolExecutor for parallel processing
# results = [None] * len(df)
# with cf.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
#     futures = [executor.submit(process_row, (i, row)) for i, row in df.iterrows()]
#     for future in cf.as_completed(futures):
#         idx, new_title, new_content = future.result()
#         results[idx] = (new_title, new_content)

# # Update DataFrame
# for i, (new_title, new_content) in enumerate(results):
#     if new_title.strip():
#         df.at[i, 'title'] = new_title
#     df.at[i, 'content'] = new_content

# # print(df)
# df.to_csv("test_df.csv", index=False)

Error fetching https://timesofindia.indiatimes.com/india/fake-news-alert-government-fact-checks-pakistani-propaganda-on-operation-sindoor/articleshow/120992221.cms: Alert Text: Failed to load website properly since html-load.com is blocked. Please allow html-load.com
Message: unexpected alert open: {Alert text : Failed to load website properly since html-load.com is blocked. Please allow html-load.com}
  (Session info: chrome=136.0.7103.114)
Stacktrace:
	GetHandleVerifier [0x00007FF7DEDBCF65+75717]
	GetHandleVerifier [0x00007FF7DEDBCFC0+75808]
	(No symbol) [0x00007FF7DEB88F9A]
	(No symbol) [0x00007FF7DEC2FF0C]
	(No symbol) [0x00007FF7DEC07153]
	(No symbol) [0x00007FF7DEBD0421]
	(No symbol) [0x00007FF7DEBD11B3]
	GetHandleVerifier [0x00007FF7DF0BD74D+3223469]
	GetHandleVerifier [0x00007FF7DF0B7CF2+3200338]
	GetHandleVerifier [0x00007FF7DF0D5B23+3322755]
	GetHandleVerifier [0x00007FF7DEDD6A3A+180890]
	GetHandleVerifier [0x00007FF7DEDDE13F+211359]
	GetHandleVerifier [0x00007FF7DEDC52B4+109

In [None]:
def remove_publisher_from_title(row):
    """Return the row's title with the publisher name removed.

    `row` must support ``row['title']`` and ``row['source']``. When both are
    non-null and the source occurs in the title, every occurrence is removed
    and leading/trailing spaces, hyphens, pipes and colons are stripped.
    Otherwise the title is returned unchanged.
    """
    headline, outlet = row['title'], row['source']

    if pd.isnull(headline) or pd.isnull(outlet):
        return headline
    if outlet not in headline:
        return headline

    without_outlet = headline.replace(outlet, '')
    return without_outlet.strip(' -|:')


In [None]:
# Reload the scraped batches, strip the publisher name from every title, and write
# the result back to the same file (test_df.csv is overwritten in place).
df = pd.read_csv("test_df.csv")
df['title'] = df.apply(remove_publisher_from_title, axis=1)
df.to_csv("test_df.csv", index=False)

In [None]:
df = pd.read_csv('df.csv')
test_df = pd.read_csv('test_df.csv')

# The two frames are matched row by row on their default integer index, so they
# must describe the same rows in the same order.
if len(df) != len(test_df):
    raise ValueError(
        f"df.csv has {len(df)} rows but test_df.csv has {len(test_df)}; "
        "the files are out of sync and titles cannot be matched row by row"
    )

# Processing titles: fall back to the title from df.csv whenever the scraped one is unusable.

# Scraped title is missing (an empty title is read back from CSV as NaN)
mask = test_df['title'].isna()
test_df.loc[mask, 'title'] = df.loc[mask, 'title']

# na=False keeps the mask strictly boolean; NaN entries would make .loc raise
mask = test_df['title'].str.startswith('www.', na=False)
test_df.loc[mask, 'title'] = df.loc[mask, 'title']


mask = test_df['title'].str.endswith('.com', na=False)
test_df.loc[mask, 'title'] = df.loc[mask, 'title']


# Prefer the df.csv title when it is longer than the scraped one
mask = df['title'].str.len() > test_df['title'].str.len()
test_df.loc[mask, 'title'] = df.loc[mask, 'title']

test_df.to_csv('df_1.csv', index=False)

In [None]:
# Keep only the rows that have article text: drop missing content first, then
# empty strings, and renumber the index.
df = (
    pd.read_csv("df_1.csv")
    .dropna(subset=['content'])
    .loc[lambda frame: frame['content'] != ""]
    .reset_index(drop=True)
)

print(len(df))
df["source"].value_counts()

1576


source
Times of India        446
The Indian Express    319
India Today           239
Hindustan Times       165
NDTV                  129
The Economic Times    101
ThePrint               55
Tribune India          39
Firstpost              35
Times Now              25
News18                 22
Moneycontrol            1
Name: count, dtype: int64

In [None]:
# Write the rows that have article content to the final CSV, without the index.
df.to_csv("final_df.csv", index=False)