In [6]:
from datasets import load_dataset

# Load a small slice first to test
ds = load_dataset("wikimedia/wikipedia", "20231101.en", split="train[:1%]")

print(ds[0])


  from .autonotebook import tqdm as notebook_tqdm


{'id': '12', 'url': 'https://en.wikipedia.org/wiki/Anarchism', 'title': 'Anarchism', 'text': 'Anarchism is a political philosophy and movement that is skeptical of all justifications for authority and seeks to abolish the institutions it claims maintain unnecessary coercion and hierarchy, typically including nation-states, and capitalism. Anarchism advocates for the replacement of the state with stateless societies and voluntary free associations. As a historically left-wing movement, this reading of anarchism is placed on the farthest left of the political spectrum, usually described as the libertarian wing of the socialist movement (libertarian socialism).\n\nHumans have lived in societies without formal hierarchies long before the establishment of states, realms, or empires. With the rise of organised hierarchical bodies, scepticism toward authority also rose. Although traces of anarchist ideas are found all throughout history, modern anarchism emerged from the Enlightenment. During

In [7]:
import json

fever_unified = []

with open("train.jsonl", "r") as f:
    for line in f:
        item = json.loads(line)
        if item['label'] != "SUPPORTS":
            continue  # skip REFUTES / NOT ENOUGH INFO

        for eg_idx, evidence_group in enumerate(item['evidence']):
            for ev_idx, ev in enumerate(evidence_group):
                article_title = ev[2] if ev[2] is not None else "NA"
                passage_id = f"fever_{item['id']}_{eg_idx}_{ev_idx}"
                entry = {
                    "id": passage_id,
                    "url": f"https://en.wikipedia.org/wiki/{article_title}" if article_title != "NA" else "NA",
                    "title": article_title,
                    "text": item['claim']  # simplest: use claim as text; later can replace with actual evidence text if available
                }
                fever_unified.append(entry)

print(f"Total SUPPORTS entries: {len(fever_unified)}")


Total SUPPORTS entries: 193756


In [8]:
fever_unified

[{'id': 'fever_75397_0_0',
  'url': 'https://en.wikipedia.org/wiki/Nikolaj_Coster-Waldau',
  'title': 'Nikolaj_Coster-Waldau',
  'text': 'Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.'},
 {'id': 'fever_75397_0_1',
  'url': 'https://en.wikipedia.org/wiki/Fox_Broadcasting_Company',
  'title': 'Fox_Broadcasting_Company',
  'text': 'Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.'},
 {'id': 'fever_150448_0_0',
  'url': 'https://en.wikipedia.org/wiki/Roman_Atwood',
  'title': 'Roman_Atwood',
  'text': 'Roman Atwood is a content creator.'},
 {'id': 'fever_150448_1_0',
  'url': 'https://en.wikipedia.org/wiki/Roman_Atwood',
  'title': 'Roman_Atwood',
  'text': 'Roman Atwood is a content creator.'},
 {'id': 'fever_214861_0_0',
  'url': 'https://en.wikipedia.org/wiki/History_of_art',
  'title': 'History_of_art',
  'text': 'History of art includes architecture, dance, sculpture, music, painting, poetry literature, theatre, narrative, film, photography and graphic

In [9]:
import pandas as pd
fev_ds = pd.DataFrame(fever_unified)

In [10]:
ds_wiki = ds.to_pandas()

In [11]:
ds_wiki.head()

Unnamed: 0,id,url,title,text
0,12,https://en.wikipedia.org/wiki/Anarchism,Anarchism,Anarchism is a political philosophy and moveme...
1,39,https://en.wikipedia.org/wiki/Albedo,Albedo,Albedo (; ) is the fraction of sunlight that i...
2,290,https://en.wikipedia.org/wiki/A,A,"A, or a, is the first letter and the first vow..."
3,303,https://en.wikipedia.org/wiki/Alabama,Alabama,Alabama () is a state in the Southeastern regi...
4,305,https://en.wikipedia.org/wiki/Achilles,Achilles,"In Greek mythology, Achilles ( ) or Achilleus ..."


In [12]:
df_ret = pd.concat([ds_wiki, fev_ds])

In [13]:
df_ret

Unnamed: 0,id,url,title,text
0,12,https://en.wikipedia.org/wiki/Anarchism,Anarchism,Anarchism is a political philosophy and moveme...
1,39,https://en.wikipedia.org/wiki/Albedo,Albedo,Albedo (; ) is the fraction of sunlight that i...
2,290,https://en.wikipedia.org/wiki/A,A,"A, or a, is the first letter and the first vow..."
3,303,https://en.wikipedia.org/wiki/Alabama,Alabama,Alabama () is a state in the Southeastern regi...
4,305,https://en.wikipedia.org/wiki/Achilles,Achilles,"In Greek mythology, Achilles ( ) or Achilleus ..."
...,...,...,...,...
193751,fever_13114_0_0,https://en.wikipedia.org/wiki/Gimli_-LRB-Middl...,Gimli_-LRB-Middle-earth-RRB-,J. R. R. Tolkien created Gimli.
193752,fever_13114_1_0,https://en.wikipedia.org/wiki/Gimli_-LRB-Middl...,Gimli_-LRB-Middle-earth-RRB-,J. R. R. Tolkien created Gimli.
193753,fever_152180_0_0,https://en.wikipedia.org/wiki/Susan_Sarandon,Susan_Sarandon,Susan Sarandon is an award winner.
193754,fever_152180_1_0,https://en.wikipedia.org/wiki/Susan_Sarandon,Susan_Sarandon,Susan Sarandon is an award winner.


In [14]:
df_ret_texts_list = df_ret['text'].to_list()

In [15]:
type(df_ret_texts_list)

list

In [16]:
from dotenv import load_dotenv
import os
import requests

load_dotenv()
API_key = os.getenv("GOOGLE_FACT_CHECK_API")


url = "https://factchecktools.googleapis.com/v1alpha1/claims:search"

params = {
    'key': API_key,
    'query': 'covid vaccine', 
    'languageCode': 'en-US',
    'pageSize': 100
}

response = requests.get(url,params=params)
len(response.json()['claims'])

100

In [17]:
response.json()['claims'][0]
i=0
for claim in response.json()['claims']:
    # print(claim.get('claimReview')[0].get('title',''))
    print(claim
          )
    i+=1
    if i==4:
        break



{'text': 'The chair of the BMA told Dr Malhotra in December 2021 that most of his colleagues got their information on the Covid vaccine from the BBC.', 'claimReview': [{'publisher': {'name': 'Full Fact', 'site': 'fullfact.org'}, 'url': 'https://fullfact.org/health/aseem-malhotra-reform-conference-vaccines/', 'title': 'Covid vaccine claims at the Reform UK conference: fact checked ...', 'reviewDate': '2025-09-08T00:00:00Z', 'textualRating': 'We can’t say how accurate this claim is, as we have no way of directly verifying the exchange in 2021, but the chair of the BMA at the time has denied that he said this.', 'languageCode': 'en'}]}
{'text': 'A leading oncologist thinks it’s highly likely that the Covid vaccines have been a significant factor in the cancer of members of the Royal Family.', 'claimReview': [{'publisher': {'name': 'Full Fact', 'site': 'fullfact.org'}, 'url': 'https://fullfact.org/health/aseem-malhotra-reform-conference-vaccines/', 'title': 'Covid vaccine claims at the Ref

In [51]:
def fetch_google_facts(query,num_iter = 1, pages = 100):
    
    load_dotenv()
    API_key_google = os.getenv("GOOGLE_FACT_CHECK_API")


    url_google = "https://factchecktools.googleapis.com/v1alpha1/claims:search"

    ds=[]
    next_page_token = None
    for _ in range(num_iter):

        params = {
        'key': API_key_google,
        'query': query, 
        'languageCode': 'en-US',
        'pageSize': pages
        }

        if next_page_token:
            params['pageToken'] = next_page_token

        try:
            response = requests.get(url_google,params=params)
            response.raise_for_status()

            for claim in response.json()['claims']:
                date = claim.get('claimReview',[])[0].get('reviewDate','').split('T')[0]
                ds.append(
                    {'title':claim.get('claimReview',[])[0].get('title',''),
                     'text':claim.get('text',''),
                     'url':claim.get('claimReview',[])[0].get('url',''), 
                     'Published_Date':date if date else "No date available",
                     'source':claim.get('claimReview',[])[0].get('publisher',{}).get('name',"No source available")
                     })
                
            next_page_token = response.json().get('nextPageToken')
            if not next_page_token:
                break

        except requests.exceptions.RequestException as e:
            print(f"Request error: {e}")
            break
        except Exception as e:
            print(f"Error processing response: {e}")
            break

    print(f"Google fetched {len(ds)} articles")
    return ds


In [19]:
ds1 = fetch_google_facts('indian government')
ds1,len(ds1)

([{'title': 'Did Govt Declare Sept 3 to 6, 2025, as Public Holidays? Fact Check – DigitEye India',
   'text': 'The Indian Government has declared September 3-6 2025 as public holidays.',
   'url': 'https://digiteye.in/did-govt-declare-sept-3-to-6-2025-as-public-holidays-fact-check/',
   'Published_Date': '2025-09-05',
   'source': 'DigitEye India'},
  {'title': 'Viral ₹1 Lakh Rupee Coin Is Not Issued by the Government of India',
   'text': 'Viral video shows a genuine one lakh rupee coin issued by the Indian government.',
   'url': 'https://factly.in/viral-%E2%82%B91-lakh-rupee-coin-is-not-issued-by-the-government-of-india/',
   'Published_Date': '2025-09-05',
   'source': 'FACTLY'},
  {'title': 'The Indian government has not declared 3, 4, 5, and 6 September 2025 as nationwide public holidays',
   'text': 'The Indian government has declared 3, 4, 5, and 6 September 2025 as nationwide public holidays.',
   'url': 'https://factly.in/the-indian-government-has-not-declared-3-4-5-and-6-sep

In [20]:
ds2 = fetch_google_facts('india')
len(ds2)

100

In [21]:
ds = ds1 + ds2
len(ds)

200

In [22]:
import pandas as pd
df = pd.DataFrame(ds)
df.head()

Unnamed: 0,title,text,url,Published_Date,source
0,"Did Govt Declare Sept 3 to 6, 2025, as Public ...",The Indian Government has declared September 3...,https://digiteye.in/did-govt-declare-sept-3-to...,2025-09-05,DigitEye India
1,Viral ₹1 Lakh Rupee Coin Is Not Issued by the ...,Viral video shows a genuine one lakh rupee coi...,https://factly.in/viral-%E2%82%B91-lakh-rupee-...,2025-09-05,FACTLY
2,"The Indian government has not declared 3, 4, 5...","The Indian government has declared 3, 4, 5, an...",https://factly.in/the-indian-government-has-no...,2025-09-01,FACTLY
3,Hotmail founder Sabeer Bhatia claims Govt is s...,While the cost of a Vande Bharat train is Rs 1...,https://digiteye.in/hotmail-founder-sabeer-bha...,2025-08-20,DigitEye India
4,Fact Check: E27 ethanol blend policy under dev...,The Indian government is planning to produce 2...,https://newsmeter.in/fact-check/e27-ethanol-bl...,2025-07-21,NewsMeter


In [23]:

url_news_api = "https://newsapi.org/v2/everything"

load_dotenv()
api_key_news = os.getenv('NEWS_API')

sort_by  =['relevancy', 'popularity', 'publishedAt']
params = {
    "apiKey":api_key_news,
    "q":'indian actors',
    "sortBY":sort_by[0],
    "pageSize":69

}

res = requests.get(url_news_api,params=params)
res.json().get('articles')

[{'source': {'id': None, 'name': 'The Indian Express'},
  'author': 'Entertainment Desk',
  'title': 'Actor, who worked with Amitabh Bachchan and Sridevi, was not at peace with fame and left the industry at the peak of his career',
  'description': 'Mangal Dhillon was one of the most prominent television actors of his time, known for iconic serials like Buniyaad, Junoon, and Noorjahan. At the peak of his career, he was also among the highest-paid actors on Indian television.',
  'url': 'https://indianexpress.com/article/entertainment/bollywood/actor-mangal-dhillon-was-not-at-peace-with-fame-and-left-the-industry-at-the-peak-of-his-career-10190788/',
  'urlToImage': 'https://images.indianexpress.com/2025/08/mangal-01.png',
  'publishedAt': '2025-08-15T08:53:32Z',
  'content': 'Whenever Mangal Dhillon appeared on screen, the first thing audiences noticed was his commanding baritone voice. A prominent figure in Indian television and film during the 1980s and 1990s, Mangal wo… [+4878 chars

In [24]:
arl=res.json().get('articles')
arl[0]
len(arl)
res.json()

{'status': 'ok',
 'totalResults': 259,
 'articles': [{'source': {'id': None, 'name': 'The Indian Express'},
   'author': 'Entertainment Desk',
   'title': 'Actor, who worked with Amitabh Bachchan and Sridevi, was not at peace with fame and left the industry at the peak of his career',
   'description': 'Mangal Dhillon was one of the most prominent television actors of his time, known for iconic serials like Buniyaad, Junoon, and Noorjahan. At the peak of his career, he was also among the highest-paid actors on Indian television.',
   'url': 'https://indianexpress.com/article/entertainment/bollywood/actor-mangal-dhillon-was-not-at-peace-with-fame-and-left-the-industry-at-the-peak-of-his-career-10190788/',
   'urlToImage': 'https://images.indianexpress.com/2025/08/mangal-01.png',
   'publishedAt': '2025-08-15T08:53:32Z',
   'content': 'Whenever Mangal Dhillon appeared on screen, the first thing audiences noticed was his commanding baritone voice. A prominent figure in Indian television a

In [46]:
def fetch_news_org(query: str, page_size: int = 100, num_iter: int = 12, sort_by_index: int = 0):
    url_news_api = "https://newsapi.org/v2/everything"

    load_dotenv()
    api_key_news = os.getenv("NEWS_API")
    if not api_key_news:
        raise ValueError("NEWS_API environment variable not set")

    sort_by = ["relevancy", "popularity", "publishedAt"]
    if sort_by_index < 0 or sort_by_index >= len(sort_by):
        sort_by_index = 0

    news_ds = []

    for page in range(1, num_iter + 1):
        params = {
            "apiKey": api_key_news,
            "q": query,
            "sortBy": sort_by[sort_by_index], 
            "pageSize": min(page_size, 100),
            "page": page,
        }

        try:
            res = requests.get(url_news_api, params=params, timeout=30)
            res.raise_for_status()

            if res.json().get("status") == "ok":
                print(f"status | {res.json()['status']}")

            data = res.json().get("articles")
            if not data:
                print(f"Could not find any article at page {page}")
                break

            for article in data:
                if not article.get("content"):
                    continue

                news_ds.append({
                    "title": article.get("title", ""),
                    "text": (article.get("content", "") or "") + (article.get("description", "") or ""),
                    "url": article.get("url", ""),
                    "source": article.get("source", {}).get("name", "No source available"),
                    "Published_Date": article.get("publishedAt", ""),
                })

            if len(news_ds) >= res.json().get("totalResults", 0):
                print(f"No More Results | reached {res.json().get('totalResults', 0)} Results")
                break

        except requests.exceptions.RequestException as e:
            print(f"Request error on page(iteration) {page}: {e}")
            break
        except Exception as e:
            print(f"Unexpected error on page(iter) {page}: {e}")
            break

    print(f"Fetched {len(news_ds)} news articles")
    return news_ds


In [26]:
from dotenv import load_dotenv
import os
news_db = fetch_news_org('conjuring',500,100,1)

status | ok
Could not find any article at page 2
Fetched 100 news articles


In [27]:
news_db[0]['text']

'<ul><li></li><li></li><li></li></ul>\r\nShowrunner wants to turn you into a happy little content prompter for the Netflix of AI\r\nFable founder Edward Saatchi aims to gamify Hollywoods pivot to AI one p… [+12829 chars]As one of the cofounders behind Oculus Story Studio, Edward Saatchi knows how hard it can be to sell people on new tech that bills itself as revolutionary. Even though Story Studio snagged an Emmy for one of its three animated features, a general lack of publ…'

In [28]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Referer": "https://www.ndtv.com/topics",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Cache-Control": "max-age=0",
}



url="https://archives.ndtv.com/articles/2025-01.html"

response  = requests.get(url,headers=headers)
response.status_code

403

In [29]:

def get_ndtv_rss_feeds():
   
    rss_feeds = {
        'top_stories': 'https://feeds.feedburner.com/ndtvnews-top-stories',
        'india': 'https://feeds.feedburner.com/ndtvnews-india-news',
        'world': 'https://feeds.feedburner.com/ndtvnews-world-news',
        'sports': 'https://feeds.feedburner.com/ndtvnews-sports',
        'entertainment': 'https://feeds.feedburner.com/ndtvnews-entertainment',
        'business': 'https://feeds.feedburner.com/ndtvnews-business'
    }
    
    all_articles = []
    
    for category, url in rss_feeds.items():
        try:
            print(f"Fetching RSS feed: {category}")
            response = requests.get(url, timeout=10)
            
            if response.status_code != 200:
                print(f"Failed to fetch RSS feed: {response.status_code}")
                continue
                
            soup = BeautifulSoup(response.text, 'html.parser')
            items = soup.find_all('item')
            
            for item in items:
                title = item.find('title')
                link = item.find('link')
                description = item.find('description')
                pub_date = item.find('pubDate')
                
                if title and link:
                    all_articles.append({
                        'title': title.get_text().strip(),
                        'url': link.get_text().strip(),
                        'text': description.get_text().strip() if description else '',
                        'Published_Date': pub_date.get_text().strip() if pub_date else '',
                        'source': 'NDTV RSS'
                    })
            
            
            
        except Exception as e:
            print(f"Error processing RSS feed {category}: {e}")
    
    print(f"Found {len(all_articles)} articles from RSS feeds")
    return all_articles

# Get articles from RSS
rss_articles = get_ndtv_rss_feeds()
rss_df = pd.DataFrame(rss_articles)

Fetching RSS feed: top_stories
Fetching RSS feed: india
Fetching RSS feed: world
Fetching RSS feed: sports
Fetching RSS feed: entertainment
Fetching RSS feed: business
Found 60 articles from RSS feeds


In [30]:
rss_df.iloc[0]

title             Food Delivery Agent Crushed To Death By Speedi...
url                                                                
description       A 20-year-old food delivery agent was killed i...
Published_Date                                                     
category                                                top_stories
source                                                     NDTV RSS
Name: 0, dtype: object

In [33]:
xml_url = "https://ddnews.gov.in/all-news-archive/" 

res = requests.get(xml_url)

res
soup = BeautifulSoup(res.text , 'html.parser')
a=soup.find('div',class_= 'moreStoriesItem')
# url = a.a['href']
# a.img['alt']
# type(url)
# # res = requests.get(url)
# # res
# url
# soup.find_all('div')
a

<div class="moreStoriesItem">
<div class="postImageS">
<img alt="तटरक्षक बल के वैश्विक सम्मेलन में भारत ने रचनात्मक भूमिका को किया रेखांकित " class="img-fluid w-100" src="https://ddnews.gov.in/wp-content/uploads/2025/09/Coast-Guard-1200x678.png" title="तटरक्षक बल के वैश्विक सम्मेलन में भारत ने रचनात्मक भूमिका को किया रेखांकित ">
</img></div>
<div class="moreStoriesText">
<p class="mb-0 catDate mt-3">
								12/09/25 | टॉप स्टोरीज							</p>
<a class="text-decoration-none" href="https://ddnews.gov.in/india-underlines-its-constructive-role-in-the-global-conference-of-coast-guard/" title="तटरक्षक बल के वैश्विक सम्मेलन में भारत ने रचनात्मक भूमिका को किया रेखांकित ">
<h2 class="blogTitleS">तटरक्षक बल के वैश्विक सम्मेलन में भारत ने रचनात्मक भूमिका को किया रेखांकित </h2>
</a>
</div>
</div>

In [34]:
a.find('div',class_='moreStoriesText').text.strip().split('|')[0]

'12/09/25 '

In [35]:
soap = BeautifulSoup(res.text , 'html.parser')

In [36]:
def fetch_dd_news_ugly(max_stories = 10):
    dd_url = "https://ddnews.gov.in/all-news-archive/" 
    d=[]
    try:
        res = requests.get(dd_url)
        res.raise_for_status()

        soup = BeautifulSoup(res.text , 'html.parser')
        stories=soup.find_all('div',class_= 'moreStoriesItem')

        
        for idx,story in enumerate(stories):
            article_url  = story.a['href']
            title = story.img['alt']
            url_image  =story.img['src']
            publishedtime = story.find('div',class_='moreStoriesText').text.strip().split('|')[0]

            try:
                print(f'Fetching article {idx+1}')
                res_article = requests.get(article_url)
                res_article.raise_for_status()

                soap = BeautifulSoup(res_article.text , 'html.parser')
                paras = soap.find('div',class_ = 'entry-content').find_all('p')

                full_para = ""
                for para in paras:
                    full_para += para.text
            except requests.exceptions.RequestException as e:
                print(f'Could not fetch article {idx}')
            except Exception as e:
                print(e)
            
            d.append({
                'title':title,
                'text':full_para,
                'url':article_url,
                'Published_Date':publishedtime,
                'source':"DD  News"
                
                })
            
            if idx+1>=max_stories:
                 break
            
    except requests.exceptions.RequestException as e:
            print('Could not fetch the link',e)
    except Exception as e:
            print(e)

    print(f"Total Articles Fetched {len(d)}")

    return d

         


In [37]:
from urllib.parse import urljoin
import time

def fetch_dd_news_fixed(max_stories=10, delay=1):
    base_url = "https://ddnews.gov.in"
    dd_url = "https://ddnews.gov.in/all-news-archive/"
    articles = []

    try:
        res = requests.get(dd_url)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')
        stories = soup.find_all('div', class_='moreStoriesItem')

        for idx, story in enumerate(stories):
            if idx >= max_stories:
                break

            try:
                # Links and images (handle relative paths)
                article_url = urljoin(base_url, story.a['href']) if story.a else None
                title = story.img.get('alt', 'No title') if story.img else 'No title'
                url_image = urljoin(base_url, story.img['src']) if story.img and story.img.get('src') else ''

                publishedtime = (
                    story.find('div', class_='moreStoriesText').text.strip().split('|')[0]
                    if story.find('div', class_='moreStoriesText') else "Unknown date"
                )

                if not article_url:
                    continue

                # Fetch article page
                time.sleep(delay)  # be nice to server
                res_article = requests.get(article_url, timeout=15)
                res_article.raise_for_status()
                soap = BeautifulSoup(res_article.text, 'html.parser')

                # Extract content
                content_div = soap.find('div', class_='entry-content') or soap.find('div', class_='article-content')
                if content_div:
                    paras = content_div.find_all('p')
                    full_para = " ".join(p.get_text(strip=True) for p in paras)
                else:
                    full_para = soap.get_text(strip=True)

                articles.append({
                    'title': title,
                    'text': full_para,
                    'url': article_url,
                    'Published_Date': publishedtime,
                    'source': "DD News"
                })

                print(f"Fetched article {idx+1}: {title[:50]}...")

            except Exception as e:
                print(f"Error fetching article {idx+1}: {e}")
                continue

    except Exception as e:
        print(f"Error fetching archive: {e}")

    print(f"✅ Total Articles Fetched: {len(articles)}")
    return articles


In [103]:
x = fetch_dd_news_ugly(3)
dd_db = pd.DataFrame(x)
dd_db

Fetching article 0
Fetching article 1
Fetching article 2
Total Articles Fetched 3


Unnamed: 0,title,text,url,Published_Date,source
0,तटरक्षक बल के वैश्विक सम्मेलन में भारत ने रचना...,तटरक्षक बल के चौथे दो दिवसीय वैश्विक शिखर सम्म...,https://ddnews.gov.in/india-underlines-its-con...,12/09/25,DD News
1,पूर्वोत्तर प्रगति की प्रतीक्षा कर रहा सीमांत क...,प्रधानमंत्री कार्यालय (पीएमओ) की ओर से शुक्रवा...,https://ddnews.gov.in/pm-modi-says-northeast-h...,12/09/25,DD News
2,सी.पी. राधाकृष्णन को उपराष्ट्रपति बनने पर पीएम...,भारत के उपराष्ट्रपति बनने के बाद सीपी राधाकृष्...,https://ddnews.gov.in/pm-modi-congratulated-cp...,12/09/25,DD News


In [105]:
x = fetch_dd_news_fixed(3)
dd_db_a = pd.DataFrame(x)
dd_db_a

Fetched article 1: तटरक्षक बल के वैश्विक सम्मेलन में भारत ने रचनात्मक...
Fetched article 2: पूर्वोत्तर प्रगति की प्रतीक्षा कर रहा सीमांत क्षेत...
Fetched article 3: सी.पी. राधाकृष्णन को उपराष्ट्रपति बनने पर पीएम मोद...
✅ Total Articles Fetched: 3


Unnamed: 0,title,text,url,published_date,source
0,तटरक्षक बल के वैश्विक सम्मेलन में भारत ने रचना...,तटरक्षक बल के चौथे दो दिवसीय वैश्विक शिखर सम्म...,https://ddnews.gov.in/india-underlines-its-con...,12/09/25,DD News
1,पूर्वोत्तर प्रगति की प्रतीक्षा कर रहा सीमांत क...,प्रधानमंत्री कार्यालय (पीएमओ) की ओर से शुक्रवा...,https://ddnews.gov.in/pm-modi-says-northeast-h...,12/09/25,DD News
2,सी.पी. राधाकृष्णन को उपराष्ट्रपति बनने पर पीएम...,भारत के उपराष्ट्रपति बनने के बाद सीपी राधाकृष्...,https://ddnews.gov.in/pm-modi-congratulated-cp...,12/09/25,DD News


In [38]:
def striping(example):
    return ' '.join(example.split('\xa0'))

In [39]:
dd_db['text'][0]

NameError: name 'dd_db' is not defined

In [45]:
dd_db['title'] = dd_db['title'].apply(lambda x : striping(x) )
dd_db['text'] = dd_db['text'].apply(lambda x : striping(x) )

In [46]:
dd_db['title'][0]
dd_db['text'][0]

'तटरक्षक बल के चौथे दो दिवसीय वैश्विक शिखर सम्मेलन का शुक्रवार को रोम (इटली) में समापन हुआ है। इस शिखर सम्मेलन में 115 देशों और अंतरराष्ट्रीय संगठनों के प्रतिनिधियों ने हिस्सा लिया, जो तटरक्षक सहयोग के लिए प्रमुख वैश्विक मंच के रूप में इसकी महत्ता को दर्शाता है। भारतीय तटरक्षक बल के महानिदेशक परमेश शिवमणि ने दो सदस्यीय प्रतिनिधिमंडल के साथ हिस्सा लिया। उन्होंने अपने संबोधन में समुद्री सुरक्षा और संरक्षा में भारत की विशेषज्ञता और रचनात्मक भूमिका को रेखांकित किया। समुद्रीसुरक्षा और संरक्षा में भारत की विशेषज्ञता और रचनात्मक भूमिका को किया रेखांकित भारतीय तटरक्षक प्रतिनिधिमंडल ने सम्मेलन की कार्यवाही में सक्रिय रूप से योगदान दिया। आईसीजी के महानिदेशक ने ‘आग के खिलाफ संरक्षक: अग्नि आपात स्थितियों के लिए आईसीजी की सामरिक प्रतिक्रिया’ शीर्षक से एक विशेषज्ञ व्याख्यान दिया। उन्होंने समुद्री सुरक्षा और संरक्षा में भारत की विशेषज्ञता और रचनात्मक भूमिका को रेखांकित किया। भारतीय तटरक्षक बल ने 2027 में होने वाले 5वें तटरक्षक वैश्विक शिखर सम्मेलन की अध्यक्षता के लिए दावेदारी करने की इच्छा जताई है, ज

In [43]:
import json
# def fetch_google_facts(query,num_iter = 1, pages = 100):
# def fetch_dd_news(max_articles=20, delay=1):
# def fetch_news_org(query:str , page_size:int = 100,num_iter = 12,sort_by_index:int = 0):
# df_ret --> wiki dataset

class fetch_all:
    def __init__(self, num_pages = 100 , num_iter = 1):
        self.num_pages  =num_pages
        self.num_iter  = num_iter
        self.articles  = []

    def dd_news(self,max_articles):
        print("DD News fetching, Just Pray their Server Dont die")
        try:
            self.articles.extend(fetch_dd_news_fixed(max_articles))
        except Exception  as e:
            print("DD News failed | ", e)

    def google(self,query = 'india'):
        print("Google Fetching")
        self.articles.extend(fetch_google_facts(query , self.num_iter , self.num_pages))

    def news_org(self, query = 'india' ,sort_idx = 0):
        print("News API fetching")
        self.articles.extend(fetch_news_org(query=query , page_size=self.num_pages , num_iter=self.num_iter , sort_by_index=sort_idx))

    def wiki(self):
        print("Wiki fetching")
        self.articles.extend(df_ret.to_dict('records'))

    def to_pandas(self):
        return pd.DataFrame(self.articles)
    
    def ndtv(self):
        self.articles.extend(get_ndtv_rss_feeds())

    def to_json(self):
        return json.dumps(self.articles)
    
    



In [53]:
db = fetch_all(100,500)
db.news_org('india',0)
db.wiki()
db.google('indian')
db.ndtv()
df = db.to_pandas()
djson = db.to_json()
db.dd_news(10000)
df = db.to_pandas()
djson = db.to_json()

News API fetching
status | ok
Request error on page(iteration) 2: 426 Client Error: Upgrade Required for url: https://newsapi.org/v2/everything?apiKey=566b5a11ed79400e83276b9c4655c3f2&q=india&sortBy=relevancy&pageSize=100&page=2
Fetched 100 news articles
Wiki fetching
Google Fetching


KeyboardInterrupt: 

In [52]:
df.shape

(263327, 8)

In [65]:
df_ret['Published_Date']  = "Not Available for Wikipedia"

In [66]:
df_ret

Unnamed: 0,url,title,text,source,Published_Date
0,https://en.wikipedia.org/wiki/Anarchism,Anarchism,Anarchism is a political philosophy and moveme...,Wikipedia,Not Available for Wikipedia
1,https://en.wikipedia.org/wiki/Albedo,Albedo,Albedo (; ) is the fraction of sunlight that i...,Wikipedia,Not Available for Wikipedia
2,https://en.wikipedia.org/wiki/A,A,"A, or a, is the first letter and the first vow...",Wikipedia,Not Available for Wikipedia
3,https://en.wikipedia.org/wiki/Alabama,Alabama,Alabama () is a state in the Southeastern regi...,Wikipedia,Not Available for Wikipedia
4,https://en.wikipedia.org/wiki/Achilles,Achilles,"In Greek mythology, Achilles ( ) or Achilleus ...",Wikipedia,Not Available for Wikipedia
...,...,...,...,...,...
193751,https://en.wikipedia.org/wiki/Gimli_-LRB-Middl...,Gimli_-LRB-Middle-earth-RRB-,J. R. R. Tolkien created Gimli.,Wikipedia,Not Available for Wikipedia
193752,https://en.wikipedia.org/wiki/Gimli_-LRB-Middl...,Gimli_-LRB-Middle-earth-RRB-,J. R. R. Tolkien created Gimli.,Wikipedia,Not Available for Wikipedia
193753,https://en.wikipedia.org/wiki/Susan_Sarandon,Susan_Sarandon,Susan Sarandon is an award winner.,Wikipedia,Not Available for Wikipedia
193754,https://en.wikipedia.org/wiki/Susan_Sarandon,Susan_Sarandon,Susan Sarandon is an award winner.,Wikipedia,Not Available for Wikipedia
