In [1]:
#importing libraries 
import feedparser
import pandas as pd
import requests
from bs4 import BeautifulSoup
from langdetect import detect, DetectorFactory
import sqlite3

In [2]:
#functions to get different fields by parsing html files using beautifulsoup4 

#extracting title by finding 'h1'
def get_title(soup):
    """Return the article headline taken from the page's first ``<h1>`` element.

    Parameters
    ----------
    soup : parsed HTML document (e.g. a BeautifulSoup object) exposing ``find``.

    Returns
    -------
    str
        The stripped heading text, or "Unknown" when the page has no ``<h1>``,
        the heading is blank, or the lookup fails.
    """
    try:
        heading = soup.find("h1")
        title = heading.text.strip() if heading is not None else ""
    except Exception:
        # Best-effort scraping: a malformed document must not abort the run.
        return "Unknown"
    # A blank heading carries no information, so it gets the same default
    # as a missing one.
    return title or "Unknown"

#extracting summary by getting first 3 'p' tags and joining them
def get_summary(soup):
    """Build a short summary from the first three ``<p>`` elements of the page.

    Parameters
    ----------
    soup : parsed HTML document (e.g. a BeautifulSoup object) exposing ``find_all``.

    Returns
    -------
    str
        The non-blank texts of the first three paragraphs joined by a single
        space, or "Unknown" when there is no usable paragraph text or the
        lookup fails.
    """
    try:
        paragraphs = soup.find_all("p")
        pieces = [p.text.strip() for p in paragraphs[:3]]
    except Exception:
        # Best-effort scraping: a malformed document must not abort the run.
        return "Unknown"
    # Blank paragraphs are dropped so that they neither add stray spaces nor
    # produce a whitespace-only "summary".
    summary = " ".join(piece for piece in pieces if piece)
    return summary or "Unknown"

#extracting published date by meta tag or time tag
def get_published(soup):
    """Return the publication timestamp advertised by the page.

    The following places are tried in order and the first non-blank value wins:

    1. ``<meta property="article:published_time" content="...">``
    2. ``<meta name="pubdate" content="...">``
    3. the ``datetime`` attribute of the first ``<time>`` element

    Parameters
    ----------
    soup : parsed HTML document (e.g. a BeautifulSoup object) exposing ``find``.

    Returns
    -------
    str
        The stripped raw value as found in the page (not parsed into a date),
        or "Unknown" when none of the lookups yields a value.
    """
    lookups = (
        ("meta", {"property": "article:published_time"}, "content"),
        ("meta", {"name": "pubdate"}, "content"),
        ("time", {}, "datetime"),
    )
    try:
        for tag_name, attrs, value_attr in lookups:
            tag = soup.find(tag_name, attrs)
            value = tag.get(value_attr) if tag is not None else None
            # A tag that exists but has no usable value must not stop the
            # search: fall through to the next candidate.
            if value and value.strip():
                return value.strip()
    except Exception:
        # Best-effort scraping: fall back to the default below.
        pass
    return "Unknown"

#extracting the source of the article by meta tag
def get_source(soup):
    """Return the publishing site's name from the ``og:site_name`` meta tag.

    Parameters
    ----------
    soup : parsed HTML document (e.g. a BeautifulSoup object) exposing ``find``.

    Returns
    -------
    str
        The stripped ``content`` of ``<meta property="og:site_name">``, or
        "Unknown" when the tag is missing, its content is blank, or the
        lookup fails.
    """
    try:
        meta = soup.find("meta", attrs={"property": "og:site_name"})
        content = meta.get("content") if meta is not None else None
        site_name = content.strip() if content else ""
    except Exception:
        # Best-effort scraping: a malformed document must not abort the run.
        return "Unknown"
    return site_name or "Unknown"


#extracting author of the article by meta tag or 'author-name' class in span tag
def get_author(soup):
    """Return the article's author.

    Tries ``<meta name="author" content="...">`` first and then the text of the
    first ``<span class="author-name">``; the first non-blank value wins.

    Parameters
    ----------
    soup : parsed HTML document (e.g. a BeautifulSoup object) exposing ``find``.

    Returns
    -------
    str
        The stripped author string, or "Unknown" when neither place holds a
        non-blank value or the lookup fails.
    """
    try:
        meta_author = soup.find("meta", attrs={"name": "author"})
        content = meta_author.get("content") if meta_author is not None else None
        if content and content.strip():
            return content.strip()

        # Blank meta content falls through to the byline span instead of
        # being returned as an empty author.
        author_tag = soup.find("span", class_="author-name")
        if author_tag is not None:
            byline = author_tag.text.strip()
            if byline:
                return byline
    except Exception:
        # Best-effort scraping: fall back to the default below.
        pass
    return "Unknown"


#note: every helper above returns the default value "Unknown" when it cannot find an appropriate value

In [3]:

#function for detecting language of the article using 'title'
#langdetect's detection is randomised; a fixed seed is set once here so that
#re-running the notebook gives the same language code for the same text
DetectorFactory.seed = 0  

def detect_language(text):
    """Return the language code detected for *text*.

    Parameters
    ----------
    text : the text to classify (in this notebook, an article title).

    Returns
    -------
    str
        Whatever ``detect`` returns for the text, or "unknown" when *text* is
        not a string, is blank, or detection raises an error.
    """
    # Guard explicitly instead of relying on an exception for None/NaN input.
    if not isinstance(text, str) or not text.strip():
        return "unknown"
    try:
        return detect(text)
    except Exception:
        # Detection fails on text with no usable features (e.g. only digits).
        # Only ordinary errors are swallowed, so Ctrl-C still interrupts a run.
        return "unknown"

In [4]:

#function for extracting title, summary, published_date, source, author, link of the article 


def _field_is_missing(value):
    """Return True when *value* is absent, blank, or the "Unknown" placeholder."""
    if value is None:
        return True
    text = str(value).strip()
    return not text or text == "Unknown"


def _fetch_article_soup(link, headers):
    """Download *link* and return the parsed page, or None if that fails."""
    try:
        response = requests.get(link, headers=headers, timeout=10)
        # Do not scrape "404 Not Found" / "Access denied" pages as articles.
        response.raise_for_status()
        # Hand the raw bytes to BeautifulSoup so it can detect the page's real
        # encoding; forcing UTF-8 garbles pages served in other encodings.
        return BeautifulSoup(response.content, "html.parser")
    except Exception:
        # Best-effort enrichment: the caller falls back to the feed data.
        return None


def rss_entry(entry, feed_title, country, headers):
    """Turn one parsed RSS entry into a news-item dictionary.

    Fields are read from the feed entry first. When any of title, summary,
    publication date, source or author is missing and the entry has a link,
    the linked article page is downloaded once and the missing fields are
    extracted from its HTML. Anything still missing becomes "Unknown".

    Parameters
    ----------
    entry : feed entry object; the attributes ``title``, ``summary``,
        ``published``, ``author``, ``dc_creator`` and ``link`` are read if present.
    feed_title : title of the feed, used as the news source.
    country : country label stored unchanged in the result.
    headers : HTTP headers passed to ``requests.get`` for the article page.

    Returns
    -------
    dict
        Keys: 'News Title', 'Publication Date', 'Source (News Agency)',
        'Country', 'Summary', 'Link to full Article', 'Author', 'Language'.
        'Language' is detected from the title (or, failing that, the summary)
        and is "unknown" when neither is available.
    """
    link = getattr(entry, 'link', None) or "Unknown"
    fields = {
        'title': getattr(entry, 'title', None),
        'summary': getattr(entry, 'summary', None),
        'published': getattr(entry, 'published', None),
        # The caller passes "Unknown" when the feed has no title; that is
        # treated as missing so the article page can still supply the source.
        'source': feed_title,
        'author': getattr(entry, 'author', None) or getattr(entry, 'dc_creator', None),
    }
    missing = [name for name, value in fields.items() if _field_is_missing(value)]

    # Only hit the network when something is actually missing.
    if missing and link != "Unknown":
        soup = _fetch_article_soup(link, headers)
        if soup is not None:
            extractors = {
                'title': get_title,
                'summary': get_summary,
                'published': get_published,
                'source': get_source,
                'author': get_author,
            }
            for name in missing:
                fields[name] = extractors[name](soup)

    resolved = {
        name: ("Unknown" if _field_is_missing(value) else value)
        for name, value in fields.items()
    }

    # Never run language detection on the "Unknown" placeholder: that would
    # report the language of the placeholder word, not of the article.
    language_sample = next(
        (resolved[name] for name in ('title', 'summary') if resolved[name] != "Unknown"),
        None,
    )

    return {
        'News Title': resolved['title'],
        'Publication Date': resolved['published'],
        'Source (News Agency)': resolved['source'],
        'Country': country,
        'Summary': resolved['summary'],
        'Link to full Article': link,
        'Author': resolved['author'],
        'Language': detect_language(language_sample) if language_sample else "unknown",
    }

#function for parsing rss feeds by looping through the dictionary and using 'feedparser' for parsing
#we call the rss_entry function for extracting the fields and creating a news_item to add in the list

def parse_rss_feeds(rss_feeds, headers=None):
    """Parse every RSS feed in *rss_feeds* and collect one news item per entry.

    Parameters
    ----------
    rss_feeds : dict mapping a country label to a list of feed URLs.
    headers : optional dict of HTTP headers. It is used both for downloading
        the feeds and, via ``rss_entry``, for downloading article pages.
        Defaults to ``{"User-Agent": "Mozilla/5.0"}``.

    Returns
    -------
    list of dict
        The dictionaries produced by ``rss_entry``, in feed order. Feeds or
        entries that fail are reported with ``print`` and skipped.
    """
    if headers is None:
        headers = {"User-Agent": "Mozilla/5.0"}

    all_news = []
    for country, urls in rss_feeds.items():
        for url in urls:
            try:
                # Send the same headers for the feed itself: some servers
                # reject requests that carry the default library user agent.
                parsed = feedparser.parse(url, request_headers=headers)
            except Exception as e:
                print(f"Error parsing feed {url}: {e}")
                continue

            entries = getattr(parsed, 'entries', None) or []
            # feedparser reports many failures through the ``bozo`` flag
            # instead of raising; surface them rather than silently
            # returning nothing for the feed.
            if not entries and getattr(parsed, 'bozo', False):
                reason = getattr(parsed, 'bozo_exception', 'no entries found')
                print(f"Error parsing feed {url}: {reason}")
                continue

            feed_title = getattr(getattr(parsed, 'feed', None), 'title', None) or 'Unknown'
            for entry in entries:
                try:
                    all_news.append(rss_entry(entry, feed_title, country, headers))
                except Exception as e:
                    # One bad entry must not discard the rest of the feed.
                    print(f"Error processing entry from {url}: {e}")
    return all_news


In [5]:

#function for handling the historical data
#using bs4 for parsing links of some articles that were published more than a year ago

def scrape_historical_articles(historical_urls, headers=None):
    """Scrape individual (older) article pages and build one news item per URL.

    Parameters
    ----------
    historical_urls : dict mapping a country label to a list of article URLs.
    headers : optional dict of HTTP headers for ``requests.get``.
        Defaults to ``{"User-Agent": "Mozilla/5.0"}``.

    Returns
    -------
    list of dict
        One dictionary per successfully downloaded article with the keys
        'News Title', 'Publication Date', 'Source (News Agency)', 'Country',
        'Summary', 'Link to full Article', 'Author' and 'Language'.
        URLs that cannot be downloaded or parsed are reported with ``print``
        and skipped.
    """
    if headers is None:
        headers = {"User-Agent": "Mozilla/5.0"}
    all_news = []

    for country, urls in historical_urls.items():
        for url in urls:
            try:
                response = requests.get(url, headers=headers, timeout=10)
                # Do not record "404 Not Found" / "Access denied" pages as articles.
                response.raise_for_status()
                # Raw bytes let BeautifulSoup detect the page's own encoding.
                soup = BeautifulSoup(response.content, "html.parser")

                title = get_title(soup)
                summary = get_summary(soup)

                # Detect the language from real text only. Running detection
                # on the "Unknown" placeholder reports the language of that
                # word, not of the article.
                language_sample = next(
                    (text for text in (title, summary) if text and text != "Unknown"),
                    None,
                )

                all_news.append({
                    'News Title': title,
                    'Publication Date': get_published(soup),
                    'Source (News Agency)': get_source(soup),
                    'Country': country,
                    'Summary': summary,
                    'Link to full Article': url,
                    'Author': get_author(soup),
                    'Language': detect_language(language_sample) if language_sample else "unknown",
                })
            except Exception as e:
                print(f"Historical scrape error for {url}: {e}")

    return all_news



In [6]:

#dictionary of RSS feeds:
#country as key and a list of feed URLs for different news sites as its value
#the country is hard-coded here because it could not be extracted from the RSS feeds (feedparser) or from the article HTML (bs4)
#this covers 20+ countries and 30+ news sites
# Country label -> list of RSS feed URLs. The label is attached to every
# article taken from the feeds listed under it; feeds are processed in the
# order written here.
# NOTE(review): judging by their host names, a few feeds (e.g. "Hong Kong",
# "South Africa", "Denmark", "Greece") may not be news outlets of the
# labelled country -- verify before relying on the country column.
rss_feeds = {
    "UK": ["http://feeds.bbci.co.uk/news/rss.xml"],
    "USA": [
        "http://rss.cnn.com/rss/edition.rss",
        "https://www.vox.com/rss/index.xml",
        "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml",
    ],
    "Japan": ["https://www3.nhk.or.jp/rss/news/cat0.xml"],
    "India": ["https://timesofindia.indiatimes.com/rssfeeds/-2128936835.cms"],
    "Germany": [
        "https://rss.dw.com/rdf/rss-en-all",
        "https://www.spiegel.de/schlagzeilen/index.rss",
    ],
    "France": ["https://www.lemonde.fr/rss/une.xml"],
    "Russia": [
        "https://tass.com/rss/v2.xml",
        "https://vc.ru/rss/all",
    ],
    "Australia": ["https://www.abc.net.au/news/feed/51120/rss.xml"],
    "Canada": ["https://www.cbc.ca/cmlink/rss-world"],
    "China": ["http://www.xinhuanet.com/english/rss/worldrss.xml"],
    "Brazil": ["https://g1.globo.com/rss/g1/"],
    # A region rather than a single country.
    "Middle East": ["https://www.aljazeera.com/xml/rss/all.xml"],
    "Netherlands": ["https://www.bellingcat.com/feed/"],
    "Turkey": ["https://www.ahaber.com.tr/rss/dunya.xml"],
    "Vietnam": ["https://vnexpress.net/rss/the-thao.rss"],
    "South Korea": ["http://www.koreatimes.co.kr/www/rss/biz.xml"],
    "Thailand": ["https://www.bangkokpost.com/rss/data/topstories.xml"],
    "Sri Lanka": ["http://www.gossiplankahotnews.com/feeds/posts/default/-/Hotnews"],
    "Hong Kong": ["https://hongkongclimbing.com/feed/"],
    "South Africa": ["https://www.theguardian.com/media/dailymail/rss"],
    "Pakistan": ["http://www.dawn.com/feeds/home"],
    "Ukraine": ["http://euromaidanpress.com/feed/"],
    "Qatar": ["https://dohanews.co/feed/"],
    "Hungary": ["https://dailynewshungary.com/feed/"],
    "Finland": ["http://dailyfinland.fi/feed/latest-rss.xml"],
    "Denmark": ["http://www.esa.int/rssfeed/Denmark"],
    "Portugal": ["https://balcao.portugal2020.pt/nb.balcao2020.ui/concurso/rss/"],
    "Zimbabwe": ["http://www.thezimbabwean.co/feed/"],
    "Iraq": ["http://www.iraq-businessnews.com/feed/"],
    "Greece": ["https://gr.gizchina.com/feed/"],
    "Italy": ["https://www.repubblica.it/rss/homepage/rss2.0.xml"],
}


#dictionary of URLs of articles that were published more than a year ago
#due to time constraints only 5 articles are gathered here
#it uses the same format: country as key and a list of article links as its value

# Country label -> list of individual article URLs to scrape directly.
# The label is stored as the article's country as-is.
# NOTE(review): the "UK" list contains an edition.cnn.com link, so the label
# does not always match the publisher -- confirm this is intended.
historical_urls = {
    "UK": [
        "https://www.bbc.com/news/articles/cp00jze920eo",
        "https://edition.cnn.com/2020/05/14/uk/united-kingdom-divided-approach-coronavirus-intl-gbr",
    ],
    "USA": ["https://edition.cnn.com/2023/01/31/tennis/atp-alexander-zverev-domestic-abuse-spt-intl/index.html"],
    "India": ["https://timesofindia.indiatimes.com/india/2022-when-the-world-shifted-on-its-axis/articleshow/96416470.cms"],
    "Middle East": ["https://network.aljazeera.net/en/pressroom/former-cia-chief-%E2%80%9Cyes%E2%80%9D-donald-trump-%E2%80%9Crecruiting-sergeant%E2%80%9D-isil-0"],
}


In [7]:

#calling both the functions and creating a list with both the outputs
all_news = parse_rss_feeds(rss_feeds) + scrape_historical_articles(historical_urls)

#column names shared by every news item
#passing them explicitly keeps the frame well-formed (and the de-duplication
#below valid) even when nothing could be scraped
news_columns = [
    'News Title',
    'Publication Date',
    'Source (News Agency)',
    'Country',
    'Summary',
    'Link to full Article',
    'Author',
    'Language',
]

#creating a dataframe using pandas
#built in one chain without in-place mutation, so re-running the cell is safe
#repeated rows are dropped based on the title and link of the article
df = (
    pd.DataFrame(all_news, columns=news_columns)
    .fillna('Unknown')
    .drop_duplicates(subset=['News Title', 'Link to full Article'])
)

#creating a csv file
df.to_csv('all_news.csv', index=False, encoding='utf-8')


Error parsing feed https://www.cbc.ca/cmlink/rss-world: Remote end closed connection without response


In [8]:
# Connect to SQLite database 
# Opens the database file 'news_data.db' (path is relative to the working directory).
# `conn` and `cursor` are reused by the following cells, which create the
# table, insert the rows, and finally commit and close the connection.
conn = sqlite3.connect('news_data.db')  
cursor = conn.cursor()



In [9]:
# Create table if it doesn't exist
cursor.execute('''
    CREATE TABLE IF NOT EXISTS news (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        title TEXT,
        publication_date TEXT,
        source TEXT,
        country TEXT,
        summary TEXT,
        link TEXT ,
        author TEXT,
        language TEXT
    )
''')

# INSERT OR IGNORE only skips rows that violate a constraint. Without a
# uniqueness rule every re-run of the notebook would insert all articles
# again, so uniqueness is enforced on the same (title, link) pair that is used
# for de-duplicating the dataframe.
try:
    cursor.execute('''
        CREATE UNIQUE INDEX IF NOT EXISTS idx_news_title_link
        ON news (title, link)
    ''')
except sqlite3.Error as e:
    # Happens when an existing database already holds duplicate rows; the
    # inserts below still work, they just cannot skip duplicates.
    print(f"Could not create unique index on (title, link): {e}")

insert_sql = '''
    INSERT OR IGNORE INTO news 
    (title, publication_date, source, country, summary, link, author, language)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
'''

# Dataframe columns in the order of the placeholders above
insert_columns = [
    'News Title',
    'Publication Date',
    'Source (News Agency)',
    'Country',
    'Summary',
    'Link to full Article',
    'Author',
    'Language',
]

# Inserting data into the table
# Rows are inserted one at a time so that a single bad row is reported and
# skipped instead of aborting the whole load.
for values in df[insert_columns].itertuples(index=False, name=None):
    try:
        cursor.execute(insert_sql, values)
    except sqlite3.Error as e:
        print(f"Insert failed: {e}")



In [10]:
# Persist the inserted rows, and release the database file even if the commit fails
try:
    conn.commit()
finally:
    conn.close()


In [11]:

#checking after insertion , if the records were added successfully or not
#a fresh connection is opened because the previous one was closed after the commit
conn = sqlite3.connect('news_data.db')
try:
    df2 = pd.read_sql_query("SELECT * FROM news", conn)
finally:
    #close the connection even when the query fails
    conn.close()

df2


Unnamed: 0,id,title,publication_date,source,country,summary,link,author,language
0,1,UK prosecutors say 21 charges authorised again...,"Wed, 28 May 2025 16:30:06 GMT",BBC News,UK,Prosecutors in the UK confirm they have author...,https://www.bbc.com/news/articles/ckg41g1140po,Unknown,en
1,2,The terrifying new weapon changing the war in ...,"Wed, 28 May 2025 16:00:06 GMT",BBC News,UK,Swarms of fibre optic drones give Russia the e...,https://www.bbc.com/news/articles/ckgn47e5qyno,Unknown,en
2,3,"Heathrow chief asleep as airport closed, inqui...","Wed, 28 May 2025 16:10:06 GMT",BBC News,UK,Findings from a review commissioned by Heathro...,https://www.bbc.com/news/articles/c62n0y3nepzo,Unknown,en
3,4,French paedophile surgeon who abused hundreds ...,"Wed, 28 May 2025 12:45:32 GMT",BBC News,UK,"Joel Le Scouarnec, who has admitted his guilt,...",https://www.bbc.com/news/articles/cvgdkyge198o,Unknown,en
4,5,BBC Verify examines footage of chaos at aid si...,"Wed, 28 May 2025 16:04:34 GMT",BBC News,UK,The UN Human Rights Office has said it believe...,https://www.bbc.com/news/videos/cvgdkgmn3yxo,Unknown,en
...,...,...,...,...,...,...,...,...,...
965,967,India’s opposition looked down and out – now t...,2024-06-05T16:18:18.822Z,Unknown,UK,The results of India’s general election announ...,https://www.bbc.com/news/articles/cp00jze920eo,Unknown,en
966,968,The United Kingdom’s four countries take a div...,2020-05-14T06:02:05Z,CNN,UK,The UK’s coronavirus crisis has reignited one ...,https://edition.cnn.com/2020/05/14/uk/united-k...,Luke McGee,en
967,969,ATP says no disciplinary action to be taken ag...,2023-01-31T13:23:23Z,CNN,USA,The ATP Tour announced Tuesday that there will...,https://edition.cnn.com/2023/01/31/tennis/atp-...,Jill Martin,en
968,970,Unknown,Unknown,The Times of India,India,"As conflicts become the new normal, countries ...",https://timesofindia.indiatimes.com/india/2022...,Unknown,en
