# Scrape Times Of India website using RSS Feed


## Disclaimer & Warning: 
### ⚠️ <span style="color:red"> Warning: Scrape Responsibly </span> ⚠️

You must **NOT** scrape the website at a very high frequency. Excessive requests may  
lead to your IP being blocked or other restrictions imposed by the website.  

Please scrape **responsibly and mindfully**. You are **solely responsible** for any  
consequences resulting from your scraping activities.

> **Disclaimer:** This notebook is provided for educational purposes only.  
> The author is not responsible for any misuse or violation of the website’s  
> terms of service.


Times Of India - [RSS Feed](https://timesofindia.indiatimes.com/rss.cms) 

# Part-1: Scrape Articles 

## 1.1 Select Your Feed Name: List of Feeds provided by Times of India

In [1]:
# Display name -> RSS feed URL for every Times of India feed this notebook can scrape.
# NOTE: most entries use http://, while "US", "Astrology" and "Auto" use https://;
# the "US" feed also sits under a different path (rssfeeds_us).
rss_feed_links = {
    "Top Stories": "http://timesofindia.indiatimes.com/rssfeedstopstories.cms",
    "Most Recent Stories": "http://timesofindia.indiatimes.com/rssfeedmostrecent.cms",
    "India": "http://timesofindia.indiatimes.com/rssfeeds/-2128936835.cms",
    "World": "http://timesofindia.indiatimes.com/rssfeeds/296589292.cms",
    "NRI": "http://timesofindia.indiatimes.com/rssfeeds/7098551.cms",
    "Business": "http://timesofindia.indiatimes.com/rssfeeds/1898055.cms",
    "US": "https://timesofindia.indiatimes.com/rssfeeds_us/72258322.cms",
    "Cricket": "http://timesofindia.indiatimes.com/rssfeeds/54829575.cms",
    "Sports": "http://timesofindia.indiatimes.com/rssfeeds/4719148.cms",
    "Science": "http://timesofindia.indiatimes.com/rssfeeds/-2128672765.cms",
    "Environment": "http://timesofindia.indiatimes.com/rssfeeds/2647163.cms",
    "Tech": "http://timesofindia.indiatimes.com/rssfeeds/66949542.cms",
    "Education": "http://timesofindia.indiatimes.com/rssfeeds/913168846.cms",
    "Entertainment": "http://timesofindia.indiatimes.com/rssfeeds/1081479906.cms",
    "Life & Style": "http://timesofindia.indiatimes.com/rssfeeds/2886704.cms",
    "Most Read": "http://timesofindia.indiatimes.com/rssfeedmostread.cms",
    "Most Shared": "http://timesofindia.indiatimes.com/rssfeedmostshared.cms",
    "Most Commented": "http://timesofindia.indiatimes.com/rssfeedmostcommented.cms",
    "Astrology": "https://timesofindia.indiatimes.com/rssfeeds/65857041.cms",
    "Auto": "https://timesofindia.indiatimes.com/rssfeeds/74317216.cms"
}

# Key of rss_feed_links selecting the feed to scrape. The same value is also
# embedded in the names of the output files and folders written further down.
feed_name = "World"


## 1.2 Get Article Links and Article Text

In [2]:
import requests
import xml.etree.ElementTree as ET
import re
import time
import datetime
import os
import json

In [3]:


# Generated from GPT
def getArticleLinks(rss_url="https://timesofindia.indiatimes.com/rssfeeds/-2128936835.cms", timeout=10):
    """
    Fetch all article links from a Times of India RSS feed.

    Args:
        rss_url (str): RSS feed URL. Defaults to a sample TOI RSS feed.
        timeout (float): Seconds to wait for the server before giving up,
            so a stalled connection cannot hang the notebook forever.

    Returns:
        list: Article URLs (str) in feed order. Items without a usable
        <link> are skipped. An empty list is returned when the request or
        the XML parsing fails; the error is printed rather than raised.
    """
    try:
        resp = requests.get(rss_url, timeout=timeout)
        resp.raise_for_status()  # Raise error if request fails
        root = ET.fromstring(resp.content)
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return []
    except ET.ParseError as e:
        print(f"Failed to parse XML: {e}")
        return []

    links = []
    for item in root.findall("./channel/item"):
        # findtext() yields None for a missing <link> and "" for an empty
        # one; both are dropped so callers only ever receive real URLs.
        url = (item.findtext("link") or "").strip()
        if url:
            links.append(url)
    return links



def fetchArticleText(url, timeout=10):
    """
    Fetch a single Times of India article page and return its cleaned text.

    The text is read from the first `"articleBody": "..."` field found in
    the page source. Non-ASCII characters are removed and escaped double
    quotes (\\") are turned back into plain double quotes; any other
    backslash escapes are left exactly as they appear in the page.

    Args:
        url (str): Article URL.
        timeout (float): Seconds to wait for the server before giving up,
            so a stalled connection cannot hang the notebook forever.

    Returns:
        str: The cleaned article text.
        "ERROR_FAILED_TO_SCRAPE_ARTICLE" when the page contains no
        articleBody field. An empty string when the HTTP request itself
        fails (the error is printed rather than raised).
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to fetch article {url}: {e}")
        return ""

    # Capture the string value of "articleBody", allowing backslash-escaped
    # characters (including \") inside it.
    pattern = r'"articleBody"\s*:\s*"((?:[^"\\]|\\.)*?)"'
    match = re.search(pattern, r.text)
    if match is None:
        return "ERROR_FAILED_TO_SCRAPE_ARTICLE"

    # Remove non-ASCII characters
    clean_article = re.sub(r'[^\x00-\x7F]+', '', match.group(1))
    # Turn escaped double quotes \" back into "
    return clean_article.replace(r'\"', '"')



In [4]:
# Download the RSS feed selected by feed_name and collect its article URLs.
# To scrape another topic, set feed_name to a different key of
# rss_feed_links, e.g. "Business", "Entertainment" or "NRI".
all_links = getArticleLinks(rss_url=rss_feed_links.get(feed_name))
print(f"Found number of Links: {len(all_links)} links in feed topic = {feed_name}")

Found number of Links: 20 links in feed topic = World


In [5]:
# Scrape every link of the feed. For a quick trial run, use a slice
# instead, e.g. all_links[0:4].
links = all_links

# Line written after each article so the texts can be told apart in the file.
article_separator = "--" * 60

article_chunks = []
for link in links:
    print(f"Reading Link: {link}")
    article = fetchArticleText(link)
    time.sleep(2)  # polite delay between requests
    article_chunks.append(article + "\n" + article_separator + "\n")

# Join once at the end instead of growing a string inside the loop.
articles_text = "".join(article_chunks)

article_dir = "Output/toi-articles/"
article_file_path = article_dir + "toi-article-" + feed_name + "-" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + ".txt"

# Create the output folder first; open() does not create missing folders
# and would otherwise fail with FileNotFoundError on a fresh checkout.
os.makedirs(article_dir, exist_ok=True)

with open(article_file_path, 'a', encoding='utf-8') as f:
    f.write(articles_text)

Reading Link: https://timesofindia.indiatimes.com/world/middle-east/qatar-introduces-shorter-working-hours-for-qatari-mothers-in-government-jobs/articleshow/124117859.cms
Reading Link: https://timesofindia.indiatimes.com/world/middle-east/dubai-unveils-bold-plan-to-cut-school-fees-and-make-education-more-affordable-for-families/articleshow/124114716.cms
Reading Link: https://timesofindia.indiatimes.com/world/us/charlie-kirk-assassination-suspects-transgender-boyfriend-reportedly-kicked-out-of-home-experimented-with-drugs-and-alcohol-in-youth/articleshow/124115586.cms
Reading Link: https://timesofindia.indiatimes.com/world/europe/ex-french-president-sarkozy-convicted-5-year-term-in-libya-campaign-case-aides-also-jailed/articleshow/124116438.cms
Reading Link: https://timesofindia.indiatimes.com/world/china/taiwan-sentences-4-ex-ruling-party-members-on-china-spying/articleshow/124112192.cms
Reading Link: https://timesofindia.indiatimes.com/world/uk/is-london-the-sharia-capital-decoding-do

# Part-2: Scrape Comments 

## 2.1 Get the list of CMS IDs


In [7]:
# Pull the numeric article ID out of every ".../articleshow/<id>.cms" link.
# Links that do not follow this pattern are skipped: calling .group(1) on a
# failed match would raise AttributeError and abort the whole cell.
cms_id_pattern = re.compile(r'/articleshow/(\d+)\.cms')
cms_id_matches = (cms_id_pattern.search(u) for u in all_links if u)
article_cms_ids = [m.group(1) for m in cms_id_matches if m]

skipped_links = len(all_links) - len(article_cms_ids)
if skipped_links:
    print(f"Skipped {skipped_links} links without an articleshow CMS ID")

print(f"Found number of Links: {len(article_cms_ids)} links in feed topic = {feed_name}")

Found number of Links: 20 links in feed topic = World


In [8]:
# Comments endpoint; the {cms_id} placeholder is filled in with each
# article's numeric ID.
template = (
    "https://timesofindia.indiatimes.com/commentsdata.cms"
    "?msid={cms_id}&curpg=1&commenttype=agree&pcode=TOI&appkey=TOI"
    "&sortcriteria=AgreeCount&order=desc&size=10&after=true"
    "&withReward=true&medium=WEB&comment_block_count=3&pagenum=1"
)

# One record per article: its CMS ID together with the comments URL built
# from it.
comment_urls = [
    dict(cms_id=article_id, cms_link=template.format(cms_id=article_id))
    for article_id in article_cms_ids
]

print(f"Number of Comments Links: {len(comment_urls)} links in feed topic = {feed_name}\n")

# Show the first three records as a quick sanity check.
for link in comment_urls[:3]:
    print(link)

Number of Comments Links: 20 links in feed topic = World

{'cms_id': '124117859', 'cms_link': 'https://timesofindia.indiatimes.com/commentsdata.cms?msid=124117859&curpg=1&commenttype=agree&pcode=TOI&appkey=TOI&sortcriteria=AgreeCount&order=desc&size=10&after=true&withReward=true&medium=WEB&comment_block_count=3&pagenum=1'}
{'cms_id': '124114716', 'cms_link': 'https://timesofindia.indiatimes.com/commentsdata.cms?msid=124114716&curpg=1&commenttype=agree&pcode=TOI&appkey=TOI&sortcriteria=AgreeCount&order=desc&size=10&after=true&withReward=true&medium=WEB&comment_block_count=3&pagenum=1'}
{'cms_id': '124115586', 'cms_link': 'https://timesofindia.indiatimes.com/commentsdata.cms?msid=124115586&curpg=1&commenttype=agree&pcode=TOI&appkey=TOI&sortcriteria=AgreeCount&order=desc&size=10&after=true&withReward=true&medium=WEB&comment_block_count=3&pagenum=1'}


## 2.2 Scrape comments and write them to JSON files


In [9]:

comments_file_path = "Output/toi-comments/" + "toi-comments-" + feed_name + "/"

# Ensure the output folder exists
os.makedirs(comments_file_path, exist_ok=True)

# Fetch the comments JSON of each article and, when it actually holds
# comments, save it to its own timestamped file.
for entry in comment_urls:
    try:
        r = requests.get(entry["cms_link"], timeout=10)
        r.raise_for_status()  # raise error if status != 200
        data = r.json()

        # A payload counts as "has comments" when it is a dict with a
        # non-empty 'items' value or a non-empty list; anything else is
        # treated as having no comments.
        if isinstance(data, dict):
            has_comments = bool(data.get('items'))
        elif isinstance(data, list):
            has_comments = len(data) > 0
        else:
            has_comments = False

        # Write to file only if comments exist
        if has_comments:
            file_name = f"cms-{entry['cms_id']}-{datetime.datetime.now().strftime('%Y%m%d-%H%M%S')}.json"
            file_path = os.path.join(comments_file_path, file_name)

            with open(file_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=4)

            print(f"Written comments for CMS ID {entry['cms_id']} to {file_path}")
        else:
            # Report the ID itself: the message is labelled "CMS ID", so
            # printing the full URL here was misleading.
            print(f"No comments for CMS ID {entry['cms_id']}")

    except Exception as e:
        # Deliberately broad: one bad article (network error, invalid JSON,
        # failed write) must not stop the remaining articles.
        print(f"Failed to fetch CMS ID {entry['cms_id']}: {e}")
        entry["comments_json"] = None

    finally:
        # Polite delay between requests, kept even after a failed request
        # so errors never speed up the request rate.
        time.sleep(2)

No comments for CMS ID 124117859
No comments for CMS ID 124114716
No comments for CMS ID 124115586
Written comments for CMS ID 124116438 to Output/toi-comments/toi-comments-World/cms-124116438-20250925-175841.json
No comments for CMS ID 124112192
Written comments for CMS ID 124115375 to Output/toi-comments/toi-comments-World/cms-124115375-20250925-175846.json
No comments for CMS ID 124114733
No comments for CMS ID 124112026
No comments for CMS ID 124113127
No comments for CMS ID 124110368
No comments for CMS ID 124110302
No comments for CMS ID 124111862
No comments for CMS ID 124110312
No comments for CMS ID 124108814
No comments for CMS ID 124108879
No comments for CMS ID 124108449
No comments for CMS ID 124108001
No comments for CMS ID 124107493
No comments for CMS ID 124106097
No comments for CMS ID 124106574
