In [None]:


def get_et_markets_news():
    """Fetch the Economic Times markets news list and print each headline.

    A GET request is sent with an ``Accept: application/json`` header. When
    the response is JSON, the ``headline`` of every entry under the ``news``
    key is printed. Failures are reported with ``print`` rather than raised:
    the error message is always printed, followed by the first 200 characters
    of the response body when a response was actually received.

    Returns:
        None. All results are written to stdout.
    """
    # Imported here because this cell carries no import of its own; relying on
    # a later cell's import only works when the kernel already ran that cell.
    import requests

    url = "https://economictimes.indiatimes.com/markets/newslist/2419376.cms"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "application/json",
        "Referer": "https://economictimes.indiatimes.com/"
    }

    # Bound before the try block so the error handler can safely check whether
    # a response exists; without this a failed request made the handler itself
    # crash with UnboundLocalError and hid the real error.
    response = None

    try:
        # The timeout keeps a stalled connection from hanging the kernel.
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()  # Raises exception for 4XX/5XX errors

        # Check if response is actually JSON
        if 'application/json' not in response.headers.get('Content-Type', ''):
            raise ValueError("Response is not JSON")

        data = response.json()

        for item in data.get('news', []):
            print(item.get('headline'))

    except Exception as e:
        print(f"Error: {e}")
        if response is not None:
            print(f"Response content: {response.text[:200]}...")  # Print first 200 chars

# Run the fetch; it prints each headline, or an error message on failure.
get_et_markets_news()

Error: Response is not JSON
Response content: <!DOCTYPE html><html xmlns:xhtml="http://www.w3.org/1999/xhtml" xmlns:valurl="com.times.utilities.CMSWebUtility" xmlns:nohtml="com.til.utils.CommonUtils" xmlns:listval="com.indiatimes.cms.utilities.CM...


In [28]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

def scrape_et_tata_news(base_url, max_pages=10):
    """Scrape story titles and dates from a paginated listing page.

    Page 1 is ``base_url`` itself; later pages append ``?page=N``. Each page
    is searched for ``div.eachStory`` blocks, and scraping stops at the first
    page that has none or once ``max_pages`` pages have been fetched.

    Args:
        base_url: URL of the first listing page.
        max_pages: Upper bound on the number of pages requested.

    Returns:
        A DataFrame with the columns ``title`` and ``date`` (one row per
        story). ``date`` is the ``datetime`` attribute of the story's
        ``<time>`` tag when present, otherwise the tag's stripped text, or a
        missing value when the story has no ``<time>`` tag. The columns are
        present even when nothing was scraped.
    """
    news = []

    for page in range(1, max_pages + 1):
        url = base_url + ('' if page == 1 else f"?page={page}")
        print(f"Scraping: {url}")
        # The timeout keeps one stalled page from hanging the whole run.
        r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=15)
        soup = BeautifulSoup(r.text, 'html.parser')

        items = soup.select('div.eachStory')
        if not items:
            print("No more news found.")
            break

        for item in items:
            title_elem = item.select_one('h3 a')
            if title_elem is None:
                continue

            # The article link is deliberately not read: it was never stored,
            # and indexing ['href'] aborted the scrape on anchors without one.
            title = title_elem.text.strip()
            date_elem = item.select_one('time')
            if date_elem is None:
                date = None
            elif date_elem.has_attr('datetime'):
                date = date_elem['datetime']
            else:
                date = date_elem.text.strip()

            news.append({'title': title, 'date': date})

        # Pause between pages to avoid hammering the site.
        time.sleep(1)

    # Explicit columns keep the schema stable when no stories were found.
    return pd.DataFrame(news, columns=['title', 'date'])

# Scrape at most 20 listing pages, save the rows to CSV, and report the count
df = scrape_et_tata_news(
    base_url="https://economictimes.indiatimes.com/tata-motors-ltd/stocksupdate/companyid-12934.cms",
    max_pages=20,
)
df.to_csv("et_tata_motors_news.csv", index=False)
print(f"✅ Total articles collected: {len(df)}")


Scraping: https://economictimes.indiatimes.com/tata-motors-ltd/stocksupdate/companyid-12934.cms
Scraping: https://economictimes.indiatimes.com/tata-motors-ltd/stocksupdate/companyid-12934.cms?page=2
Scraping: https://economictimes.indiatimes.com/tata-motors-ltd/stocksupdate/companyid-12934.cms?page=3
Scraping: https://economictimes.indiatimes.com/tata-motors-ltd/stocksupdate/companyid-12934.cms?page=4
Scraping: https://economictimes.indiatimes.com/tata-motors-ltd/stocksupdate/companyid-12934.cms?page=5
Scraping: https://economictimes.indiatimes.com/tata-motors-ltd/stocksupdate/companyid-12934.cms?page=6
Scraping: https://economictimes.indiatimes.com/tata-motors-ltd/stocksupdate/companyid-12934.cms?page=7
Scraping: https://economictimes.indiatimes.com/tata-motors-ltd/stocksupdate/companyid-12934.cms?page=8
Scraping: https://economictimes.indiatimes.com/tata-motors-ltd/stocksupdate/companyid-12934.cms?page=9
Scraping: https://economictimes.indiatimes.com/tata-motors-ltd/stocksupdate/comp

In [39]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

rss_url = "https://economictimes.indiatimes.com/rssfeeds/1977021501.cms"

# Bound the wait so a stalled connection cannot hang the kernel, and fail
# loudly on an HTTP error instead of silently writing an empty CSV.
r = requests.get(rss_url, timeout=15)
r.raise_for_status()
soup = BeautifulSoup(r.content, features="xml")

# find_all is the current spelling of the legacy findAll alias.
items = soup.find_all("item")

news_data = []
for item in items:
    # find() returns None for an absent child tag, so a malformed <item>
    # yields a missing value rather than an AttributeError that aborts the cell.
    title_tag = item.find("title")
    date_tag = item.find("pubDate")
    link_tag = item.find("link")
    news_data.append({
        "title": title_tag.text if title_tag is not None else None,
        "date": date_tag.text if date_tag is not None else None,
        "link": link_tag.text if link_tag is not None else None,
    })

# Explicit columns keep the CSV header stable even when the feed has no items.
df = pd.DataFrame(news_data, columns=["title", "date", "link"])
df.to_csv("et_tata_motors_rss.csv", index=False)
print(f"✅ Collected {len(df)} RSS news items.")


✅ Collected 50 RSS news items.


In [44]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def scrape_easyreader(
    pages=10,
    base_url="https://economictimes.indiatimes.com/tata-motors-ltd/stocksupdate/companyid-12934.cms",
):
    """Scrape story titles and dates from the ``?curpg=N`` listing pages.

    Pages 1 through ``pages`` are requested in order. A page without any
    ``div.eachStory`` block is reported and skipped; scraping continues with
    the next page.

    Args:
        pages: Number of listing pages to request.
        base_url: Listing URL that ``?curpg=N`` is appended to. Defaults to
            the URL this function originally hard-coded.

    Returns:
        A DataFrame with the columns ``title`` and ``date``. Stories without a
        title anchor, or with an empty title, are left out. ``date`` is the
        ``datetime`` attribute of the story's ``<time>`` tag when present,
        otherwise the tag's stripped text, or a missing value when the story
        has no ``<time>`` tag. The columns are present even when nothing was
        scraped, so callers can index ``df['date']`` safely.
    """
    all_news = []
    headers = {"User-Agent": "Mozilla/5.0"}

    for page in range(1, pages + 1):
        url = f"{base_url}?curpg={page}"
        print(f"Scraping page {page}...")

        r = requests.get(url, headers=headers, timeout=15)
        soup = BeautifulSoup(r.text, 'html.parser')
        items = soup.select('div.eachStory')

        if not items:
            print("No articles found on this page.")

        for art in items:
            title_elem = art.select_one('h3 a')
            date_elem = art.select_one('time')

            # The article link is deliberately not read: it was never stored,
            # and indexing ['href'] aborted the scrape on anchors without one.
            title = title_elem.text.strip() if title_elem is not None else None

            if date_elem is None:
                date_text = None
            elif date_elem.has_attr('datetime'):
                date_text = date_elem['datetime']
            else:
                date_text = date_elem.text.strip()

            if title:
                all_news.append({
                    "title": title,
                    "date": date_text
                })

        # Throttle after every request, including empty pages; skipping the
        # pause there sent back-to-back requests across runs of empty pages.
        time.sleep(1)

    # Explicit columns keep the schema stable when no stories were found.
    return pd.DataFrame(all_news, columns=["title", "date"])

# Scrape 100 pages, then parse dates, drop undated rows, and de-duplicate by title
df = scrape_easyreader(pages=100)
df = (
    df.assign(date=pd.to_datetime(df['date'], errors='coerce'))
    .dropna(subset=['date'])
    .drop_duplicates(subset=['title'])
)

df.to_csv("tata_motors_scraped_news.csv", index=False)
print(f"✅ Scraped {len(df)} unique news articles")


Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
Scraping page 41...
Scraping page 42...
Scraping page 43...
Scraping page 44...
Scraping page 45...
Scraping page 46...
Scraping page 47...
Scraping page 48...
Scraping page 49...
Scraping page 50...
Scraping 

  df['date'] = pd.to_datetime(df['date'], errors='coerce')
  df['date'] = pd.to_datetime(df['date'], errors='coerce')
  df['date'] = pd.to_datetime(df['date'], errors='coerce')
  df['date'] = pd.to_datetime(df['date'], errors='coerce')
  df['date'] = pd.to_datetime(df['date'], errors='coerce')
  df['date'] = pd.to_datetime(df['date'], errors='coerce')
  df['date'] = pd.to_datetime(df['date'], errors='coerce')
  df['date'] = pd.to_datetime(df['date'], errors='coerce')
  df['date'] = pd.to_datetime(df['date'], errors='coerce')
  df['date'] = pd.to_datetime(df['date'], errors='coerce')
  df['date'] = pd.to_datetime(df['date'], errors='coerce')
  df['date'] = pd.to_datetime(df['date'], errors='coerce')
  df['date'] = pd.to_datetime(df['date'], errors='coerce')
  df['date'] = pd.to_datetime(df['date'], errors='coerce')
  df['date'] = pd.to_datetime(df['date'], errors='coerce')
  df['date'] = pd.to_datetime(df['date'], errors='coerce')
  df['date'] = pd.to_datetime(df['date'], errors='coerce