Skip to content
This repository was archived by the owner on Dec 22, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions Scripts/Web_Scrappers/Economictimes_Scraper/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@

### Fetch all news articles from The Economic Times between a given start date and end date.

### How to use this script?

1. Make sure all the requirements for the script are present in your system by running:

`pip install -r requirements.txt`

2. Run the following command:

`python economictimes_scraper.py START_DATE END_DATE`
where date format is `YYYY-MM-DD`

3. Example Usage
`python economictimes_scraper.py 2020-05-15 2020-05-20`
The output will be saved to the file `ET_NewsData_START_DATE_END_DATE.json`


Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a screenshot of the script in action

### Author

[Jayesh Narwaria](https://github.com/jaynarw)
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
from bs4 import BeautifulSoup
import lxml
import requests
import json
import datetime
import sys

# Util
def datestr_to_date(datestr):
    """Parse a ``YYYY-MM-DD`` string into a :class:`datetime.date`."""
    year, month, day = (int(part) for part in datestr.split('-'))
    return datetime.date(year, month, day)

# Reference dates.
# ET archive pages are addressed by an integer "dateid": dateid 36892
# corresponds to 2001-01-01 and it increments by one per calendar day.
reference_date = datetime.date(2001, 1, 1)
reference_date_id = 36892

if len(sys.argv) < 3:
    print('economictimes_scraper.py START_DATE END_DATE\nDate fmt: YYYY-MM-DD')
    sys.exit(1)

start_date = datestr_to_date(sys.argv[1])
end_date = datestr_to_date(sys.argv[2])

# Validate the range BEFORE deriving dateids, and report the actual
# contract: start may equal 2001-01-01, and end may equal start.
if start_date < reference_date:
    print('Error: Start date should be on or after 2001-01-01')
    sys.exit(1)
if end_date < start_date:
    print('Error: End date should be on or after Start date')
    sys.exit(1)

start_dateid = reference_date_id + (start_date - reference_date).days
end_dateid = reference_date_id + (end_date - reference_date).days


# Gets News article metadata from article url
def fetchNewsArticle(url):
    """Return the article's JSON-LD metadata text, or None if unavailable.

    The article schema lives in the second ``<script type="application/ld+json">``
    tag under ``<body>``; 404 / missing-article pages carry fewer than two
    such tags, in which case None is returned.
    """
    # Bug fix: `import lxml` does not expose an HTML() parser at the top
    # level — `lxml.HTML(html)` raises AttributeError. The parser is
    # lxml.etree.HTML, imported locally here.
    from lxml import etree

    html = requests.get(url).content
    root = etree.HTML(html)
    scripts = root.xpath("/html/body//script[@type='application/ld+json']")
    metadata = None  # When Article does not exist (404)
    if len(scripts) >= 2:
        metadata = scripts[1].text
    return metadata

et_host = 'https://economictimes.indiatimes.com'
et_date_url = 'https://economictimes.indiatimes.com/archivelist/starttime-'
et_date_extension = '.cms'

# Maps ISO date string -> list of {"metadata", "title", "url"} dicts.
fetched_data = {}

# One archive page per day; dateid is the day's offset from the reference id.
for dateid in range(start_dateid, end_dateid + 1):
    date = str(reference_date + datetime.timedelta(days=dateid - reference_date_id))
    archive_url = '{}{}{}'.format(et_date_url, dateid, et_date_extension)
    html = requests.get(archive_url).content
    soup = BeautifulSoup(html, 'html.parser')
    fetched_data[date] = []
    for link in soup.select('#pageContent table li a'):
        print(link.text)
        article_url = et_host + link['href']  # hrefs are site-relative
        fetched_data[date].append({
            "metadata": fetchNewsArticle(article_url),
            "title": link.text,
            "url": article_url
        })

out_filename = 'ET_NewsData_{}_{}.json'.format(start_date, end_date)
# 'w' (not 'w+': no read-back needed) with explicit utf-8 so non-ASCII
# headline text round-trips; stream via json.dump instead of buffering
# the whole document through json.dumps.
with open(out_filename, 'w', encoding='utf-8') as output_file:
    json.dump(fetched_data, output_file, indent=2)
3 changes: 3 additions & 0 deletions Scripts/Web_Scrappers/Economictimes_Scraper/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
requests==2.23.0
beautifulsoup4==4.9.3
lxml==4.5.2