diff --git a/Scripts/Web_Scrappers/Economictimes_Scraper/README.md b/Scripts/Web_Scrappers/Economictimes_Scraper/README.md
new file mode 100644
index 000000000..18e860389
--- /dev/null
+++ b/Scripts/Web_Scrappers/Economictimes_Scraper/README.md
@@ -0,0 +1,22 @@
+
+### Fetch all News Articles from Economictimes starting from a given input start date and end date.
+
+### How to use this script?
+
+1. Make sure all the requirements for the script are present in your system by running:
+
+   `pip install -r requirements.txt`
+
+2. Run the following command:
+
+   `python economictimes_scraper.py START_DATE END_DATE`
+   where date format is `YYYY-MM-DD`
+
+3. Example Usage
+`python economictimes_scraper.py 2020-05-15 2020-05-20`
+Output will be saved in file `ET_NewsData_STARTDATE_ENDDATE.json`
+
+
+### Author
+
+[Jayesh Narwaria](https://github.com/jaynarw)
\ No newline at end of file
diff --git a/Scripts/Web_Scrappers/Economictimes_Scraper/economictimes_scraper.py b/Scripts/Web_Scrappers/Economictimes_Scraper/economictimes_scraper.py
new file mode 100644
index 000000000..33a9eaa55
--- /dev/null
+++ b/Scripts/Web_Scrappers/Economictimes_Scraper/economictimes_scraper.py
@@ -0,0 +1,72 @@
+from bs4 import BeautifulSoup
+from lxml import etree
+import requests
+import json
+import datetime
+import sys
+
+# Util: parse a 'YYYY-MM-DD' string into a datetime.date
+def datestr_to_date(datestr):
+    [year, month, day] = datestr.split('-')
+    return datetime.date(
+        year=int(year),
+        month=int(month),
+        day=int(day)
+    )
+
+# Reference dates: ET archive-list pages are numbered by days elapsed
+# since 2001-01-01, whose page id is 36892.
+reference_date = datetime.date(2001, 1, 1) # 2001 Jan 1
+reference_date_id = 36892
+
+if len(sys.argv) < 3:
+    print('economictimes_scraper.py START_DATE END_DATE\nDate fmt: YYYY-MM-DD')
+    sys.exit(1)
+
+start_date = datestr_to_date(sys.argv[1])
+end_date = datestr_to_date(sys.argv[2])
+start_dateid = reference_date_id + (start_date - reference_date).days
+end_dateid = reference_date_id + (end_date - reference_date).days
+
+if (start_date - reference_date).days < 0:
+    print('Error: Start date should be >= 2001-01-01')
+    sys.exit(1)
+if (end_date - start_date).days < 0:
+    print('Error: End date should be >= Start date')
+    sys.exit(1)
+
+
+# Gets News article metadata (JSON-LD script payload) from article url
+def fetchNewsArticle(url):
+    html = requests.get(url).content
+    # BUGFIX: the top-level 'lxml' package has no HTML(); use lxml.etree.HTML()
+    root = etree.HTML(html)
+    x = root.xpath("/html/body//script[@type='application/ld+json']")
+    metadata = None ## When Article does not exists (404)
+    if (len(x) >= 2):
+        metadata = x[1].text
+    return metadata
+
+et_host = 'https://economictimes.indiatimes.com'
+et_date_url = 'https://economictimes.indiatimes.com/archivelist/starttime-'
+et_date_extension = '.cms'
+
+fetched_data = {}
+
+for dateid in range(start_dateid, end_dateid + 1):
+    date = str(reference_date + datetime.timedelta(days = dateid-reference_date_id))
+    html = requests.get('{}{}{}'.format(et_date_url, dateid, et_date_extension)).content
+    soup = BeautifulSoup(html, 'html.parser')
+    fetched_data[date] = []
+    for x in soup.select('#pageContent table li a'):
+        print(x.text)
+        article_metadata = fetchNewsArticle(et_host + x['href'])
+        fetched_data[date].append({
+            "metadata": article_metadata,
+            "title": x.text,
+            "url": et_host + x['href']
+        })
+
+out_filename = 'ET_NewsData_{}_{}.json'.format(start_date, end_date)
+with open(out_filename, 'w+') as output_file:
+    output_file.write(json.dumps(fetched_data, indent=2))
diff --git a/Scripts/Web_Scrappers/Economictimes_Scraper/requirements.txt b/Scripts/Web_Scrappers/Economictimes_Scraper/requirements.txt
new file mode 100644
index 000000000..d59799a1c
--- /dev/null
+++ b/Scripts/Web_Scrappers/Economictimes_Scraper/requirements.txt
@@ -0,0 +1,3 @@
+requests==2.23.0
+beautifulsoup4==4.9.3
+lxml==4.5.2