# Sachin Jeshani

# Web scraping news article images for OCR

- Importing useful libraries

In [1]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
import selenium
import os
import urllib.request
import shutil
import time

import warnings
# Suppress every Python warning for the rest of the session. Note that this
# also hides deprecation warnings raised by any library used in this notebook.
warnings.filterwarnings("ignore")

- Start date and end date

In [2]:
import datetime

# First and last edition dates to scrape; both ends are included.
start = datetime.datetime.strptime("2021-01-18", "%Y-%m-%d")
end = datetime.datetime.strptime("2021-01-20", "%Y-%m-%d")

# One datetime per calendar day from start through end.
day_count = (end - start).days + 1
date_generated = [start + datetime.timedelta(days=offset) for offset in range(day_count)]

# ISO "YYYY-MM-DD" strings, one per generated day.
date_str = [moment.date().isoformat() for moment in date_generated]

date_str

['2021-01-18', '2021-01-19', '2021-01-20']

- List of urls

In [3]:
# Edition viewer address; the date string is appended as the last query parameter.
base_url = 'http://epaper.kutchmitradaily.com/viewpage.php?edition=Kutchmitra%20Main&edid=KUTCHMITRA_KUT&date='

# One viewer URL per requested date, in the same order as date_str.
urls = [base_url + day for day in date_str]

urls

['http://epaper.kutchmitradaily.com/viewpage.php?edition=Kutchmitra%20Main&edid=KUTCHMITRA_KUT&date=2021-01-18',
 'http://epaper.kutchmitradaily.com/viewpage.php?edition=Kutchmitra%20Main&edid=KUTCHMITRA_KUT&date=2021-01-19',
 'http://epaper.kutchmitradaily.com/viewpage.php?edition=Kutchmitra%20Main&edid=KUTCHMITRA_KUT&date=2021-01-20']

- Chrome webdriver

In [4]:
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver

# Command-line switches handed to Chrome, applied in this order.
chrome_flags = (
    '--ignore-certificate-errors',
    '--incognito',
    '--headless',
    '--no-sandbox',
)

options = webdriver.ChromeOptions()
for flag in chrome_flags:
    options.add_argument(flag)

# The value returned by ChromeDriverManager().install() is passed to Chrome as
# its first positional argument.
# NOTE(review): newer Selenium releases appear to expect a Service object
# instead of a positional driver path -- confirm against the installed version.
driver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(driver_path, options=options)

[WDM] - Current google-chrome version is 87.0.4280
[WDM] - Get LATEST driver version for 87.0.4280
[WDM] - Driver [C:\Users\Sachin\.wdm\drivers\chromedriver\win32\87.0.4280.88\chromedriver.exe] found in cache


 


- Get page sources

In [5]:
from selenium.webdriver.common.by import By

# Rendered HTML of each edition page, in the same order as `urls`.
page_sources = []
for url in urls:
    driver.get(url)
    # find_elements(By.CLASS_NAME, ...) is the generic locator call; the
    # find_elements_by_* shortcuts are not available in newer Selenium releases.
    more_buttons = driver.find_elements(By.CLASS_NAME, "box")
    for button in more_buttons:
        if button.is_displayed():
            # Trigger the click through JavaScript instead of WebElement.click().
            driver.execute_script("arguments[0].click();", button)
            # Pause one second after each click before moving on.
            time.sleep(1)
    page_src = driver.page_source
    page_sources.append(page_src)

In [6]:
# Show the captured HTML for inspection (this echoes the full source of every page).
page_sources

['<html><head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">\n<meta http-equiv="X-UA-Compatible" content="IE=IE9,chrome=1">\n<title>Kutchmitra epaper-Local to Global News</title>\n<meta name="keywords" content="Read digital edition of Janma Bhoomi Today daily newspaper. Find Janma Bhoomi Today all Newspapers Online including Main Editions, Tabloid and Supplements at Janma Bhoomi Today ePaper Site.">\n<meta name="description" content="Read digital edition of Janma Bhoomi Today daily newspaper. Find Janma Bhoomi Today all Newspapers Online including Main Editions, Tabloid and Supplements at Janma Bhoomi Today ePaper Site.">\n<meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=0.5, maximum-scale=2.0, user-scalable=yes">\n<link href="//cjss.enewspapr.com/kutchmitradaily/includes/images/favicon-new.ico" rel="shortcut icon">\n\n\n<script src="https://s0.2mdn.net/instream/video/client.js" async="" type="text/javascript"></script><script src

- Parse each page source into a BeautifulSoup object using the 'lxml' parser

In [7]:
# Parse every captured page with the 'lxml' parser, keeping the order of page_sources.
soups = [bs(html, 'lxml') for html in page_sources]

- Find all anchors with class 'box' in each parsed page

In [8]:
# For each parsed page, collect its <a class="box"> elements (one result set per page).
boxes_list = [page.find_all('a', class_='box') for page in soups]

In [9]:
# Show the matched <a class="box"> elements, grouped per page.
boxes_list

[[<a class="box" data-src="News/KUTCHMITRA/KUT/2021/01/18/ArticleImages/1C5B0A6.jpg" id="KUTCHMITRA_KUT_20210118_1_1" style="width:110px;height:91px;left:470px;top:43px;" title="Click here to read article" width="110px"></a>,
  <a class="box" data-src="News/KUTCHMITRA/KUT/2021/01/18/ArticleImages/1488F5C.jpg" id="KUTCHMITRA_KUT_20210118_1_10" style="width:110px;height:173px;left:190px;top:425px;" title="Click here to read article" width="110px"></a>,
  <a class="box" data-src="News/KUTCHMITRA/KUT/2021/01/18/ArticleImages/1435431.jpg" id="KUTCHMITRA_KUT_20210118_1_11" style="width:179px;height:169px;left:11px;top:425px;" title="Click here to read article" width="179px"></a>,
  <a class="box" data-src="News/KUTCHMITRA/KUT/2021/01/18/ArticleImages/1C3C5ED.jpg" id="KUTCHMITRA_KUT_20210118_1_12" style="width:227px;height:161px;left:368px;top:597px;" title="Click here to read article" width="227px"></a>,
  <a class="box" data-src="News/KUTCHMITRA/KUT/2021/01/18/ArticleImages/17EA765.jpg" id=

- Collect the data-src links, which point to the article images

In [10]:
# Host prefix prepended to each relative data-src path.
image_base_url = 'https://www.enewspapr.com/'

# One list of absolute image URLs per page, in the same order as boxes_list.
data_src_list = []

for boxes in boxes_list:
    data_src = []
    for box in boxes:
        src = box.get('data-src')
        # .get() returns None when the attribute is absent; skip such anchors
        # instead of failing on the string concatenation below.
        if not src:
            continue
        data_src.append(image_base_url + src)
    data_src_list.append(data_src)

In [11]:
# Show the absolute article image URLs, one list per page.
data_src_list

[['https://www.enewspapr.com/News/KUTCHMITRA/KUT/2021/01/18/ArticleImages/1C5B0A6.jpg',
  'https://www.enewspapr.com/News/KUTCHMITRA/KUT/2021/01/18/ArticleImages/1488F5C.jpg',
  'https://www.enewspapr.com/News/KUTCHMITRA/KUT/2021/01/18/ArticleImages/1435431.jpg',
  'https://www.enewspapr.com/News/KUTCHMITRA/KUT/2021/01/18/ArticleImages/1C3C5ED.jpg',
  'https://www.enewspapr.com/News/KUTCHMITRA/KUT/2021/01/18/ArticleImages/17EA765.jpg',
  'https://www.enewspapr.com/News/KUTCHMITRA/KUT/2021/01/18/ArticleImages/1BBADC8.jpg',
  'https://www.enewspapr.com/News/KUTCHMITRA/KUT/2021/01/18/ArticleImages/151BE06.jpg',
  'https://www.enewspapr.com/News/KUTCHMITRA/KUT/2021/01/18/ArticleImages/1F466FE.jpg',
  'https://www.enewspapr.com/News/KUTCHMITRA/KUT/2021/01/18/ArticleImages/1E03FFD.jpg',
  'https://www.enewspapr.com/News/KUTCHMITRA/KUT/2021/01/18/ArticleImages/1C1FAE9.jpg',
  'https://www.enewspapr.com/News/KUTCHMITRA/KUT/2021/01/18/ArticleImages/1383DCF.jpg',
  'https://www.enewspapr.com/New

- Map each date (key) to its list of article image links (value)

In [12]:
# Pair every date with the image links scraped for it. Both lists hold one
# entry per date in the same order, so the full date list is zipped: taking
# only a tail slice of date_str would pair dates with the wrong links as soon
# as the range is longer than the slice.
assert len(date_str) == len(data_src_list), "expected one list of image links per date"
result = dict(zip(date_str, data_src_list))

- Create a folder for each date and save every article image into its date folder

In [13]:
# Download every article image into a folder named after its date.
# The loop variables are deliberately not called `urls`/`url`, so the
# notebook-level `urls` list of page URLs is not overwritten by this cell.
for date, image_urls in result.items():
    # exist_ok lets the cell be re-run without failing on folders that already exist.
    os.makedirs(date, exist_ok=True)
    for image_url in image_urls:
        # The file keeps the last path segment of the URL as its name.
        filename = date+'/'+image_url.split("/")[-1]
        # Stream the body; the with-block releases the connection once the copy
        # is done, and the timeout keeps a stalled server from hanging the cell.
        with requests.get(image_url, stream = True, timeout = 30) as r:
            if r.status_code == 200:
                # Ask the raw stream to undo any content encoding while it is read.
                r.raw.decode_content = True
                with open(filename,'wb') as f:
                    shutil.copyfileobj(r.raw, f)
                print('Image sucessfully Downloaded: ',filename)
            else:
                # Name the failing URL so a missing image can be traced.
                print('Image Couldn\'t be retreived', image_url)

Image sucessfully Downloaded:  2021-01-18/1C5B0A6.jpg
Image sucessfully Downloaded:  2021-01-18/1488F5C.jpg
Image sucessfully Downloaded:  2021-01-18/1435431.jpg
Image sucessfully Downloaded:  2021-01-18/1C3C5ED.jpg
Image sucessfully Downloaded:  2021-01-18/17EA765.jpg
Image sucessfully Downloaded:  2021-01-18/1BBADC8.jpg
Image sucessfully Downloaded:  2021-01-18/151BE06.jpg
Image sucessfully Downloaded:  2021-01-18/1F466FE.jpg
Image sucessfully Downloaded:  2021-01-18/1E03FFD.jpg
Image sucessfully Downloaded:  2021-01-18/1C1FAE9.jpg
Image sucessfully Downloaded:  2021-01-18/1383DCF.jpg
Image sucessfully Downloaded:  2021-01-18/1BC8522.jpg
Image sucessfully Downloaded:  2021-01-18/1B8AA62.jpg
Image sucessfully Downloaded:  2021-01-18/1A424BF.jpg
Image sucessfully Downloaded:  2021-01-18/167BB79.jpg
Image sucessfully Downloaded:  2021-01-18/1E0874A.jpg
Image sucessfully Downloaded:  2021-01-18/1C5DB16.jpg
Image sucessfully Downloaded:  2021-01-18/1AEA948.jpg
Image sucessfully Downloaded

Image sucessfully Downloaded:  2021-01-19/1EA2E1A.jpg
Image sucessfully Downloaded:  2021-01-19/1A56157.jpg
Image sucessfully Downloaded:  2021-01-19/1A743A6.jpg
Image sucessfully Downloaded:  2021-01-19/164B72D.jpg
Image sucessfully Downloaded:  2021-01-19/1F4A2E3.jpg
Image sucessfully Downloaded:  2021-01-19/1CD5518.jpg
Image sucessfully Downloaded:  2021-01-19/10DAE88.jpg
Image sucessfully Downloaded:  2021-01-19/1E1AAAA.jpg
Image sucessfully Downloaded:  2021-01-19/1796F05.jpg
Image sucessfully Downloaded:  2021-01-19/11C4F8D.jpg
Image sucessfully Downloaded:  2021-01-19/1968613.jpg
Image sucessfully Downloaded:  2021-01-19/155A81B.jpg
Image sucessfully Downloaded:  2021-01-19/12AD6CD.jpg
Image sucessfully Downloaded:  2021-01-19/1ED2987.jpg
Image sucessfully Downloaded:  2021-01-19/1AAD144.jpg
Image sucessfully Downloaded:  2021-01-19/1F274B3.jpg
Image sucessfully Downloaded:  2021-01-19/1B7306C.jpg
Image sucessfully Downloaded:  2021-01-19/177DE35.jpg
Image sucessfully Downloaded

Image sucessfully Downloaded:  2021-01-19/170584C.jpg
Image sucessfully Downloaded:  2021-01-19/15C8078.jpg
Image sucessfully Downloaded:  2021-01-19/10749B3.jpg
Image sucessfully Downloaded:  2021-01-19/1479078.jpg
Image sucessfully Downloaded:  2021-01-19/1AB38E8.jpg
Image sucessfully Downloaded:  2021-01-19/1107183.jpg
Image sucessfully Downloaded:  2021-01-19/19E7266.jpg
Image sucessfully Downloaded:  2021-01-19/18EC8DA.jpg
Image sucessfully Downloaded:  2021-01-19/1B9A17B.jpg
Image sucessfully Downloaded:  2021-01-19/1F4B04A.jpg
Image sucessfully Downloaded:  2021-01-19/1C8085B.jpg
Image sucessfully Downloaded:  2021-01-19/1A30D23.jpg
Image sucessfully Downloaded:  2021-01-19/1DAAFC2.jpg
Image sucessfully Downloaded:  2021-01-19/1CD2C5F.jpg
Image sucessfully Downloaded:  2021-01-19/126D110.jpg
Image sucessfully Downloaded:  2021-01-20/1476EEA.jpg
Image sucessfully Downloaded:  2021-01-20/14047A0.jpg
Image sucessfully Downloaded:  2021-01-20/130623A.jpg
Image sucessfully Downloaded

Image sucessfully Downloaded:  2021-01-20/13D0B34.jpg
