# **Lok Sabha Data Scraper**

In [None]:
# Mount Google Drive at /content/drive; scrape() below saves its PDFs under
# /content/drive/MyDrive/Lok Sabha Data, so the mount must exist first.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
pip install tika

Collecting tika
  Downloading tika-1.24.tar.gz (28 kB)
Building wheels for collected packages: tika
  Building wheel for tika (setup.py) ... [?25l[?25hdone
  Created wheel for tika: filename=tika-1.24-py3-none-any.whl size=32891 sha256=77a60c4988cc2fded2b32b5fdf8ed89b3b2eeafddb90ce3e13cd20705a44111d
  Stored in directory: /root/.cache/pip/wheels/ec/2b/38/58ff05467a742e32f67f5d0de048fa046e764e2fbb25ac93f3
Successfully built tika
Installing collected packages: tika
Successfully installed tika-1.24


In [None]:
# module imports

import os
import requests
import numpy as np
import multiprocessing
from tika import parser
from bs4 import BeautifulSoup

### **Downloading all the pdfs from the Lok Sabha Digital Library**

In [None]:
# A function that scrapes the Lok Sabha Digital Library and downloads the PDFs

def _reportRequestError(err, pdfName):
  """Print a labelled message for a failed request, plus the item number."""
  # Checked in the same order the original except-chain used, so an exception
  # that belongs to several classes gets the same label as before.
  if isinstance(err, requests.exceptions.HTTPError):
    print("Http Error:", err)
  elif isinstance(err, requests.exceptions.ConnectionError):
    print("Error Connecting:", err)
  elif isinstance(err, requests.exceptions.Timeout):
    print("Timeout Error:", err)
  else:
    print("OOps: Something Else", err)
  print("Number : ", pdfName)


def scrape(path, base_dir="/content/drive/MyDrive/Lok Sabha Data", timeout=60):
  """Download the PDFs reachable from one browse URL into a Drive folder.

  The last two characters of ``path`` are used as the folder name. If that
  folder already exists under ``base_dir`` nothing is downloaded. Otherwise
  the function walks the paginated browse listing, collects the item-page
  links, opens each item page and saves the PDF it links to as
  ``<item number>.pdf`` (numbering starts at 1, in listing order).

  Request failures are printed together with the affected item number and
  the scrape carries on with the next item.

  Args:
    path: Browse URL on eparlib.nic.in whose last two characters identify
      the target folder.
    base_dir: Directory in which the per-URL folder is created.
    timeout: Seconds passed to every ``requests.get`` call so a stalled
      connection raises instead of blocking the worker forever.

  Returns:
    None.
  """
  site = "https://eparlib.nic.in"

  # extracting folder name and creating a folder in drive
  folderName = path[-2:]
  folderPath = os.path.join(base_dir, folderName) + "/"

  if os.path.exists(folderPath):
    print(folderName, " already present")
    return

  # exist_ok: several pool workers may create base_dir at the same moment.
  os.makedirs(folderPath, exist_ok=True)

  # Walk the paginated listing and collect the intermediate item-page urls.
  links = []
  url = path
  while url is not None:
    try:
      req = requests.get(url, timeout=timeout)
      req.raise_for_status()
    except requests.exceptions.RequestException as err:
      # Without this page the "next" link is unknown, so pagination stops
      # here; items collected so far are still downloaded.
      _reportRequestError(err, "listing page " + url)
      break

    soup = BeautifulSoup(req.content, 'html.parser')

    table = soup.find('table', attrs = {'summary':'This table browses all dspace content'})
    if table is not None:
      for row in table.findAll('td', attrs = {'headers':'t4'}):
        if row.a is not None:
          links.append(site + row.a['href'])

    nextLink = soup.find('a', attrs = {'class' : 'pull-right'})
    url = site + nextLink['href'] if nextLink is not None else None

  # get to the pdf from here on
  for number, link in enumerate(links):
    # Computed before any request so error reports name the item that failed
    # rather than whichever item succeeded last.
    pdfName = str(number + 1)
    pdfPath = folderPath + pdfName + ".pdf"

    try:
      req = requests.get(link, timeout=timeout)
      req.raise_for_status()
    except requests.exceptions.RequestException as err:
      _reportRequestError(err, pdfName)
      continue

    soup = BeautifulSoup(req.content, 'html.parser')
    table = soup.find('table', attrs = {'class' : 'table panel-body'})
    if table is None:
      print("No file table found")
      print("Number : ", pdfName)
      continue

    # NOTE(review): if an item page lists several files they are all written
    # to the same pdfPath, so only the last one survives -- confirm whether
    # item pages can carry more than one PDF.
    for row in table.findAll('td', attrs = {'align':'center'}):
      if row.a is None:
        continue
      pdfLink = site + row.a['href']

      # extracting and saving the pdfs
      try:
        pdfReq = requests.get(pdfLink, timeout=timeout)
        # Turn 4xx/5xx replies into HTTPError so an error page is never
        # written to disk as if it were a PDF.
        pdfReq.raise_for_status()
      except requests.exceptions.RequestException as err:
        _reportRequestError(err, pdfName)
      else:
        with open(pdfPath, 'wb') as f:
          f.write(pdfReq.content)

  print("Done with ", folderName)



if __name__ == '__main__':

  # One browse URL per Lok Sabha number, 01 through 17. The value is
  # zero-padded to two digits because scrape() takes the last two characters
  # of the URL as the destination folder name.
  browseUrl = "https://eparlib.nic.in/handle/123456789/6/browse?type=loksabhanumber&sort_by=1&order=DESC&rpp=20&value={:02d}"
  paths = [browseUrl.format(number) for number in range(1, 18)]

  # A single two-worker pool; the context manager shuts the workers down once
  # the blocking map() call has returned, so no processes are left behind.
  with multiprocessing.Pool(processes=2) as pool:
    pool.map(scrape, paths)

01  already present
02  already present
04  already present
03  already present
07  already present
08  already present
09  already present
10  already present
11  already present
12  already present
13  already present
14  already present
15  already present
16  already present
17  already present
Error Connecting: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Number :  258
Error Connecting: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Number :  260
Error Connecting: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Number :  270
Done with  05
06  already present
