In [1]:
import wget
import ssl
import os
# NOTE(review): this replaces the default HTTPS context factory with the
# "unverified" one, so HTTPS requests that rely on that default are made
# without certificate verification for the rest of the session.
# Presumably a workaround for a certificate problem on the download host —
# confirm it is still required before keeping it.
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
pdf_url = 'https://ncrb.gov.in/sites/default/files/adsi_reports_previous_year/Table-1.8_2019.pdf'
# Take the file name from the last path segment of the URL rather than from a
# hard-coded character offset, so editing the URL cannot silently produce a
# wrong (or empty) local file name.
fileName = pdf_url.rsplit('/', 1)[-1]
try:
    # Delete any earlier copy so the file is downloaded afresh
    os.remove(fileName)
except FileNotFoundError:
    # No earlier copy exists (e.g. first run) - nothing to delete
    pass
# Keep the name reported back by wget.download for the following cells
fileName = wget.download(pdf_url, fileName)

In [3]:
import numpy as np
import pandas as pd
import camelot

In [4]:
# Display the local file name that the download cell stored in fileName
fileName

'Table-1.8_2019.pdf'

In [5]:
#Scrape on the pages 1-4 as they have the same schema
# Pages 1-4 have the same schema, so one table area and one set of column
# separators (both passed as comma-separated coordinate strings) are used
# for all four pages.
pages_1_to_4_area = "28.858158319870764,731.4398044986758,567.5437802907917,71.53779676581283"
pages_1_to_4_separators = "49.0588691437803,150.06242326332796,188.5399676898223,225.09363489499194,262.60924071082394,292.42933764135705,328.02106623586434,362.6508562197092,393.43289176090474,426.13880452342494,459.8066558966075,494.4364458804524,530.990113085622"
tables = camelot.read_pdf(
    fileName,
    pages="1-4",
    flavor='stream',
    table_areas=[pages_1_to_4_area],
    columns=[pages_1_to_4_separators],
    split_text=True,
)

In [6]:
# Display the TableList returned by camelot.read_pdf to see how many tables were found
tables

<TableList n=4>

In [7]:
def printnAllTables(tables):
    """Print the DataFrame of every table, each under a 1-based "Page No" line.

    `tables` is any sequence whose items expose a `.df` attribute. A blank
    line is printed after each table. Returns None.
    """
    for page_no, page_table in enumerate(tables, start=1):
        print("Page No: ", page_no)
        print(page_table.df)
        print()

In [8]:
# In case we're interested in seeing the original scraped tables.
# Commented out as it takes too much screen space.
# Run it if you want:
# printnAllTables(tables)

In [9]:
# Removes Garbage Rows that may have been detected Before the 'STATES' row
# AND
# Removes Garbage Rows that may have been detected after the 'TOTAL (ALL INDIA)' row
def clean(table):
    """Trim ``table.df`` to the rows from 'STATES' through 'TOTAL (ALL INDIA)'.

    Both markers are looked up in the second column of ``table.df``. Rows
    before the first 'STATES' row and rows after the last
    'TOTAL (ALL INDIA)' row are dropped; the remaining rows keep their
    original index labels.

    Parameters
    ----------
    table : object with a ``df`` attribute holding a pandas DataFrame
        ``table.df`` is replaced by the trimmed frame.

    Returns
    -------
    The same ``table`` object. The return value is a convenience only,
    because the change is made on the object that was passed in.

    Raises
    ------
    ValueError
        If there is no 'STATES' row, or no 'TOTAL (ALL INDIA)' row at or
        after it. ``table.df`` is left untouched in that case (the earlier
        row-by-row trimming emptied the frame and then failed with an
        IndexError that did not say what was missing).
    """
    labels = table.df.iloc[:, 1].tolist()
    if 'STATES' not in labels:
        raise ValueError("clean(): no 'STATES' row found in the second column")
    first = labels.index('STATES')
    # Position of the last 'TOTAL (ALL INDIA)' row, or -1 when there is none
    last = max(
        (pos for pos, label in enumerate(labels) if label == 'TOTAL (ALL INDIA)'),
        default=-1,
    )
    if last < first:
        raise ValueError(
            "clean(): no 'TOTAL (ALL INDIA)' row found after the 'STATES' row"
        )
    table.df = table.df.iloc[first:last + 1]
    return table

In [10]:
# Trim every scraped table; clean() updates each table's .df itself,
# so its return value is not needed here.
for page_table in tables:
    clean(page_table)

In [11]:
# In case we're interested in seeing the scraped tables at this point.
# Commented out as it takes too much screen space.
# Run it if you want:
# printnAllTables(tables)

In [12]:
# Top-level column headers for pages 1-4, one inner list per page.
# The columns are nested (multi-level): each cause listed here is combined with
# its sub-columns when the MultiIndex is built in set_Column.
column_header = [
    ["Avalanche","Exposure to Cold","Cyclone"],
    # Spelling fixed (was "Earthqauke"): this text ends up in the exported CSV header
    ["Tornado","Tsunami","Earthquake"],
    ["Epidemic","Flood","Heat/Sun Stroke"],
    ["Landslide","Lightning","Torrential Rain"]
]
len(column_header)

4

In [13]:
def set_Column(df, title):
    """Replace ``df``'s columns, in place, with a two-level header.

    The first two columns become ("Sl. No.", '') and ("State/UT", '').
    Every entry of ``title`` then contributes four columns, in this order:
    Male, Female, Transgender, Total.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are overwritten. It must have exactly
        ``2 + 4 * len(title)`` columns; otherwise pandas raises ValueError
        on the assignment.
    title : sequence of str
        Top-level header names in left-to-right column order. Any number of
        names is accepted. (The earlier version read exactly ``title[0]``
        to ``title[2]``: shorter input failed and longer input was silently
        truncated.)

    Returns
    -------
    None
    """
    breakdown = ('Male', 'Female', 'Transgender', 'Total')
    column_list = [("Sl. No.", ''), ("State/UT", '')]
    column_list += [(cause, part) for cause in title for part in breakdown]
    df.columns = pd.MultiIndex.from_tuples(column_list)

In [14]:
# Give each table its two-level header (table i uses column_header[i]), then
# move the serial number and the state/UT name into the row index.
for page_index, page_table in enumerate(tables):
    set_Column(page_table.df, column_header[page_index])
    page_table.df.set_index(["Sl. No.", "State/UT"], inplace=True)

In [15]:
# In case we're interested in seeing the scraped tables at this point.
# Commented out as it takes too much screen space.
# Run it if you want:
# printnAllTables(tables)

# OR print a single one of your choice (valid indexes here are 0-3)
# tables[0].df

In [16]:
# Copy each table's DataFrame out of its table object, giving a plain list of
# independent frames.
DataFrames = [page_table.df.copy() for page_table in tables]
# Put the frames side by side; rows are matched on the
# (Sl. No., State/UT) index that every frame carries.
final = pd.concat(DataFrames, axis="columns")

#### Up to page 4 is done <br>Now for Page 5 and Page 6 separately (they have varying schemas)

In [17]:
# Page 5 has its own layout, so it gets its own table area and column
# separators (both passed as comma-separated coordinate strings).
page_5_area = "32.7059127625202,725.668066821712,537.7236833602585,72.4997530453068"
page_5_separators = "51.94468497576737,177.95864297253638,227.01751211631665,272.22862681744755,323.21137318255256,363.6127948303716,408.82390953150247,454.0350242326333,492.5125686591277"
table5 = camelot.read_pdf(
    fileName,
    pages="5",
    flavor='stream',
    table_areas=[page_5_area],
    columns=[page_5_separators],
    split_text=True,
)

In [18]:
# Page 5 holds two causes; each one gets the four
# Male / Female / Transgender / Total sub-columns.
# NOTE(review): unlike pages 1-4, this table is not passed through clean().
# Confirm the scraped frame has no stray rows, otherwise its index will not
# line up with `final` when the two are concatenated.
title = ["Forest Fire","Causes Other than above"]
column_list = [("Sl. No.",''), ("State/UT",'')] + [
    (cause, part)
    for cause in title
    for part in ('Male', 'Female', 'Transgender', 'Total')
]
table5[0].df.columns = pd.MultiIndex.from_tuples(column_list)
# Same row index as the earlier pages: serial number + state/UT name
table5[0].df.set_index(['Sl. No.', 'State/UT'], inplace=True)

In [19]:
# Append the page-5 columns to the right of the combined frame.
# Note: this cell reads and overwrites `final`, so running it a second time
# adds the page-5 columns again.
final = pd.concat([final, table5[0].df], axis="columns")

#### Scraping page 6 last

In [20]:
# Page 6 has its own layout as well: separate table area and column
# separators (both passed as comma-separated coordinate strings).
page_6_area = "25.972342487883687,712.2006789087965,525.6994507269791,82.11931584024649"
page_6_separators = "61.56407108239096,214.512310177706,266.4569951534734,325.13525040387725,376.11799676898227,451.1492084006463"
table6 = camelot.read_pdf(
    fileName,
    pages="6",
    flavor='stream',
    table_areas=[page_6_area],
    columns=[page_6_separators],
    split_text=True,
)

In [21]:
# Page 6 holds the overall totals (four sub-columns) followed by a single
# "Percentage Share" column that has no sub-level.
title = ["Total"]
column_list = (
    [("Sl. No.",''), ("State/UT",'')]
    + [(title[0], part) for part in ('Male', 'Female', 'Transgender', 'Total')]
    + [("Percentage Share",'')]
)
table6[0].df.columns = pd.MultiIndex.from_tuples(column_list)
# Same row index as the earlier pages: serial number + state/UT name
table6[0].df.set_index(['Sl. No.', 'State/UT'], inplace=True)

## Combining final results to export

In [22]:
# Append the page-6 columns to the right of the combined frame.
# Note: this cell reads and overwrites `final`, so running it a second time
# adds the page-6 columns again.
final = pd.concat([final, table6[0].df], axis="columns")

In [23]:
# Turn the (Sl. No., State/UT) index back into ordinary columns so they appear
# as regular fields in the raw CSV, then write the file without an extra
# row-index column.
csv_ready = final.reset_index()
csv_ready.to_csv(
    "Gender-wise Distribution of Accidental Deaths due to Forces of Nature during 2019 (State & UT-wise).csv",
    index=False,
)