In [1]:
import wget
import ssl
import os
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
# Enter NCRB Individual PDF url
pdf_url = 'https://ncrb.gov.in/sites/default/files/adsi_reports_previous_year/Table-1.4_2019.pdf'
fileName = pdf_url[67:]
try:
    #Delete the file to redownload it
    os.remove(fileName)
except FileNotFoundError as e:
    #Ignore in case FileNotFound Error occurs
    pass
# Download report
fileName = wget.download(pdf_url,fileName)

In [3]:
import numpy as np
import pandas as pd
import camelot

In [4]:
fileName

'Table-1.4_2019.pdf'

## What the structure of the final DataFrame is supposed to look like after stitching(concatinating)?

| tableA   | tableB   | tableC   |
| -------- | -------- | -------- |
| Page1    | Page3    | Page5    |
| Page2    | Page4    | Page6    |

In [5]:
tables = camelot.read_pdf(fileName,
                          flavor='stream',
                          table_areas=[
                              "25.972342487883687,739.1354547346276,569.4676575121164,52.29867117593344"
                          ],
                          columns=[
                              "49.0588691437803,149.1004846526656,183.73027463651053,222.20781906300488,258.7614862681745,289.54352180936996,326.0971890145396,360.72697899838454,387.66126009693056,425.1768659127626,459.8066558966075,498.28420032310186,530.0281744749597"
                          ],
                          split_text=True,
                          pages="1-4"
                         )

In [6]:
tables5_6 = camelot.read_pdf(fileName,
                          flavor='stream',
                          table_areas=[
                              "25.972342487883687,732.0169841376875,556.4814862681745,69.61388420682489"
                          ],
                          columns=[
                              "61.56407108239096,239.52271405492732,318.40168012924073,394.3948303715671,470.38798061389343"
                          ],
                          split_text=True,
                          pages="5-6"
                         )

In [7]:
tables

<TableList n=4>

In [8]:
def printnAllTables(tables):
    for i in range(len(tables)):
        print("Page No: ",(i+1))
        print(tables[i].df)
        print()

In [9]:
# Incase we're intersted in seeing the original scraped tables.
# Commented as it takes too much screen space
# Run if you want
# printnAllTables(tables)
printnAllTables(tables5_6)

Page No:  1
         0                                 1       2       3            4  \
0                                               Male  Female  Transgender   
1      (1)                               (2)    (27)    (28)         (29)   
2                                     STATES                                
3        1                    ANDHRA PRADESH   15002    2934            2   
4        2                 ARUNACHAL PRADESH     236      39            0   
5        3                             ASSAM    4495     941            0   
6        4                             BIHAR   12032    3179            0   
7        5                      CHHATTISGARH   14638    5150            1   
8        6                               GOA     685     110            0   
9        7                           GUJARAT   19717    4191            2   
10       8                           HARYANA   13271    2367            1   
11       9                  HIMACHAL PRADESH    2724     532    

In [10]:
# Removes Garbage Rows that may have been detected Before the 'STATES' row
# AND
# Removes Garbage Rows that may have been detected after the 'TOTAL (ALL INDIA)' row
def clean(table):
    while( (table.df.iloc[0,1] != 'STATES') and (table.df.iloc[0,1] != 'AGRA') ):
        table.df = table.df.iloc[1:]
    while( (table.df.iloc[-1,1] != 'TOTAL (ALL INDIA)') and (table.df.iloc[-1,1] != 'TOTAL (CITIES)') ):
        table.df = table.df.iloc[:-1]
    # The Return line is unnecessary due to all changes being reflected in the original references
    return table

In [11]:
for each_table in tables:
    clean(each_table)
for each_table in tables5_6:
    clean(each_table)

In [12]:
df1 = tables[0].df.copy()
df2 = tables[1].df.copy()
df3 = tables[2].df.copy()
df4 = tables[3].df.copy()
df5 = tables5_6[0].df.copy()
df6 = tables5_6[1].df.copy()

In [13]:
tableA = pd.concat([df1,df2])
tableB = pd.concat([df3,df4])
tableC = pd.concat([df5,df6])

In [14]:
# Incase we're intersted in seeing the scraped tables at this point.
# Commented as it takes too much screen space
# Run if you want
# printnAllTables(tables)

In [15]:
# List of Column headers as column are nested, i.e exhibit a multi level/index structure
# The multi level structre will be generated and applied in the next 
column_header = [
    ["Below 14 years","14 and Above – Below 18 years","18 and Above – Below 30 years"],
    ["30 and Above – Below 45 years","45 and Above–Below 60 years","60 years & Above"]
]
len(column_header)

2

In [16]:
def set_Column(df, title):
    column_list = [
        ("Sl. No.",'',''),
        ("State/UT",'',''), 
        (title[0],'No. of Persons Injured','Male'),
        (title[0],'No. of Persons Injured','Female'),
        (title[0],'No. of Persons Injured','Transgender'),
        (title[0],'No. of Persons Injured','Total'),
        (title[1],'No. of Persons Died','Male'),
        (title[1],'No. of Persons Died','Female'),
        (title[1],'No. of Persons Died','Transgender'),
        (title[1],'No. of Persons Died','Total'),
        (title[2],'No. of Persons Died','Male'),
        (title[2],'No. of Persons Died','Female'),
        (title[2],'No. of Persons Died','Transgender'),
        (title[2],'No. of Persons Died','Total')
    ]
    df.columns = pd.MultiIndex.from_tuples(column_list)

In [17]:
# TableC not included here as it has a different column format, it will be done sperately
tables = [tableA,tableB]

In [18]:
for i in range(len(tables)):
    set_Column(tables[i], column_header[i])
    tables[i].set_index(["Sl. No.","State/UT"],inplace=True)

In [19]:
column_list = [
    ("Sl. No.",'',''),
    ("State/UT",'',''), 
    ("Total",'No. of Persons Injured','Male'),
    ("Total",'No. of Persons Injured','Female'),
    ("Total",'No. of Persons Injured','Transgender'),
    ("Total",'No. of Persons Injured','Total')
]
tableC.columns = pd.MultiIndex.from_tuples(column_list)
tableC.set_index(["Sl. No.","State/UT"],inplace=True)

In [20]:
# Incase we're intersted in seeing the scraped tables at this point.
# Commented as it takes too much screen space
# Run if you want
# printnAllTables(tables)

# OR Print a simgle one according to your choice

In [21]:
# Convert List of tables to list of DataFrames
DataFrames = [tableA,tableB,tableC]
# Concat all the tables one after the other, as they all have the same indexes.
final = pd.concat(DataFrames,axis=1)

In [22]:
final

Unnamed: 0_level_0,Unnamed: 1_level_0,Below 14 years,Below 14 years,Below 14 years,Below 14 years,14 and Above – Below 18 years,14 and Above – Below 18 years,14 and Above – Below 18 years,14 and Above – Below 18 years,18 and Above – Below 30 years,18 and Above – Below 30 years,...,45 and Above–Below 60 years,45 and Above–Below 60 years,60 years & Above,60 years & Above,60 years & Above,60 years & Above,Total,Total,Total,Total
Unnamed: 0_level_1,Unnamed: 1_level_1,No. of Persons Injured,No. of Persons Injured,No. of Persons Injured,No. of Persons Injured,No. of Persons Died,No. of Persons Died,No. of Persons Died,No. of Persons Died,No. of Persons Died,No. of Persons Died,...,No. of Persons Died,No. of Persons Died,No. of Persons Died,No. of Persons Died,No. of Persons Died,No. of Persons Died,No. of Persons Injured,No. of Persons Injured,No. of Persons Injured,No. of Persons Injured
Unnamed: 0_level_2,Unnamed: 1_level_2,Male,Female,Transgender,Total,Male,Female,Transgender,Total,Male,Female,...,Transgender,Total,Male,Female,Transgender,Total,Male,Female,Transgender,Total
Sl. No.,State/UT,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3
,STATES,,,,,,,,,,,...,,,,,,,,,,
1,ANDHRA PRADESH,326,143,0,469,544,137,0,681,3725,530,...,0,4413,1602,551,0,2153,15002,2934,2,17938
2,ARUNACHAL PRADESH,10,9,0,19,7,4,0,11,91,14,...,0,38,3,1,0,4,236,39,0,275
3,ASSAM,135,67,0,202,348,147,0,495,1574,327,...,0,919,208,30,0,238,4495,941,0,5436
4,BIHAR,1136,420,0,1556,1903,536,0,2439,3942,834,...,0,2030,700,236,0,936,12032,3179,0,15211
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,VARANASI,1,0,0,1,6,2,0,8,55,9,...,0,89,84,26,0,110,283,70,0,353
87,VASAI VIRAR,54,38,0,92,21,9,0,30,171,59,...,0,265,90,59,0,149,867,257,0,1124
88,VIJAYAWADA,6,1,0,7,6,1,0,7,113,5,...,0,132,64,24,0,88,456,77,1,534
89,VISHAKHAPATNAM,12,11,0,23,24,6,0,30,183,42,...,0,230,60,25,0,85,686,188,0,874


In [23]:
# Reset Index before saving file, for better formatting in RAW CSV
final.reset_index().to_csv("Age and Gender-wise Distribution of Total Accidental Deaths during 2019.csv",index=False)