In [1]:
import wget
import ssl
import os
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
# Enter NCRB Individual PDF url
pdf_url = 'https://ncrb.gov.in/sites/default/files/adsi_reports_previous_year/Table-1.6_2019.pdf'
fileName = pdf_url[67:]
try:
    #Delete the file to redownload it
    os.remove(fileName)
except FileNotFoundError as e:
    #Ignore in case FileNotFound Error occurs
    pass
# Download report
fileName = wget.download(pdf_url,fileName)

In [3]:
import numpy as np
import pandas as pd
import camelot

In [4]:
fileName

'Table-1.6_2019.pdf'

In [5]:
#Scraping All Pages 
tables = camelot.read_pdf(fileName,
                          flavor='stream',
                          table_areas=["15.39101777059774,712.777858547808,584.8586752827141,107.70735874610126"],
                          columns=["38.47754442649435,171.22507269789986,212.5884329563813,256.83760904684976,296.2770920840065,335.7165751211632,372.27024232633283,399.20452342487886,437.68206785137323,476.1596122778676,514.6371567043619,543.4953150242327"],
                          split_text=True,
                          row_tol=12,
                          strip_text='\n\t',
                          pages="1-2"
                         )

In [6]:
tables

<TableList n=2>

In [7]:
def printnAllTables(tables):
    for i in range(len(tables)):
        print("Page No: ",(i+1))
        print(tables[i].df)
        print()

In [8]:
# Incase we're intersted in seeing the original scraped tables.
# Commented as it takes too much screen space
# Run if you want
printnAllTables(tables)

Page No:  1
         0                                                  1       2   \
0   Sl. No.                                              Cause           
1                                                                 2018   
2       (1)                                                (2)     (3)   
3         1                                          Air Crash       3   
4         2                                     Ship Accidents       0   
5         3                      Collapse of Structure (Total)    1953   
6            3.1 Collapse of Dwelling     House/Residential...    1201   
7            3.2 Collapse of Official/      Commercial Buil...      47   
8                                          3.3 Collapse of Dam       4   
9                                      3.4 Collapse of  Bridge      17   
10                                                  3.5 Others     684   
11        4                                   Drowning (Total)   29696   
12                        

In [9]:
# Removes Garbage Rows that may have been detected Before the 'Air Crash' or 'Sudden Deaths (Total)' row
# AND
# Removes Garbage Rows that may have been detected after the 'TOTAL' or 'Stampede' row
def clean(table):
    while(table.df.iloc[0,1] not in  ['Air Crash','Sudden Deaths (Total)']):
        table.df = table.df.iloc[1:]
    while(table.df.iloc[-1,1] not in  ['Total', 'Stampede']):
        table.df = table.df.iloc[:-1]
    # The Return line is unnecessary due to all changes being reflected in the original references
    return table

In [10]:
for each_table in tables:
    clean(each_table)

In [11]:
# Incase we're intersted in seeing the scraped tables at this point.
# Commented as it takes too much screen space
# Run if you want
# printnAllTables(tables)

In [12]:
df1 = tables[0].df
df2 = tables[1].df

In [13]:
tableA = pd.concat([df1,df2],axis=0)

In [14]:
tableA

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
3,1,Air Crash,3,2,-33.3,0,0,0,0,11,1,0,12
4,2,Ship Accidents,0,0,-,0,0,0,0,0,0,0,0
5,3,Collapse of Structure (Total),1953,1866,-4.5,134,64,2,200,1363,566,0,1929
6,,3.1 Collapse of Dwelling House/Residential...,1201,1218,1.4,103,43,0,146,867,374,0,1241
7,,3.2 Collapse of Official/ Commercial Buil...,47,89,89.4,7,3,0,10,89,15,0,104
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20,19,Suffocation,1896,1584,-16.5,16,9,0,25,1208,390,0,1598
21,20,Drug Overdose,895,729,-18.5,33,1,0,34,594,110,0,704
22,21,Other than above Causes,59042,59321,0.5,266,73,0,339,45616,12953,7,58576
23,22,Causes Not Known,14574,17524,20.2,117,39,0,156,13212,3452,2,16666


In [15]:
# Interate through entire IndexList to get the rows whose States need to be split
# e.g 
# Sl. No      State
#             7.1 Fall from Height
#         to
# Sl. No      State
# 7.1         Fall from Height

i = 0
for index, each in tableA.iterrows():
    print(i, each[1])
    i+=1

0 Air Crash
1 Ship Accidents
2 Collapse of Structure (Total)
3 3.1 Collapse of Dwelling     House/Residential Building
4 3.2 Collapse of Official/      Commercial Building
5 3.3 Collapse of Dam
6 3.4 Collapse of  Bridge
7 3.5 Others
8 Drowning (Total)
9 4.1 Boat Capsize
10 4.2 Accidental Falls into      Waterbody
11 4.3 Other Cases
12 Electrocution
13 Accidental Explosion (Total)
14 6.1 Domestic Gas Cylinder
15 6.2 Industrial Boiler/      Gas Cylinder Explosion
16 6.3 Ammunition Explosion in       Armed Forces/Police/CPMF
17 6.4 Other
18 Falls (Total)
19 7.1 Fall from Height
20 7.2 Fall from Vehicles      (Automobile like Bus,      Trucks, etc.)
21 7.3 Fall into Manhole
22 7.4 Fall into Pit
23 7.5 Fall into Borewell
24 7.6 Others
25 Factory/Machine Accidents
26 Accidental Fire (Total)
27 9.1 Electrical Short circuit
28 9.2 Riot/Agitation
29 9.3 Fireworks
30 9.4 Cooking Gas Cylinder/       Stove Burst
31 9.5 Other Causes
32 Firearm
33 Mines or Quarry Disaster
34 Traffic Accidents (Total

Such index ranges are:
1. 3-7
2. 9-11
3. 14-17
4. 19-24
5. 27-31
6. 35-37
7. 40-41
8. 43-44
9. 48-55

In [16]:
final = tableA.copy()

In [17]:
def split_by_first_space(DF, strIndex, stpIndex):
    df = DF.iloc[strIndex:stpIndex+1].copy()
    for index, each in df.iterrows():
            space = each[1].find(' ')
            i = each[1][:space]
            val = each[1][space:]
            each[0] = i
            each[1] = val
    DF[strIndex:stpIndex+1] = df.copy()
        
split_by_first_space(final,3,7)
split_by_first_space(final,9,11)
split_by_first_space(final,14,17)
split_by_first_space(final,19,24)
split_by_first_space(final,27,31)
split_by_first_space(final,35,37)
split_by_first_space(final,40,41)
split_by_first_space(final,43,44)
split_by_first_space(final,48,55)

In [18]:
print(final.iloc[:,0:2].to_string())

         0                                                                  1
3        1                                                          Air Crash
4        2                                                     Ship Accidents
5        3                                      Collapse of Structure (Total)
6      3.1                Collapse of Dwelling     House/Residential Building
7      3.2                     Collapse of Official/      Commercial Building
8      3.3                                                    Collapse of Dam
9      3.4                                                Collapse of  Bridge
10     3.5                                                             Others
11       4                                                   Drowning (Total)
12     4.1                                                       Boat Capsize
13     4.2                               Accidental Falls into      Waterbody
14     4.3                                                      

In [19]:
# To Rename the columns accordingly
ColumnList = [
    ("Sl. No.",''),
    ("State/UT",''), 
    ('No. of Cases','2018'),
    ('No. of Cases','2019'),
    ('% Variation in 2019 over 2018',''),
    ('Persons Injured – 2019','Male'),
    ('Persons Injured – 2019','Female'),
    ('Persons Injured – 2019','Transgender'),
    ('Persons Injured – 2019','Total'),
    ('Persons Died – 2019','Male'),
    ('Persons Died – 2019','Female'),
    ('Persons Died – 2019','Transgender'),
    ('Persons Died – 2019','Total')
]
final.columns = pd.MultiIndex.from_tuples(ColumnList)
final.set_index(["Sl. No.","State/UT"],inplace=True)

In [20]:
final

Unnamed: 0_level_0,Unnamed: 1_level_0,No. of Cases,No. of Cases,% Variation in 2019 over 2018,Persons Injured – 2019,Persons Injured – 2019,Persons Injured – 2019,Persons Injured – 2019,Persons Died – 2019,Persons Died – 2019,Persons Died – 2019,Persons Died – 2019
Unnamed: 0_level_1,Unnamed: 1_level_1,2018,2019,Unnamed: 4_level_1,Male,Female,Transgender,Total,Male,Female,Transgender,Total
Sl. No.,State/UT,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,Air Crash,3,2,-33.3,0,0,0,0,11,1,0,12
2,Ship Accidents,0,0,-,0,0,0,0,0,0,0,0
3,Collapse of Structure (Total),1953,1866,-4.5,134,64,2,200,1363,566,0,1929
3.1,Collapse of Dwelling House/Residential Building,1201,1218,1.4,103,43,0,146,867,374,0,1241
3.2,Collapse of Official/ Commercial Building,47,89,89.4,7,3,0,10,89,15,0,104
...,...,...,...,...,...,...,...,...,...,...,...,...
19,Suffocation,1896,1584,-16.5,16,9,0,25,1208,390,0,1598
20,Drug Overdose,895,729,-18.5,33,1,0,34,594,110,0,704
21,Other than above Causes,59042,59321,0.5,266,73,0,339,45616,12953,7,58576
22,Causes Not Known,14574,17524,20.2,117,39,0,156,13212,3452,2,16666


In [21]:
# Reset Index before saving file, for better formatting in RAW CSV
final.reset_index().to_csv("Other Causes-wise Number of Cases, Persons Injured & Persons Died during 2019 (State & UT-wise).csv",index=False)