In [1]:
import wget
import ssl
import os
# SECURITY NOTE(review): this replaces the default HTTPS context factory with
# the unverified one, so TLS certificates are not validated for any later
# request in this kernel that relies on the default context. Presumably a
# workaround for a certificate problem on the download host -- confirm it is
# still required before keeping it.
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
# Enter NCRB Individual PDF url
pdf_url = 'https://ncrb.gov.in/sites/default/files/adsi_reports_previous_year/Table-1A.3_2019.pdf'
# Local file name = last path segment of the URL. It is derived from the URL
# itself (not from a fixed character offset) so that pointing pdf_url at a
# different report keeps working without recounting characters.
fileName = pdf_url.rsplit('/', 1)[-1]
try:
    # Delete any earlier copy so the report is downloaded afresh
    os.remove(fileName)
except FileNotFoundError:
    # No earlier copy exists (e.g. first run) -- nothing to delete
    pass
# Download the report; the value returned by wget.download is kept as the
# file name used by the following cells
fileName = wget.download(pdf_url, fileName)

In [3]:
import numpy as np
import pandas as pd
import camelot

In [4]:
# Display the file name returned by the download step
fileName

'Table-1A.3_2019.pdf'

In [5]:
# Extract tables from every page of the downloaded PDF with camelot's
# 'stream' flavor. The table region and the column separator positions are
# supplied explicitly as comma-separated coordinate strings instead of being
# auto-detected (see the camelot read_pdf documentation for the exact
# coordinate conventions).
tables = camelot.read_pdf(
    fileName,
    pages="all",
    flavor='stream',
    table_areas=["45.97415185783522,667.8189815442427,580.8562197092084,64.71056134575319"],
    columns=["67.23101777059773,209.60258481421647,263.9806138933764,321.3247172859451,369.77059773828756,425.1373182552504,475.5605815831987,524.0064620355412"],
    split_text=True,
    strip_text='\n',
    row_tol=13,
)

In [6]:
# Display the extracted TableList (its repr shows how many tables were found)
tables

<TableList n=1>

In [7]:
def printnAllTables(tables):
    """Print every table's DataFrame, each preceded by a 1-based page label.

    Args:
        tables: sequence of objects exposing a ``df`` attribute.

    Returns:
        None. Output goes to stdout; each table is followed by a blank line.
    """
    for page_no, table in enumerate(tables, start=1):
        print("Page No: ", page_no)
        # end="\n\n" reproduces the blank separator line after each table
        print(table.df, end="\n\n")

In [8]:
# In case we're interested in seeing the original scraped tables.
# NOTE: the output takes a lot of screen space; comment out the call below
# if you do not need it.
printnAllTables(tables)

Page No:  1
      0                                    1      2       3      4       5  \
0   1.1                  Truck/Lorry (Total)   6643   13352  15994   29228   
1                1.1.1 Normal Goods Carriers   4520    9651  10553   21055   
2           1.1.2 Trailer/Container Carriers    888    1437   2107    3230   
3                              1.1.3 Tankers    473     747   1110    1741   
4                               1.1.4 Others    762    1517   2224    3202   
5   1.2                          Bus (Total)   2326    7893   6866   27155   
6                           1.2.1 Government    483    1952   1924    7100   
7                              1.2.2 Private   1769    5716   4654   19195   
8                           1.2.3 School Bus     74     225    288     860   
9   1.3       SUV/Station Wagon/etc. (Total)   1835    4117   4353   10326   
10                          1.3.1 Government     35     105     96     187   
11                             1.3.2 Private   1541 

In [9]:
def clean(table):
    """Trim ``table.df`` to the rows from 'STATES' through 'TOTAL (ALL INDIA)'.

    The second column (position 1) is searched for the two marker cells.
    Garbage rows detected before the first 'STATES' row and after the last
    'TOTAL (ALL INDIA)' row are removed; both marker rows are kept.

    Args:
        table: object whose ``df`` attribute holds a pandas DataFrame.

    Returns:
        The same ``table`` object. Its ``df`` attribute is reassigned, so the
        return value is only a convenience.

    Raises:
        IndexError: if either marker row is missing. ``table.df`` is left
            untouched in that case instead of being stripped row by row.
    """
    frame = table.df
    labels = list(frame.iloc[:, 1])

    # Position of the first 'STATES' row, scanning from the top
    first = next(
        (pos for pos, cell in enumerate(labels) if cell == 'STATES'),
        None,
    )
    if first is None:
        raise IndexError("clean(): no 'STATES' row found in column 1")

    # Position of the last 'TOTAL (ALL INDIA)' row at or below the start row
    last = next(
        (pos for pos in range(len(labels) - 1, first - 1, -1)
         if labels[pos] == 'TOTAL (ALL INDIA)'),
        None,
    )
    if last is None:
        raise IndexError(
            "clean(): no 'TOTAL (ALL INDIA)' row found after the 'STATES' row"
        )

    table.df = frame.iloc[first:last + 1]
    return table

In [10]:
def split_by_first_space(df, strIndex, stpIndex):
    """Split '<number> <label>' cells of the second column into two columns.

    For every row at positions ``strIndex`` .. ``stpIndex`` (inclusive), the
    text in the second column is cut at its first space: the part before the
    space is written to the first column and the part after it stays in the
    second column. The separating space itself is dropped. Cells that contain
    no space are left unchanged.

    Values are written with ``df.iloc`` so the change is applied to ``df``
    itself; writing into the row objects yielded by ``iterrows()`` is not
    guaranteed to reach the frame.

    Args:
        df: DataFrame whose first two columns hold text.
        strIndex: position of the first row to process.
        stpIndex: position of the last row to process (inclusive).

    Returns:
        None. ``df`` is modified in place.
    """
    # Slicing a range gives the same positional semantics as an iloc slice
    for row in range(len(df))[strIndex:stpIndex + 1]:
        number, separator, label = df.iloc[row, 1].partition(' ')
        if not separator:
            # No space to split on -- keep the row as it is
            continue
        df.iloc[row, 0] = number
        df.iloc[row, 1] = label

In [11]:
# Inclusive row ranges of the first table whose second column still holds
# the serial number in front of the label.
for first_row, last_row in [(1, 4), (6, 8), (10, 12), (14, 16), (18, 19), (22, 23), (25, 27)]:
    split_by_first_space(tables[0].df, first_row, last_row)

In [12]:
def mergeRows(df, strIndex, stpIndex):
    """Merge rows ``strIndex`` .. ``stpIndex`` of ``df`` into row ``strIndex``.

    Rows are selected by index label (inclusive on both ends). For each
    column, the cells of the selected rows are converted to text, joined top
    to bottom with single spaces, and stripped of surrounding whitespace.
    The merged values are written to row ``strIndex`` of ``df`` itself; the
    remaining selected rows are not removed.

    Args:
        df: DataFrame to modify.
        strIndex: index label of the first row to merge (receives the result).
        stpIndex: index label of the last row to merge.

    Returns:
        None. ``df`` is modified in place.
    """
    selected = df.loc[strIndex:stpIndex]
    if selected.empty:
        # Nothing selected -- leave the frame untouched
        return
    merged = [
        ' '.join(str(cell) for cell in column).strip()
        for _, column in selected.items()
    ]
    # Assign on df directly; assigning on the selected slice is not
    # guaranteed to propagate back to the original frame.
    df.loc[strIndex] = merged

In [13]:
# In case we're interested in seeing the scraped tables at this point.
# NOTE: the output takes a lot of screen space; comment out the call below
# if you do not need it.
printnAllTables(tables)

Page No:  1
        0                                    1      2       3      4       5  \
0     1.1                  Truck/Lorry (Total)   6643   13352  15994   29228   
1   1.1.1                Normal Goods Carriers   4520    9651  10553   21055   
2   1.1.2           Trailer/Container Carriers    888    1437   2107    3230   
3   1.1.3                              Tankers    473     747   1110    1741   
4   1.1.4                               Others    762    1517   2224    3202   
5     1.2                          Bus (Total)   2326    7893   6866   27155   
6   1.2.1                           Government    483    1952   1924    7100   
7   1.2.2                              Private   1769    5716   4654   19195   
8   1.2.3                           School Bus     74     225    288     860   
9     1.3       SUV/Station Wagon/etc. (Total)   1835    4117   4353   10326   
10  1.3.1                           Government     35     105     96     187   
11  1.3.2                   

In [14]:
# List of column headers, flattened from a nested (multi-level) header
# structure: sub-categories are written as "Category (Sub-category)".
# NOTE(review): this list is not referenced by the other cells visible in
# this notebook (set_Column defines its own headers) -- confirm whether it
# is still needed.
column_header = [
    "Air Crash",
    "Ship Accidents",
    "Collapse of Structure(Total)",
    "Collapse of Structure (Dwelling House/Residential building)",
    "Collapse of Structure (Official/Commercial Building)",
    "Collapse of Structure (Dam)",
    "Collapse of Structure (Bridge)",
    "Collapse of Structure (Others)",
    "Drowning (Total)",
    "Drowning (Boat Capsize)",
    "Drowning (Accidental Falls into Water body)",
    "Drowning (Other Cases)",
    "Electrocution",
    "Accidental Explosion (Total)",
    "Accidental Explosion (Domestic Gas Cylinder)",
    "Accidental Explosion (Industrial Boiler/Gas Cylinder Explosion)",
    "Accidental Explosion (Ammunition Explosion in Armed Forces/Police/CPMF)",
    "Accidental Explosion (Other)",
    "Falls (Total)",
    "Falls (from Height)",
    "Falls (from Vehicles (Automobile like Bus, Trucks, etc.))",
    "Falls (into Manhole)",
    "Falls (into Pit)",
    "Falls (into Borewell)",
    "Falls (Others)",
    "Factory / Machine Accidents",
    "Accidental Fire (Total)",
    # Closing parenthesis added; the entry was previously unbalanced
    "Accidental Fire (Electrical Short Circuit)",
    "Accidental Fire (Riot/Agitation)",
    "Accidental Fire (Fireworks)",
    "Accidental Fire (Cooking Gas Cylinder/Stove Burst)",
    "Accidental Fire (Other Causes)",
    "Firearm",
    "Mines or Quarry Disaster",
    "Traffic Accidents (Total)",
    "Road Accidents",
    "Railway Crossing Accidents",
    "Railway Accidents",
    "Stampede",
    "Sudden Deaths (Total)",
    "Sudden Deaths (Heart Attack)",
    "Sudden Deaths (Others)",
    "Deaths of Women during Pregnancy (Total)",
    "Deaths of Women during Pregnancy (Due to Abortions)",
    "Deaths of Women during Pregnancy (Other than Abortions)",
    "Deaths due to Consumption of Illicit/Spurious Liquor",
    "Killed / Injured by Animals",
    "Poisoning (Total)",
    "Poisoning (Food Poisoning)",
    "Poisoning (Accidental Intake of Insecticides/Pesticides)",
    "Poisoning [Poisonous Gases (Total)]",
    "Poisoning [Poisonous Gases (Carbon Monoxide (CO) Gas)]",
    "Poisoning [Poisonous Gases (Other poisonous Gases)]",
    "Poisoning (Snake Bite)",
    "Poisoning (Animal/Reptiles/Insects Bite)",
    "Poisoning (Other)",
    "Suffocation",
    "Drug Overdose",
    "Other than above Causes",
    "Causes Not Known",
    "Total"
]
# Displayed as the cell output: number of headers in the list
len(column_header)

61

In [15]:
def set_Column(df):
    """Replace the column labels of ``df`` with a two-level header.

    Nine (top level, sub level) pairs are assigned positionally; an empty
    sub level marks a column that has no sub-header.

    Args:
        df: DataFrame whose ``columns`` attribute is overwritten.

    Returns:
        None. ``df`` is modified in place.
    """
    top_level = [
        "Sl. No.",
        "Mode Of Transport",
        'No. of Offending Driver/Pedestrian',
        'No. of Offending Driver/Pedestrian',
        'No. of Victims',
        'No. of Victims',
        'Total Persons Injured',
        'Total Persons Died',
        'Percentage Share of Deaths',
    ]
    sub_level = ['', '', 'Died', 'Injured', 'Died', 'Injured', '', '', '']
    header_pairs = list(zip(top_level, sub_level))
    df.columns = pd.MultiIndex.from_tuples(header_pairs)

In [16]:
# Give every extracted table the two-level column header
for table in tables:
    set_Column(table.df)

In [17]:
# In case we're interested in seeing the scraped tables at this point.
# Left commented out because the output takes too much screen space;
# uncomment the next line to run it.
# printnAllTables(tables)

# OR display a single table of your choice
tables[0].df

Unnamed: 0_level_0,Sl. No.,Mode Of Transport,No. of Offending Driver/Pedestrian,No. of Offending Driver/Pedestrian,No. of Victims,No. of Victims,Total Persons Injured,Total Persons Died,Percentage Share of Deaths
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Died,Injured,Died,Injured,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,1.1,Truck/Lorry (Total),6643,13352,15994,29228,42580,22637,14.6
1,1.1.1,Normal Goods Carriers,4520,9651,10553,21055,30706,15073,9.7
2,1.1.2,Trailer/Container Carriers,888,1437,2107,3230,4667,2995,1.9
3,1.1.3,Tankers,473,747,1110,1741,2488,1583,1.0
4,1.1.4,Others,762,1517,2224,3202,4719,2986,1.9
5,1.2,Bus (Total),2326,7893,6866,27155,35048,9192,5.9
6,1.2.1,Government,483,1952,1924,7100,9052,2407,1.6
7,1.2.2,Private,1769,5716,4654,19195,24911,6423,4.2
8,1.2.3,School Bus,74,225,288,860,1085,362,0.2
9,1.3,SUV/Station Wagon/etc. (Total),1835,4117,4353,10326,14443,6188,4.0


In [19]:
# Save the first table as CSV. index=False leaves the row index out of the
# file, for cleaner formatting of the raw CSV.
tables[0].df.to_csv("Mode of Transport – wise Number of Persons Died in Road Accidents during 2019 (All India).csv",index=False)