In [1]:
import wget
import ssl
import os
# NOTE(review): this replaces the ssl module's default HTTPS context factory
# with the unverified one, so HTTPS requests that go through the default
# context skip certificate checks for the rest of the kernel session.
# Presumably a workaround for a certificate problem when downloading the
# report -- confirm it is still required before keeping it.
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
# Enter NCRB Individual PDF url
pdf_url = 'https://ncrb.gov.in/sites/default/files/adsi_reports_previous_year/Table-1A.9_2019.pdf'
# Use the last path segment of the URL as the local file name. This replaces
# a hard-coded character offset that only matched this exact URL.
fileName = pdf_url.rsplit('/', 1)[-1]
try:
    # Delete any earlier copy so the report is downloaded afresh
    os.remove(fileName)
except FileNotFoundError:
    # Nothing to delete on the first run
    pass
# Download report; wget.download returns the name of the file it wrote
fileName = wget.download(pdf_url, fileName)

In [3]:
import numpy as np
import pandas as pd
import camelot

In [4]:
# Show the local file name returned by the download step
fileName

'Table-1A.9_2019.pdf'

In [5]:
# Extract the main table from pages 1-10 with Camelot's "stream" parser.
# One table region and one list of column-separator positions are supplied;
# the numbers are PDF coordinates (see Camelot's documentation for the exact
# string format of `table_areas` and `columns`).
tables = camelot.read_pdf(
    fileName,
    pages="1-10",
    flavor='stream',
    split_text=True,
    table_areas=[
        "32.22494345718902,736.2495858961457,569.9486268174476,61.91823397087313",
    ],
    columns=[
        "49.0588691437803,150,191.4257835218094,235.6749596122779,277.0383198707593,327,374,418,465,510.7894022617125",
    ],
)

In [6]:
# Display the object returned by camelot.read_pdf (its repr reports the table count)
tables

<TableList n=10>

In [7]:
def printnAllTables(tables):
    """Print every table's DataFrame, each preceded by its 1-based position.

    Args:
        tables: sequence of objects exposing a ``df`` attribute.

    Returns:
        None. Output goes to stdout, with a blank line after each table.
    """
    for page_no, table in enumerate(tables, start=1):
        print("Page No: ", page_no)
        print(table.df)
        print()

In [8]:
# In case we're interested in seeing the original scraped tables.
# Commented as it takes too much screen space
# Run if you want
# printnAllTables(tables)

In [9]:
# Removes Garbage Rows that may have been detected Before the 'STATES' row
# AND
# Removes Garbage Rows that may have been detected after the 'TOTAL (ALL INDIA)' row
def clean(table):
    """Trim rows outside the data region of a scraped table, in place.

    Rows above the first row whose second column reads 'STATES' or 'AGRA'
    are dropped, and rows below the last row whose second column reads
    'TOTAL (ALL INDIA)' or 'TOTAL (CITIES)' are dropped. The marker rows
    themselves are kept.

    Args:
        table: object with a ``df`` DataFrame attribute; ``table.df`` is
            rebound to the trimmed frame.

    Returns:
        The same ``table`` object. The return value is a convenience only,
        since the change is already visible through the caller's reference.

    Raises:
        IndexError: if the frame has fewer than two columns, or if either
            marker row cannot be found. When a marker is missing,
            ``table.df`` is left untouched.
    """
    start_markers = ('STATES', 'AGRA')
    end_markers = ('TOTAL (ALL INDIA)', 'TOTAL (CITIES)')

    # Second-column values, in row order
    labels = table.df.iloc[:, 1].tolist()

    first = next(
        (pos for pos, label in enumerate(labels) if label in start_markers),
        None,
    )
    if first is None:
        raise IndexError(
            "clean(): no row with second column in %r found" % (start_markers,)
        )

    # Search backwards, but never above the start row that was just found
    last = next(
        (pos for pos in range(len(labels) - 1, first - 1, -1)
         if labels[pos] in end_markers),
        None,
    )
    if last is None:
        raise IndexError(
            "clean(): no row with second column in %r found after the start row"
            % (end_markers,)
        )

    # Slice once instead of peeling rows off one at a time
    table.df = table.df.iloc[first:last + 1]
    return table

In [10]:
# Splits the second column into 1st column index and 2nd column text when faulty/required
def split_by_first_space(df, strIndex, stpIndex):
    """Split second-column text at its first space, for a range of rows, in place.

    For each row position from ``strIndex`` to ``stpIndex`` (inclusive), the
    text before the first space is written to the first column and the rest
    (still starting with that space) is written back to the second column.

    Cells that are not strings, or that contain no space, are left untouched.
    Values are written straight into ``df`` by position rather than into the
    row objects produced by ``iterrows()``, which are not guaranteed to write
    back to the frame.

    Args:
        df: DataFrame with at least two columns; modified in place.
        strIndex: first row position to process.
        stpIndex: last row position to process (inclusive).

    Returns:
        None.
    """
    # Slicing a range applies the same bounds handling as a positional slice
    for pos in range(len(df))[strIndex:stpIndex + 1]:
        text = df.iat[pos, 1]
        if not isinstance(text, str):
            continue
        cut = text.find(' ')
        if cut == -1:
            # No space: nothing to split off, keep the cell as is
            continue
        df.iat[pos, 0] = text[:cut]
        df.iat[pos, 1] = text[cut:]

In [11]:
def mergeRows(df, strIndex, stpIndex):
    """Merge rows ``strIndex``..``stpIndex`` into row ``strIndex``, in place.

    The rows are selected by index label (both ends inclusive). For every
    column, the cells of those rows are converted to ``str``, joined with a
    single space and stripped of leading/trailing whitespace. The joined
    values are then written into row ``strIndex`` of ``df`` itself. The other
    rows of the range are not removed.

    Args:
        df: DataFrame to modify in place.
        strIndex: index label of the first row; receives the merged values.
        stpIndex: index label of the last row to merge (inclusive).

    Returns:
        None. Nothing is changed when the label range selects no rows.
    """
    block = df.loc[strIndex:stpIndex]
    if block.empty:
        return

    # Columns are walked by position so duplicate column labels are handled
    merged = [
        ' '.join(str(cell) for cell in block.iloc[:, col_pos]).strip()
        for col_pos in range(block.shape[1])
    ]

    # Write to the caller's frame, not to the temporary slice
    df.loc[strIndex] = merged

In [12]:
# Trim the rows outside the data region from every scraped table.
# clean() rebinds each table's `df`, so the result is not collected here.
for each_table in tables:
    clean(each_table)

In [13]:
# Tables at even positions (0, 2, 4, ...) go to `states` and tables at odd
# positions go to `cities`. Copies are taken so later edits leave the
# scraped tables untouched.
states = [tables[pos].df.copy() for pos in range(0, len(tables), 2)]
cities = [tables[pos].df.copy() for pos in range(1, len(tables), 2)]

In [14]:
# Column headers are nested, i.e. they form a multi-level structure.
# Each inner list holds the three top-level titles for one state table and
# the matching city table; set_Column() later builds the multi-level columns
# from them.
column_header = [
    ['Dangerous or Careless Driving/Over-taking/etc.','Over Speeding','Driving under Influence of Drug/Alcohol'],
    ['Physical Fatigue of Drivers','Defect in Mechanical Condition of Motor Vehicle','Animal Crossing'],
    ['Weather Condition (Total)','Weather Condition(Poor Visibility)','Weather Condition (Others Causes)'],
    ['Lack of Road Infrastructure','Vehicles Parking at Road Shoulders','Causes Not Known'],
    ['Other Causes','Total Road Accidents','Unmanned Railway Crossing Accidents']
]
# Display the number of header groups
len(column_header)

5

In [15]:
def set_Column(df, title, city):
    """Replace ``df.columns`` with a two-level header, in place.

    The first two columns become ("Sl. No.", '') and either ("City", '') or
    ("State/UT", ''). Each of the first three entries of ``title`` then
    heads a group of three columns: 'Cases', 'Injured', 'Died'.

    Args:
        df: DataFrame with eleven columns; its ``columns`` attribute is set.
        title: sequence whose first three items are the group titles.
        city: truthy to label the second column "City", otherwise "State/UT".

    Returns:
        None.
    """
    name_label = "City" if city else "State/UT"
    measures = ('Cases', 'Injured', 'Died')

    pairs = [("Sl. No.", ''), (name_label, '')]
    pairs.extend(
        (title[group], measure)
        for group in range(3)
        for measure in measures
    )
    df.columns = pd.MultiIndex.from_tuples(pairs)

In [16]:
# Give each city/state frame the header group for its position, then move
# the serial-number and name columns into the index.
for i, (city, state) in enumerate(zip(cities, states)):
    titles = column_header[i]

    set_Column(city, titles, True)
    city.set_index(["Sl. No.", "City"], inplace=True)

    set_Column(state, titles, False)
    state.set_index(["Sl. No.", "State/UT"], inplace=True)

In [17]:
# In case we're interested in seeing the scraped tables at this point.
# Commented as it takes too much screen space
# Run if you want
# printnAllTables(tables)

# OR print a single one according to your choice
# tables[23].df

In [18]:
# Pages 11-12 use a narrower layout, so they are read separately with their
# own table region, column separators and row tolerance.
table11_12 = camelot.read_pdf(
    fileName,
    pages="11-12",
    flavor='stream',
    split_text=True,
    row_tol=10,
    table_areas=[
        "34.14882067851374,712.2006789087965,548.7859773828757,84.04322839923444",
    ],
    columns=[
        "63.487948303715676,217.39812600969307,314.5539256865913,437.68206785137323",
    ],
)
# NOTE(review): only the second table is trimmed here; confirm the first one
# needs no trimming.
clean(table11_12[1])

<Table shape=(55, 5)>

In [19]:
def set_Column_last(df, city):
    """Replace ``df.columns`` with the two-level "Grand Total" header, in place.

    The columns become ("Sl. No.", ''), then ("City", '') or
    ("State/UT", ''), then "Grand Total" paired with 'Cases', 'Injured' and
    'Died'.

    Args:
        df: DataFrame with five columns; its ``columns`` attribute is set.
        city: truthy to label the second column "City", otherwise "State/UT".

    Returns:
        None.
    """
    name_label = "City" if city else "State/UT"
    pairs = [("Sl. No.", ''), (name_label, '')]
    pairs += [("Grand Total", measure) for measure in ('Cases', 'Injured', 'Died')]
    df.columns = pd.MultiIndex.from_tuples(pairs)

In [20]:
# The first table of pages 11-12 is treated as the state table and the
# second as the city table. Both frames are relabelled and re-indexed in
# place, so this cell is meant to run once per extraction.
state_totals, city_totals = table11_12[0], table11_12[1]

set_Column_last(state_totals.df, False)
state_totals.df.set_index(["Sl. No.", "State/UT"], inplace=True)

set_Column_last(city_totals.df, True)
city_totals.df.set_index(["Sl. No.", "City"], inplace=True)

In [21]:
# Add the "Grand Total" frames from pages 11-12 as the last entry of each
# list. Copies are appended, as with the earlier entries.
# Re-running this cell appends the frames again.
cities.append(table11_12[1].df.copy())
states.append(table11_12[0].df.copy())

In [22]:
# Set Uniform Indexes across all respective tables
# Every frame receives a copy of the first frame's index. The assignment is
# by position, so each frame must have as many rows as the first one.
# NOTE(review): this assumes the rows are in the same order in every table.
for city, state in zip(cities, states):
    city.index, state.index = cities[0].index.copy(), states[0].index.copy()

In [23]:
# Join the per-cause frames side by side (column-wise) into one wide frame
# for cities and one for states.
City, State = (pd.concat(frames, axis=1) for frames in (cities, states))

In [24]:
# Display the combined city-wise frame
City

Unnamed: 0_level_0,Unnamed: 1_level_0,Dangerous or Careless Driving/Over-taking/etc.,Dangerous or Careless Driving/Over-taking/etc.,Dangerous or Careless Driving/Over-taking/etc.,Over Speeding,Over Speeding,Over Speeding,Driving under Influence of Drug/Alcohol,Driving under Influence of Drug/Alcohol,Driving under Influence of Drug/Alcohol,Physical Fatigue of Drivers,...,Other Causes,Total Road Accidents,Total Road Accidents,Total Road Accidents,Unmanned Railway Crossing Accidents,Unmanned Railway Crossing Accidents,Unmanned Railway Crossing Accidents,Grand Total,Grand Total,Grand Total
Unnamed: 0_level_1,Unnamed: 1_level_1,Cases,Injured,Died,Cases,Injured,Died,Cases,Injured,Died,Cases,...,Died,Cases,Injured,Died,Cases,Injured,Died,Cases,Injured,Died
Sl. No.,City,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
37.0,AGRA,208,84,222,84,22,94,0,0,0,0,...,0,292,106,316,0,0,0,292,106,316
38.0,AHMEDABAD,1,1,0,1374,1142,434,1,1,1,3,...,0,1380,1148,439,0,0,0,1380,1148,439
39.0,ALLAHABAD,107,62,26,203,86,36,103,44,32,11,...,46,776,523,283,0,0,0,776,523,283
40.0,AMRITSAR,16,8,16,99,55,66,3,3,4,0,...,0,133,76,95,0,0,0,133,76,95
41.0,ASANSOL,429,343,301,5,4,4,0,0,0,0,...,0,442,354,314,0,0,0,442,354,314
42.0,AURANGABAD,8,5,3,472,340,210,13,7,6,0,...,1,516,372,226,0,0,0,516,372,226
43.0,BENGALURU,438,389,54,4143,3707,691,32,33,7,0,...,16,4684,4250,768,0,0,0,4684,4250,768
44.0,BHOPAL,1532,1084,77,196,160,10,0,0,0,0,...,19,2516,1909,123,0,0,0,2516,1909,123
45.0,CHANDIGARH (CITY),14,8,5,282,259,99,8,8,0,0,...,0,304,275,104,0,0,0,304,275,104
46.0,CHENNAI,645,830,136,3043,2651,668,438,469,49,937,...,0,6871,6702,1252,0,0,0,6871,6702,1252


In [25]:
# Display the combined state/UT-wise frame
State

Unnamed: 0_level_0,Unnamed: 1_level_0,Dangerous or Careless Driving/Over-taking/etc.,Dangerous or Careless Driving/Over-taking/etc.,Dangerous or Careless Driving/Over-taking/etc.,Over Speeding,Over Speeding,Over Speeding,Driving under Influence of Drug/Alcohol,Driving under Influence of Drug/Alcohol,Driving under Influence of Drug/Alcohol,Physical Fatigue of Drivers,...,Other Causes,Total Road Accidents,Total Road Accidents,Total Road Accidents,Unmanned Railway Crossing Accidents,Unmanned Railway Crossing Accidents,Unmanned Railway Crossing Accidents,Grand Total,Grand Total,Grand Total
Unnamed: 0_level_1,Unnamed: 1_level_1,Cases,Injured,Died,Cases,Injured,Died,Cases,Injured,Died,Cases,...,Died,Cases,Injured,Died,Cases,Injured,Died,Cases,Injured,Died
Sl. No.,State/UT,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
,STATES,,,,,,,,,,,...,,,,,,,,,,
1.0,ANDHRA PRADESH,5899.0,6807.0,2277.0,12866.0,14570.0,5123.0,77.0,111.0,41.0,9.0,...,276.0,20677.0,24619.0,7984.0,0.0,0.0,0.0,20677.0,24619.0,7984.0
2.0,ARUNACHAL PRADESH,59.0,49.0,23.0,76.0,97.0,41.0,1.0,4.0,3.0,0.0,...,20.0,200.0,275.0,109.0,0.0,0.0,0.0,200.0,275.0,109.0
3.0,ASSAM,1775.0,1577.0,823.0,4440.0,3855.0,1780.0,261.0,223.0,108.0,72.0,...,49.0,8055.0,7081.0,3245.0,0.0,0.0,0.0,8055.0,7081.0,3245.0
4.0,BIHAR,2450.0,1630.0,1862.0,4766.0,3510.0,3387.0,261.0,259.0,92.0,109.0,...,98.0,10007.0,7206.0,7205.0,23.0,0.0,23.0,10030.0,7206.0,7228.0
5.0,CHHATTISGARH,2604.0,2454.0,1131.0,8928.0,8657.0,2990.0,267.0,214.0,109.0,26.0,...,276.0,13899.0,13089.0,5003.0,0.0,0.0,0.0,13899.0,13089.0,5003.0
6.0,GOA,1594.0,712.0,139.0,1782.0,728.0,156.0,22.0,7.0,2.0,4.0,...,0.0,3440.0,1460.0,299.0,0.0,0.0,0.0,3440.0,1460.0,299.0
7.0,GUJARAT,1821.0,1691.0,972.0,13338.0,12916.0,5739.0,94.0,86.0,41.0,235.0,...,60.0,16503.0,15976.0,7428.0,0.0,0.0,0.0,16503.0,15976.0,7428.0
8.0,HARYANA,3541.0,2906.0,1766.0,4744.0,4052.0,2234.0,155.0,142.0,55.0,42.0,...,444.0,10937.0,9247.0,5269.0,0.0,0.0,0.0,10937.0,9247.0,5269.0
9.0,HIMACHAL PRADESH,1478.0,2453.0,514.0,1141.0,1703.0,445.0,78.0,152.0,47.0,8.0,...,8.0,2896.0,4740.0,1124.0,0.0,0.0,0.0,2896.0,4740.0,1124.0


In [26]:
# Reset Index before saving file, for better formatting in RAW CSV
# (the index levels become ordinary columns, and no index column is written).
state_csv = "Cause – wise Distribution of Road Accidents and Unmanned Railway Crossing Accidents during 2019(StateUT-wise).csv"
city_csv = "Cause – wise Distribution of Road Accidents and Unmanned Railway Crossing Accidents during 2019(City-wise).csv"

State.reset_index().to_csv(state_csv, index=False)
City.reset_index().to_csv(city_csv, index=False)