In [1]:
import wget
import ssl
import os
# Swap ssl's default HTTPS context factory for the unverified one, so HTTPS
# connections that rely on the default context skip certificate verification.
# NOTE(review): this applies to the whole kernel process and weakens security;
# presumably it is here because the download below fails certificate validation
# otherwise -- confirm, and remove if the server's certificate verifies.
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
# Enter NCRB Individual PDF url
pdf_url = 'https://ncrb.gov.in/sites/default/files/adsi_reports_previous_year/Table-1A.4_2019.pdf'
# Local file name = last path segment of the URL. It is derived from the URL
# itself (rather than a fixed character offset) so it stays correct when
# pdf_url is replaced with a different report.
fileName = pdf_url.rsplit('/', 1)[-1]
try:
    # Delete any previous copy so the report is downloaded afresh
    os.remove(fileName)
except FileNotFoundError:
    # Nothing to delete on a first run
    pass
# Download report; wget.download returns the name of the file it wrote
fileName = wget.download(pdf_url, fileName)

In [3]:
import numpy as np
import pandas as pd
import camelot

In [4]:
# Show the local file name returned by the download step
fileName

'Table-1A.4_2019.pdf'

In [5]:
# Extract one table per odd-numbered page (presumably the State/UT tables).
# The 'stream' flavor is given an explicit page region (table_areas) and
# explicit column separator x-positions (columns); the single entry in each
# list is used for every requested page.
stateTables = camelot.read_pdf(
    fileName,
    pages="1,3,5,7,9",
    flavor='stream',
    split_text=True,
    table_areas=["21.643618739903072,734.3256733371578,568.0247495961229,69.61388420682489"],
    columns=["49.0588691437803,154.87211631663976,193.3496607431341,238.56077544426498,277.0383198707593,321.2874959612278,359.7650403877222,412.6716639741519,466.540226171244,509.82746365105015"],
)

In [6]:
# Extract one table per even-numbered page (presumably the city tables).
# Same approach as for the odd pages, but with a slightly different page
# region (table_areas) and column separator x-positions (columns).
cityTables = camelot.read_pdf(
    fileName,
    pages="2,4,6,8,10",
    flavor='stream',
    split_text=True,
    table_areas=["21.643618739903072,736.2495858961457,569.9486268174476,62.8801902503671"],
    columns=["50.982746365105015,161.60568659127628,200.08323101777063,240.4846526655897,278.962197092084,326.0971890145396,369.38442649434575,415.557479806139,464.61634894991926,512.7132794830372"],
)

In [7]:
# Quick check of how many tables were extracted from the odd-numbered pages
stateTables

<TableList n=5>

In [8]:
def printnAllTables(tables):
    """Print the DataFrame of every table, each under a 1-based "Page No" line.

    tables: sequence of objects exposing a ``.df`` attribute (e.g. the list
    returned by ``camelot.read_pdf``). The number printed is the table's
    position in the sequence, not the page number inside the PDF.
    """
    for page_no, table in enumerate(tables, start=1):
        print("Page No: ", page_no)
        print(table.df)
        # Blank line between consecutive tables
        print()

In [9]:
# In case we're interested in seeing the original scraped tables.
# Commented out as it takes too much screen space
# Run if you want
# printnAllTables(stateTables)

In [10]:
# Removes Garbage Rows that may have been detected Before the 'STATES' row
# AND
# Removes Garbage Rows that may have been detected after the 'TOTAL (ALL INDIA)' row
def clean(table,
          start_markers=('STATES', 'AGRA'),
          end_markers=('TOTAL (ALL INDIA)', 'TOTAL (CITIES)')):
    """Trim ``table.df`` to the rows between the start and end markers (inclusive).

    The second column of ``table.df`` is scanned: everything before the first
    row whose value is one of ``start_markers`` and everything after the last
    row whose value is one of ``end_markers`` is dropped.

    Parameters
    ----------
    table : object with a ``df`` attribute holding a pandas DataFrame
        ``table.df`` is rebound to the trimmed frame.
    start_markers : collection of str
        Second-column values that mark the first row to keep. The defaults
        are the values this notebook relies on ('AGRA' is presumably the
        first city row of the city tables -- confirm when using another report).
    end_markers : collection of str
        Second-column values that mark the last row to keep.

    Returns
    -------
    The same ``table`` object, for convenience; callers may ignore it because
    the change is made on the object itself.

    Raises
    ------
    ValueError
        If no start marker is found, or no end marker is found at or after
        the start row. ``table.df`` is left untouched in that case.
    """
    labels = table.df.iloc[:, 1].tolist()

    first = next(
        (pos for pos, label in enumerate(labels) if label in start_markers),
        None,
    )
    if first is None:
        raise ValueError(
            "clean(): none of the start markers {!r} found in the second column".format(
                tuple(start_markers)))

    # Search backwards so trailing garbage after the totals row is dropped,
    # but never look above the start row.
    last = next(
        (pos for pos in range(len(labels) - 1, first - 1, -1)
         if labels[pos] in end_markers),
        None,
    )
    if last is None:
        raise ValueError(
            "clean(): none of the end markers {!r} found after the start row".format(
                tuple(end_markers)))

    table.df = table.df.iloc[first:last + 1]
    return table

In [11]:
# Splits the second column into a 1st-column index and 2nd-column text when faulty/required
def split_by_first_space(df, strIndex, stpIndex):
    """Split "<index> <text>" cells of the second column across the first two columns.

    For every row at positions ``strIndex`` .. ``stpIndex`` (inclusive, with
    the usual slice semantics), the second-column string is split at its first
    space: the part before the space is written to the first column and the
    remainder, without the separating whitespace, to the second column.

    The frame is modified in place through ``.iloc``; nothing is returned.
    Writing to the row objects yielded by ``iterrows()`` is avoided because
    pandas does not guarantee that such writes reach the frame.

    Rows whose second column is not a string, or contains no space, are left
    unchanged (there is nothing to split).
    """
    # Slicing a range reproduces exactly the row positions that
    # df.iloc[strIndex:stpIndex+1] would select, including out-of-range clipping.
    for pos in range(len(df))[strIndex:stpIndex + 1]:
        text = df.iloc[pos, 1]
        if not isinstance(text, str):
            continue
        head, sep, tail = text.partition(' ')
        if not sep:
            # No space at all: str.find would return -1 and mangle the cell
            continue
        df.iloc[pos, 0] = head
        df.iloc[pos, 1] = tail.lstrip()

In [12]:
def mergeRows(df, strIndex, stpIndex):
    """Merge rows ``strIndex`` .. ``stpIndex`` (labels, inclusive) into row ``strIndex``.

    For every column, the cells of the selected rows are converted with
    ``str`` and joined top to bottom with single spaces; leading and trailing
    whitespace of the result is stripped. The merged values overwrite row
    ``strIndex`` of ``df`` in place. The other rows of the range are NOT
    removed; drop them afterwards if they are no longer wanted.

    Nothing is returned. If the label range selects no rows, ``df`` is left
    unchanged.

    Raises
    ------
    KeyError
        If ``strIndex`` is not a label of ``df.index`` (assigning to a missing
        label would otherwise silently append a new row).
    """
    if strIndex not in df.index:
        raise KeyError(
            "mergeRows(): row label {!r} not found in the frame".format(strIndex))

    block = df.loc[strIndex:stpIndex]
    if block.empty:
        return

    # Columns are addressed by position so that duplicate or non-integer
    # column labels cannot misalign the merged values.
    merged = [
        ' '.join(str(cell) for cell in block.iloc[:, col]).strip()
        for col in range(block.shape[1])
    ]
    # Assign on df itself, not on the slice, so the change reliably lands
    # in the caller's frame.
    df.loc[strIndex] = merged

In [13]:
# Strip the garbage rows before/after the real data from every table in stateTables.
# clean() rebinds each_table.df, so the Table objects held by stateTables are updated
# and the return value can be ignored.
for each_table in stateTables:
    clean(each_table)

In [14]:
# Strip the garbage rows before/after the real data from every table in cityTables.
# clean() rebinds each_table.df, so the Table objects held by cityTables are updated
# and the return value can be ignored.
for each_table in cityTables:
    clean(each_table)

In [15]:
# In case we're interested in seeing the scraped tables at this point.
# Commented out as it takes too much screen space
# Run if you want
# printnAllTables(stateTables)

In [16]:
# Work on independent copies of the cleaned frames, so the later header and
# index changes do not touch the DataFrames stored on the camelot tables.
states = [state_table.df.copy() for state_table in stateTables]
cities = [city_table.df.copy() for city_table in cityTables]

In [17]:
# Top-level column titles, because the columns are nested (multi-level).
# One inner list per extracted table, holding the three titles for that table;
# the list for table i is looked up as column_header[i].
# The multi-level structure itself is generated and applied by set_Column in the next cell.
column_header = [
    ['Truck/Lorry','Bus','SUV/Station Wagon/etc'],
    ['Car','Jeep','Tractor'],
    ['Three Wheeler/Auto Rickshaw','Two Wheeler','Other Motor Vehicles'],
    ['Bicycle','Hand Drawn Vehicle/Cycle Rickshaw','Animal Drawn Vehicle'],
    ['Pedestrian','Others','Grand Total']
]
# Number of header groups; expected to match the number of tables per table list
len(column_header)

5

In [18]:
def set_Column(df, title, state):
    """Replace ``df.columns`` in place with the two-level header of one report table.

    df    : DataFrame with exactly 11 columns (serial no., name, then 3 x 3 figures).
    title : indexable holding at least three top-level titles; only
            ``title[0]``, ``title[1]`` and ``title[2]`` are used.
    state : truthy -> the name column is labelled "State/UT", otherwise "City".

    Returns nothing; pandas raises if ``df`` does not have 11 columns.
    """
    name_label = "State/UT" if state else "City"
    # The two identifying columns carry an empty second level
    leading = [("Sl. No.", ''), (name_label, '')]
    sub_titles = ('Offenders', 'Victims', 'Total')
    # Each of the three titles spans the same three sub-columns
    grouped = [(title[group], sub) for group in range(3) for sub in sub_titles]
    df.columns = pd.MultiIndex.from_tuples(leading + grouped)

In [19]:
# Give the i-th state frame and the i-th city frame the i-th header group,
# then move the serial-number and name columns into the row index.
# Order matters: set_Column must run first, because set_index refers to the
# column names it assigns.
# NOTE: not idempotent -- set_index(inplace=True) removes two columns from each
# frame, so running this cell a second time makes set_Column fail on the column
# count. Re-create `states` / `cities` before re-running.
for i,(state,city) in enumerate(zip(states,cities)):
    set_Column(state, column_header[i], True)
    state.set_index(["Sl. No.","State/UT"],inplace=True)
    set_Column(city, column_header[i], False)
    city.set_index(["Sl. No.","City"],inplace=True)

In [20]:
# Applying Same Index across all tables
for each in states:
    each.index = states[0].index.copy()
for each in cities:
    each.index = cities[0].index.copy()

In [21]:
# Put the per-page frames side by side (column-wise) into one wide table each
State = pd.concat(states, axis="columns")
City = pd.concat(cities, axis="columns")

In [22]:
# In case we're interested in seeing the scraped tables at this point.
# Commented out as it takes too much screen space
# Run if you want
# printnAllTables(tables)

# OR print a single one according to your choice
# tables[23].df

In [23]:
# Reset Index before saving file, for better formatting in RAW CSV:
# reset_index() turns the row index levels back into ordinary columns, and
# index=False then keeps pandas from writing the fresh 0..n-1 index.
# Both files are written to the current working directory.
State.reset_index().to_csv("Mode of Transport – wise Number of Persons Died in Road Accidents during 2019(StateUT-wise).csv",index=False)
City.reset_index().to_csv("Mode of Transport – wise Number of Persons Died in Road Accidents during 2019(City-wise).csv",index=False)