In [1]:
import wget
import ssl
import os
# NOTE(review): this replaces the default HTTPS context factory with the
# unverified one, i.e. TLS certificate verification is switched off for every
# HTTPS request made through it in this kernel, not only the NCRB download.
# Presumably a workaround for certificate problems on the source site -- confirm
# it is still needed before reusing this notebook.
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
# Enter NCRB Individual PDF url
pdf_url = 'https://ncrb.gov.in/sites/default/files/adsi_reports_previous_year/Table-1A.5_2019.pdf'
# Use the last path segment of the URL as the local file name. This replaces a
# hard-coded character offset that only worked for a URL prefix of exactly
# 67 characters and silently produced a wrong name for any other URL.
fileName = pdf_url.rsplit('/', 1)[-1]
try:
    # Delete any previous copy so the report is downloaded afresh
    os.remove(fileName)
except FileNotFoundError:
    # Nothing to delete on the first run
    pass
# Download report; wget.download returns the name the file was saved under
fileName = wget.download(pdf_url,fileName)

In [3]:
import numpy as np
import pandas as pd
import camelot

In [4]:
fileName

'Table-1A.5_2019.pdf'

In [5]:
# Extract the State/UT tables, which sit on the odd pages (1, 3, 5, 7) of the
# report; the call yields one table per page (4 tables).
# - flavor='stream' with explicit table_areas/columns: per the Camelot docs,
#   each table_areas entry is "x1,y1,x2,y2" (top-left and bottom-right corner in
#   PDF coordinate space) and each columns entry lists the x-coordinates of the
#   column separators.
# - The 14 separators below give 15 columns, which is what set_Column expects
#   (Sl. No., name, 12 months, Total).
# - split_text=True asks Camelot to split text that spans several cells.
stateTables = camelot.read_pdf(fileName,
                          flavor='stream',
                          table_areas=[
                              "49.53983844911148,718.9343728652543,553.5956704361874,92.70083491468016"
                              ],
                          columns=[
                              "73.10733441033926,159.68180936995157,191.4257835218094,220.28394184168016,249.14210016155093,280.88607431340876,310.70617124394187,338.6023909531503,369.38442649434575,400.16646203554126,428.0626817447496,458.84471728594514,488.66481421647825,516.5610339256866"
                              ],
                          split_text=True,
                          pages="1,3,5,7"
                         )

In [6]:
# Extract the city tables, which sit on the even pages (2, 4, 6, 8) of the
# report. Same approach as for the State/UT pages, but with a table area and
# column separators measured for the city page layout.
# Per the Camelot docs, table_areas is "x1,y1,x2,y2" (top-left, bottom-right in
# PDF coordinates) and columns lists the separator x-coordinates; the 14
# separators below give 15 columns.
cityTables = camelot.read_pdf(fileName,
                          flavor='stream',
                          table_areas=[
                              "33.66785137318256,717.9724165857602,554.0766397415186,68.65192792733092"
                              ],
                          columns=[
                              "54.83050080775445,159.68180936995157,195.27353796445882,224.1316962843296,252.027915993538,279.9241357027464,313.59198707592896,343.41208400646207,372.27024232633283,401.1284006462036,431.91043618739906,460.7685945072698,490.58869143780294,518.4849111470114"
                              ],
                          split_text=True,
                          pages="2,4,6,8"
                         )

In [7]:
stateTables

<TableList n=4>

In [8]:
def printnAllTables(tables):
    """Print the DataFrame of every table, each preceded by a running number.

    Args:
        tables: Iterable of objects exposing a ``df`` attribute (for example a
            Camelot ``TableList``).

    Note:
        The "Page No" label is the 1-based position of the table within
        ``tables``, not the page of the PDF it was read from.
    """
    for position, table in enumerate(tables, start=1):
        print("Page No: ", position)
        print(table.df)
        print()

In [9]:
# In case we're interested in seeing the original scraped tables.
# Commented out as it takes too much screen space
# Run if you want
# printnAllTables(stateTables)

In [10]:
# Removes Garbage Rows that may have been detected Before the 'STATES' row
# AND
# Removes Garbage Rows that may have been detected after the 'TOTAL (ALL INDIA)' row
def clean(table,
          start_markers=('STATES', 'AGRA'),
          end_markers=('TOTAL (ALL INDIA)', 'TOTAL (CITIES)')):
    """Trim garbage rows above the first data row and below the totals row.

    The second column of ``table.df`` is scanned for the first cell equal to
    one of ``start_markers`` and, from there on, the last cell equal to one of
    ``end_markers``. ``table.df`` is then replaced by the rows between those
    two (both inclusive). The original row labels are kept; the index is not
    reset.

    Args:
        table: Object with a ``df`` attribute holding a DataFrame with at
            least two columns (e.g. a Camelot table).
        start_markers: Second-column values that mark the first row to keep.
            Defaults to the header row of the State/UT pages ('STATES') and
            the first row of the city pages ('AGRA').
        end_markers: Second-column values that mark the last row to keep.

    Returns:
        The same ``table`` object, for convenience; the change is made in
        place on ``table.df``.

    Raises:
        IndexError: If no start marker, or no end marker at or after it, is
            found. ``table.df`` is left untouched in that case instead of
            being trimmed down to nothing first.
    """
    labels = list(table.df.iloc[:, 1])

    # First row (by position) whose second column is a start marker
    first = next(
        (pos for pos, value in enumerate(labels) if value in start_markers),
        None,
    )
    if first is None:
        raise IndexError(
            "clean(): none of the start markers %r found in the second column"
            % (tuple(start_markers),)
        )

    # Last row at or after `first` whose second column is an end marker
    last = next(
        (pos for pos in range(len(labels) - 1, first - 1, -1)
         if labels[pos] in end_markers),
        None,
    )
    if last is None:
        raise IndexError(
            "clean(): none of the end markers %r found after the start row"
            % (tuple(end_markers),)
        )

    table.df = table.df.iloc[first:last + 1]
    # Returning the table is only a convenience; table.df was updated in place
    return table

In [11]:
# Splits the second column at its first space: the part before it goes into the 1st column (index), the rest stays in the 2nd column (text). Use when faulty/required
def split_by_first_space(df, strIndex, stpIndex):
    """Split second-column cells at their first space, in place.

    For every row position from ``strIndex`` to ``stpIndex`` (inclusive), the
    text before the first space of the second column is written to the first
    column and the text after it is kept in the second column, e.g.
    ``'1 ANDHRA PRADESH'`` becomes ``'1'`` and ``'ANDHRA PRADESH'``.

    Args:
        df: DataFrame to modify; columns are addressed by position (0 and 1).
        strIndex: Position of the first row to process.
        stpIndex: Position of the last row to process (inclusive).

    Returns:
        None. ``df`` itself is modified.

    Notes:
        - Values are written through ``df.iloc`` rather than onto the row
          objects yielded by ``iterrows()``, which pandas documents as
          possibly being copies.
        - Cells that are not strings or contain no space are left untouched
          (previously a cell without a space lost its last character to the
          first column).
        - The separating space itself is dropped from the remaining text.
    """
    # Slicing a range clips and handles negative bounds like iloc slicing does
    for pos in range(len(df))[strIndex:stpIndex + 1]:
        text = df.iloc[pos, 1]
        if not isinstance(text, str):
            continue
        head, separator, tail = text.partition(' ')
        if not separator:
            # No space: there is nothing to split off
            continue
        df.iloc[pos, 0] = head
        df.iloc[pos, 1] = tail

In [12]:
def mergeRows(df, strIndex, stpIndex):
    """Merge the rows labelled ``strIndex`` .. ``stpIndex`` into row ``strIndex``.

    For each column, the cells of the selected rows are converted with
    ``str``, joined with single spaces and stripped of surrounding whitespace.
    The result is written to the row labelled ``strIndex`` of ``df``.

    Args:
        df: DataFrame to modify in place.
        strIndex: Index label of the first row to merge; also the row that
            receives the merged text.
        stpIndex: Index label of the last row to merge (inclusive, as with
            ``df.loc`` slicing).

    Returns:
        None. ``df`` itself is modified.

    Notes:
        - The rows after ``strIndex`` are not removed; drop them afterwards if
          they are no longer wanted.
        - ``Series.iteritems()`` (removed in pandas 2.0) is no longer used.
        - The merged row is assigned to ``df`` directly instead of to a
          ``.loc`` slice of it.
    """
    block = df.loc[strIndex:stpIndex]
    merged = [
        ' '.join(str(cell) for cell in block.iloc[:, col]).strip()
        for col in range(block.shape[1])
    ]
    # A plain list is assigned by position, so it works for any column labels
    df.loc[strIndex] = merged

In [13]:
# Trim the garbage rows of every extracted table: State/UT tables first,
# then the city tables (clean() updates each table in place)
for each_table in [*stateTables, *cityTables]:
    clean(each_table)

In [14]:
# In case we're interested in seeing the scraped tables at this point.
# Commented out as it takes too much screen space
# Run if you want
# printnAllTables(cityTables)

In [15]:
# Continue on independent copies of the cleaned DataFrames, so the header and
# index changes made below do not touch the Camelot table objects
states = [table.df.copy() for table in stateTables]
cities = [table.df.copy() for table in cityTables]

In [16]:
# List of top-level column headers. The columns are nested, i.e. they exhibit a
# multi-level (MultiIndex) structure.
# The multi-level structure is generated and applied in the following cells.
# Order matters: entry i is used as the title of the i-th State/UT table and of
# the i-th city table.
column_header = [
    'Road Accidents',
    'Railway Crossing Accidents',
    'Railway Accidents',
    'Total Traffic Accidents'
]
len(column_header)

4

In [17]:
def set_Column(df, title, state):
    """Give ``df`` the two-level column header of a monthly accident table.

    The header consists of "Sl. No.", a name column ("State/UT" or "City") and
    13 columns grouped under ``title``: the twelve months plus "Total".
    ``df`` must therefore have exactly 15 columns. The columns are replaced in
    place; nothing is returned.

    Args:
        df: DataFrame whose ``columns`` are overwritten.
        title: Top-level label shared by the month and total columns.
        state: Truthy to label the name column "State/UT", falsy for "City".
    """
    name_label = "State/UT" if state else "City"
    periods = (
        'Jan.', 'Feb.', 'Mar.', 'Apr.', 'May', 'Jun.',
        'Jul.', 'Aug.', 'Sep.', 'Oct.', 'Nov.', 'Dec.',
        'Total',
    )
    header = [("Sl. No.", ''), (name_label, '')]
    header.extend((title, period) for period in periods)
    df.columns = pd.MultiIndex.from_tuples(header)

In [18]:
# Pair the i-th State/UT table with the i-th city table and give both the i-th
# title from column_header, then move the serial number and name columns into
# the row index.
# NOTE: this cell mutates the frames in place (columns assignment and
# set_index(inplace=True)), so it is not re-runnable: after one run the frames
# no longer have the 15 columns set_Column needs. Re-create `states`/`cities`
# before running it again.
for i,(state,city) in enumerate(zip(states,cities)):
    set_Column(state, column_header[i], True)
    state.set_index(["Sl. No.","State/UT"],inplace=True)
    set_Column(city, column_header[i], False)
    city.set_index(["Sl. No.","City"],inplace=True)

In [19]:
# Apply the index of the first State/UT table to all State/UT tables, so that
# the column-wise concat below lines the rows up even if the scraped labels
# differ slightly between pages.
# NOTE(review): this overwrites the labels by position; it assumes every table
# has the same rows in the same order as states[0] -- a table with a different
# order would be mislabelled silently. TODO confirm. The city tables are not
# aligned this way.
for each in states:
    each.index = states[0].index.copy()

In [20]:
# Put the per-category tables side by side (axis=1): one wide frame for the
# States/UTs and one for the cities, with rows matched on their index
State = pd.concat(states, axis=1)
City  = pd.concat(cities, axis=1)

In [21]:
# In case we're interested in seeing the scraped tables at this point.
# Commented out as it takes too much screen space
# Run if you want
# printnAllTables(stateTables)

# OR print a single one according to your choice
# stateTables[0].df

In [24]:
# Reset the index before saving, for better formatting in the raw CSV:
# reset_index() turns the (Sl. No., name) row index back into ordinary columns,
# and index=False then keeps the new default integer index out of the file.
# The files are written to the current working directory.
State.reset_index().to_csv("Month of Occurrence - wise Number of Traffic Accidents during 2019 (StateUT-wise).csv",index=False)
City.reset_index().to_csv("Month of Occurrence - wise Number of Traffic Accidents during 2019 (City-wise).csv",index=False)