In [1]:
import wget
import ssl
import os
# NOTE(review): this swaps the default HTTPS context factory for the unverified
# one, i.e. TLS certificate verification is turned off for code that relies on
# the default context. Presumably a workaround so the report download in the
# next cell succeeds - confirm it is still needed before reusing this notebook.
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
# URL of the individual NCRB PDF report to download
pdf_url = 'https://ncrb.gov.in/sites/default/files/adsi_reports_previous_year/Table-1A.11_2019.pdf'
# Local file name = last path segment of the URL. Derived from the URL instead
# of slicing at a hard-coded character offset, so it stays correct when a
# different report URL is entered above.
fileName = pdf_url.rsplit('/', 1)[-1]
try:
    # Delete any previous copy so the report is downloaded afresh
    os.remove(fileName)
except FileNotFoundError:
    # No previous copy exists (e.g. first run): nothing to delete
    pass
# Download the report; the returned value is kept as the local file name
fileName = wget.download(pdf_url, fileName)

In [3]:
import numpy as np
import pandas as pd
import camelot

In [4]:
# Display the local file name of the downloaded report
fileName

'Table-1A.11_2019.pdf'

In [5]:
# Region of each page that holds the table, as an "x1,y1,x2,y2" string
# (see the camelot documentation for the coordinate convention).
main_table_area = "28.37718901453958,706.4289412318327,578.6060743134088,52.29867117593344"
# x-coordinates of the 13 column separators, i.e. 14 columns per extracted table
main_column_separators = "49.0588691437803,146.21466882067853,184.69221324717287,220.28394184168016,254.91373182552508,289.54352180936996,328.02106623586434,365.5366720516963,394.3948303715671,429.98655896607437,462.69247172859457,501.1700161550889,535.7998061389338"

# Extract the tables from pages 1-10 using the same area and separators on every page
tables = camelot.read_pdf(
    fileName,
    flavor='stream',
    table_areas=[main_table_area],
    columns=[main_column_separators],
    split_text=True,
    pages="1-10",
)

In [6]:
# Display the extracted table list (pages 1-10 were requested above)
tables

<TableList n=10>

In [7]:
def printnAllTables(tables):
    """Print the DataFrame of every table in ``tables``.

    Each frame is preceded by a "Page No" line carrying the table's 1-based
    position in ``tables`` and followed by a blank line.

    tables: sequence of objects exposing a ``df`` attribute.
    """
    for page_no, table in enumerate(tables, start=1):
        print("Page No: ", page_no)
        print(table.df)
        print()

In [8]:
# In case we're interested in seeing the original scraped tables.
# Commented as it takes too much screen space
# Run if you want
# printnAllTables(tables)

In [9]:
def clean(table):
    """Trim a scraped table to the rows between its first and last real rows.

    Rows detected before the first row whose second column reads 'STATES' or
    'AGRA' are dropped, and so are rows detected after the last row whose
    second column reads 'TOTAL (ALL INDIA)' or 'TOTAL (CITIES)'.

    table: object with a ``df`` DataFrame attribute. ``table.df`` is replaced
        by the trimmed slice; the original row labels are kept.

    Returns the same ``table`` object. The return value is only a convenience,
    since the change is made on the object that was passed in.

    Raises IndexError when a start or end marker row cannot be found. Both
    bounds are located before anything is modified, so ``table.df`` is left
    untouched on failure.
    """
    start_markers = ('STATES', 'AGRA')
    end_markers = ('TOTAL (ALL INDIA)', 'TOTAL (CITIES)')

    labels = table.df.iloc[:, 1]

    is_start = labels.isin(start_markers).to_numpy()
    if not is_start.any():
        raise IndexError(
            "clean(): no row with %r in the second column" % (start_markers,)
        )
    first = int(is_start.argmax())

    # Only rows from the start marker onwards can close the table
    is_end = labels.iloc[first:].isin(end_markers).to_numpy()
    if not is_end.any():
        raise IndexError(
            "clean(): no row with %r in the second column after the start row"
            % (end_markers,)
        )
    last = first + len(is_end) - 1 - int(is_end[::-1].argmax())

    table.df = table.df.iloc[first:last + 1]
    return table

In [10]:
def split_by_first_space(df, strIndex, stpIndex):
    """Split the second column at its first space into first and second column.

    For every row position from ``strIndex`` to ``stpIndex`` (inclusive,
    positional like ``iloc``), the text of column 1 before the first space is
    written to column 0 and the text after it to column 1. The separating
    space itself is dropped. Cells that are not strings or contain no space
    are left unchanged.

    The frame is modified in place with explicit cell writes rather than by
    assigning to the rows yielded by ``iterrows()``, which pandas does not
    guarantee to write back to the frame.

    Returns None.
    """
    # Slicing a range clips and handles negative positions the same way an
    # iloc slice does.
    for pos in range(len(df))[strIndex:stpIndex + 1]:
        text = df.iat[pos, 1]
        if not isinstance(text, str):
            continue
        head, separator, tail = text.partition(' ')
        if not separator:
            # No space in this cell: nothing to split
            continue
        df.iat[pos, 0] = head
        df.iat[pos, 1] = tail

In [11]:
def mergeRows(df, strIndex, stpIndex):
    """Merge rows ``strIndex``..``stpIndex`` (row labels, inclusive) into row ``strIndex``.

    For every column, the cell values of the selected rows are converted to
    text, joined top to bottom with single spaces and stripped. The merged
    values overwrite row ``strIndex`` of ``df`` in place; the remaining rows
    are left as they are (they are not dropped).

    Returns None.
    """
    block = df.loc[strIndex:stpIndex]
    if block.empty:
        # No rows selected: nothing to merge
        return

    # Iterate the values of each column directly (Series.iteritems() no longer
    # exists in pandas 2.x).
    merged = [
        ' '.join(str(cell) for cell in block.iloc[:, col]).strip()
        for col in range(block.shape[1])
    ]

    # Write to df itself rather than to the slice, so the update is guaranteed
    # to land in the caller's frame.
    df.loc[strIndex] = merged

In [12]:
# Strip the leading/trailing garbage rows from every extracted table (in place)
for scraped_table in tables:
    clean(scraped_table)

In [13]:
# Tables at even positions go to `states`, tables at odd positions to `cities`.
# Copies are taken so later edits do not touch the camelot table objects.
extracted_tables = list(tables)
states = [table.df.copy() for table in extracted_tables[0::2]]
cities = [table.df.copy() for table in extracted_tables[1::2]]

In [14]:
# In case we're interested in seeing the scraped tables at this point.
# Commented as it takes too much screen space
# Run if you want
# printnAllTables(tables)

In [15]:
# Top-level column headers. The columns are nested, i.e. they form a
# multi-level structure: every header below gets the sub-columns
# Male / Female / Transgender / Total.
# Each inner list holds the three headers of one table; the multi-level
# structure itself is generated and applied by set_Column.
column_header = [
    ['Rural Area(Near School/College/Educational Institution)','Rural Area(Near Residential Area)','Rural Area(Near Religious Place)'],
    ['Rural Area(Near Recreation Place/Cinema Hall)','Rural Area(Near Factory)','Rural Area(Others)'],
    ['Rural Area(Sub Total)','Urban Area(Near School/College/Educational Institution)','Urban Area(Near Residential Area)'],
    ['Urban Area(Near Religious Place)','Urban Area(Near Recreation Place/Cinema Hall)','Urban Area(Near Factory/Industrial Area)'],
    ['Urban Area(At Pedestrian Crossing)','Urban Area(Others)','Urban Area(Sub Total)']
]
len(column_header)

5

In [16]:
def set_Column(df, title, city):
    """Replace the columns of ``df`` with the two-level header of one table.

    df: frame with 14 columns; its ``columns`` attribute is overwritten in place.
    title: sequence whose first three items are the top-level headers.
    city: truthy to label the second column "City", otherwise "State/UT".

    The first two columns get an empty second level; each of the three
    headers gets the sub-columns Male, Female, Transgender and Total.
    Returns None.
    """
    place_label = "City" if city else "State/UT"
    sub_columns = ('Male', 'Female', 'Transgender', 'Total')

    header_pairs = [("Sl. No.", ''), (place_label, '')]
    for top_level in (title[0], title[1], title[2]):
        header_pairs.extend((top_level, sub) for sub in sub_columns)

    df.columns = pd.MultiIndex.from_tuples(header_pairs)

In [17]:
# Give every city/state frame its two-level header, then index it by serial
# number and place name (both steps modify the frames in place).
for page_idx, (city_df, state_df) in enumerate(zip(cities, states)):
    page_titles = column_header[page_idx]

    set_Column(city_df, page_titles, True)
    city_df.set_index(["Sl. No.", "City"], inplace=True)

    set_Column(state_df, page_titles, False)
    state_df.set_index(["Sl. No.", "State/UT"], inplace=True)

In [18]:
# In case we're interested in seeing the scraped tables at this point.
# Commented as it takes too much screen space
# Run if you want
# printnAllTables(tables)

# OR print a single one of your choice (valid indices here are 0-9)
# tables[0].df

In [19]:
# Region of pages 11-12 that holds the table, as an "x1,y1,x2,y2" string
# (see the camelot documentation for the coordinate convention).
total_table_area = "37.99657512116317,718.9343728652543,558.4053634894992,91.73887863518618"
# x-coordinates of the 5 column separators, i.e. 6 columns per extracted table
total_column_separators = "60.6021324717286,205.85486268174478,296.2770920840065,379.9657512116317,462.69247172859457"

table11_12 = camelot.read_pdf(
    fileName,
    flavor='stream',
    table_areas=[total_table_area],
    columns=[total_column_separators],
    split_text=True,
    pages="11-12",
)
# NOTE(review): only the second table is cleaned here; table11_12[0] is used
# as extracted - confirm it really has no garbage rows.
clean(table11_12[1])

<Table shape=(55, 6)>

In [20]:
def set_Column_last(df, city):
    """Replace the columns of ``df`` with the two-level "Grand Total" header.

    df: frame with 6 columns; its ``columns`` attribute is overwritten in place.
    city: truthy to label the second column "City", otherwise "State/UT".

    The first two columns get an empty second level; "Grand Total" gets the
    sub-columns Male, Female, Transgender and Total.
    Returns None.
    """
    place_label = "City" if city else "State/UT"
    header_pairs = [("Sl. No.", ''), (place_label, '')]
    header_pairs += [
        ("Grand Total", sub)
        for sub in ('Male', 'Female', 'Transgender', 'Total')
    ]
    df.columns = pd.MultiIndex.from_tuples(header_pairs)

In [21]:
# Apply the "Grand Total" header to both tables and index them by serial
# number and place name: table 0 is labelled State/UT, table 1 City.
for page_table, is_city, place_label in (
    (table11_12[0], False, "State/UT"),
    (table11_12[1], True, "City"),
):
    set_Column_last(page_table.df, is_city)
    page_table.df.set_index(["Sl. No.", place_label], inplace=True)

In [22]:
# Add copies of the "Grand Total" frames to the lists of per-page frames
states.append(table11_12[0].df.copy())
cities.append(table11_12[1].df.copy())

In [23]:
# Join the frames of each list side by side (column-wise) into one wide frame
State = pd.concat(states, axis="columns")
City = pd.concat(cities, axis="columns")

In [24]:
# Turn the index back into ordinary columns before saving, for better
# formatting in the raw CSV files.
state_csv = "Place of Occurrence - wise Road Accident Deaths during 2019 (StateUT-wise).csv"
city_csv = "Place of Occurrence - wise Road Accident Deaths during 2019 (City-wise).csv"

State.reset_index().to_csv(state_csv, index=False)
City.reset_index().to_csv(city_csv, index=False)