In [1]:
import wget
import ssl
import os
# SECURITY: swaps the default HTTPS context factory for the unverified one, so code that
# relies on the default context (e.g. urllib-based downloads) no longer validates TLS certificates.
# NOTE(review): presumably a workaround so the download below succeeds - confirm it is still needed.
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
# Enter NCRB Individual PDF url
pdf_url = 'https://ncrb.gov.in/sites/default/files/adsi_reports_previous_year/Table-1A.13_2019.pdf'
# Use the last path segment of the URL as the local file name. A fixed character
# offset (previously pdf_url[67:]) silently produces a wrong name as soon as the
# URL prefix changes length.
fileName = pdf_url.rsplit('/', 1)[-1]
try:
    # Delete any earlier copy so the report is downloaded afresh
    os.remove(fileName)
except FileNotFoundError:
    # Nothing to delete on the first run
    pass
# Download report; keep the path that wget reports it actually wrote to
fileName = wget.download(pdf_url, fileName)

In [3]:
import numpy as np
import pandas as pd
import camelot

In [4]:
# Display the local path returned by wget.download for the saved PDF
fileName

'Table-1A.13_2019.pdf'

In [5]:
# Extraction settings for camelot's text-based ("stream") parser.
# NOTE(review): per the camelot docs, table_areas is "x1,y1,x2,y2" (left-top, right-bottom)
# in PDF coordinate space and columns lists the x-coordinates of the column separators;
# these values look hand-tuned for this report layout - confirm before reusing on another PDF.
read_options = dict(
    flavor='stream',
    table_areas=[
        "33.18688206785138,718.9343728652543,562.2531179321487,63.842146529861076"
    ],
    columns=[
        "50.982746365105015,165.4534410339257,203.93098546042006,242.4085298869144,280.88607431340876,330.9068820678514,378.041873990307,425.1768659127626,476.1596122778676,521.3707269789984"
    ],
    split_text=True,
    pages="all",
)
tables = camelot.read_pdf(fileName, **read_options)

In [6]:
# Display the TableList; its repr reports how many tables were extracted
tables

<TableList n=2>

In [7]:
def printnAllTables(tables):
    """Print the DataFrame of every table in ``tables``, one after another.

    Each table is preceded by a "Page No" line holding its 1-based position in
    ``tables`` and followed by a blank line.
    """
    for position, table in enumerate(tables, start=1):
        print("Page No: ", position)
        print(table.df)
        print()

In [8]:
# In case we're interested in seeing the original scraped tables.
# Commented as it takes too much screen space
# Run if you want
# printnAllTables(tables)

In [9]:
# Removes Garbage Rows that may have been detected Before the 'STATES' row
# AND
# Removes Garbage Rows that may have been detected after the 'TOTAL (ALL INDIA)' row
def clean(table):
    """Trim ``table.df`` to the rows from 'STATES' through 'TOTAL (ALL INDIA)'.

    The second column (position 1) is searched for the first cell equal to
    'STATES' and, after it, the last cell equal to 'TOTAL (ALL INDIA)'. Every
    row outside that inclusive range is dropped by re-assigning ``table.df``.

    Both markers are located before anything is modified, so a table that lacks
    a marker is left untouched rather than being trimmed down to nothing.

    Parameters
    ----------
    table : object with a ``df`` attribute holding a pandas DataFrame.

    Returns
    -------
    The same ``table`` object (returned for convenience; it is updated in place).

    Raises
    ------
    IndexError
        If either marker row cannot be found, or the frame has fewer than two columns.
    """
    labels = table.df.iloc[:, 1].tolist()

    try:
        first = labels.index('STATES')
    except ValueError:
        raise IndexError("no 'STATES' row found in column 1; table left unchanged") from None

    # Scan upwards from the bottom, stopping just above the 'STATES' row itself.
    last = next(
        (pos for pos in range(len(labels) - 1, first, -1)
         if labels[pos] == 'TOTAL (ALL INDIA)'),
        None,
    )
    if last is None:
        raise IndexError(
            "no 'TOTAL (ALL INDIA)' row found after the 'STATES' row; table left unchanged"
        )

    table.df = table.df.iloc[first:last + 1]
    return table

In [10]:
# Splits the second column into a 1st-column index and 2nd-column text when faulty/required
def split_by_first_space(df, strIndex, stpIndex):
    """Split merged "<index> <text>" cells of column 1 into columns 0 and 1, in place.

    For every row at positions ``strIndex`` .. ``stpIndex`` (inclusive, positional
    like ``iloc``), the part of column 1 before the first space is written to
    column 0 and the part after it is written back to column 1.

    Rows whose column-1 cell is not a string, or contains no space, are left
    unchanged. The values are written through ``df.iat`` so the caller's frame
    is really updated; writing to the rows yielded by ``iterrows`` is not
    reliable because those rows may be copies.

    Parameters
    ----------
    df : pandas.DataFrame, modified in place.
    strIndex : int, position of the first row to process.
    stpIndex : int, position of the last row to process (inclusive).
    """
    # Slicing a range reproduces iloc's slice semantics (negative / out-of-range bounds).
    for row in range(len(df))[strIndex:stpIndex + 1]:
        cell = df.iat[row, 1]
        if not isinstance(cell, str):
            continue
        head, separator, tail = cell.partition(' ')
        if not separator:
            # No space: nothing to split, keep the row as it is.
            continue
        df.iat[row, 0] = head
        df.iat[row, 1] = tail

In [11]:
def mergeRows(df, strIndex, stpIndex):
    """Merge rows ``strIndex`` .. ``stpIndex`` column by column into row ``strIndex``.

    The rows are selected by label with ``df.loc[strIndex:stpIndex]`` (inclusive).
    For every column the cells of those rows are converted with ``str``, joined
    with single spaces and stripped; the results overwrite row ``strIndex`` of
    ``df`` itself. The other rows in the range are left as they are.

    The result is written to ``df`` directly rather than to the ``.loc`` slice,
    so the update cannot be lost on a temporary copy, and ``DataFrame.items`` is
    used because ``Series.iteritems`` no longer exists in pandas 2.x.

    Parameters
    ----------
    df : pandas.DataFrame, modified in place.
    strIndex : index label of the first row; receives the merged text.
    stpIndex : index label of the last row to merge (inclusive).
    """
    block = df.loc[strIndex:stpIndex]
    if block.empty:
        # Nothing selected; assigning below would otherwise append a new row.
        return

    merged = [
        ' '.join(str(cell) for cell in column).strip()
        for _, column in block.items()
    ]
    df.loc[strIndex] = merged

In [12]:
# Trim each extracted table to its 'STATES' .. 'TOTAL (ALL INDIA)' rows
for extracted in tables:
    clean(extracted)

In [13]:
# Show the scraped tables as they stand after clean() has been applied.
# The output takes a lot of screen space;
# comment the call out if you do not need it.
printnAllTables(tables)

Page No:  1
    0                   1   2  3   4  5  6  7   8  9   10
1                   STATES                               
2    1      ANDHRA PRADESH   6  1   6  0  0  0   0  0   0
3    2   ARUNACHAL PRADESH   0  0   0  0  0  0   0  0   0
4    3               ASSAM   0  0   0  0  0  0   0  0   0
5    4               BIHAR   0  0   0  0  0  0   0  0   0
6    5        CHHATTISGARH   0  0   0  0  0  0   0  0   0
7    6                 GOA   0  0   0  0  0  0   0  0   0
8    7             GUJARAT   0  0   0  0  0  0   0  0   0
9    8             HARYANA   0  0   0  0  0  0   0  0   0
10   9    HIMACHAL PRADESH   0  0   0  0  0  0   0  0   0
11  10     JAMMU & KASHMIR   0  0   0  0  0  0   0  0   0
12  11           JHARKHAND   0  0   0  0  0  0   8  0   8
13  12           KARNATAKA   0  0   0  0  0  0   0  0   0
14  13              KERALA   4  0   4  0  0  0   2  0   2
15  14      MADHYA PRADESH  12  0  12  0  0  0   0  0   0
16  15         MAHARASHTRA   0  0   0  0  0  0   0  0   0
17

In [14]:
# Top-level column headers, one tuple per extracted table.
# The columns are nested (multi-level): each header below spans the sub-columns
# that are generated and applied in the cells that follow.
column_header = [
    (
        "Fault of Driver",
        "Sabotage by Extremist/Terrorist/Others",
        "Signalmen's Error",
    ),
    (
        "Mechanical Failures (like Poor Design, Track Faults,Bridge/Tunnel Collapse etc.)",
        "Other Causes",
        "Total",
    ),
]
len(column_header)

2

In [15]:
def set_Column(df, title):
    """Replace ``df.columns`` with a two-level header, in place.

    The first two columns become ("Sl. No.", '') and ("State/UT", ''). They are
    followed, for each of ``title[0]``, ``title[1]`` and ``title[2]`` in turn, by
    the three sub-columns 'Cases', 'Injured' and 'Dead' (eleven columns in all).

    Parameters
    ----------
    df : pandas.DataFrame whose ``columns`` attribute is overwritten.
    title : sequence providing the three top-level headers at positions 0, 1 and 2.
    """
    first, second, third = title[0], title[1], title[2]
    identifier_columns = [("Sl. No.", ''), ("State/UT", '')]
    measure_columns = [
        (cause, measure)
        for cause in (first, second, third)
        for measure in ('Cases', 'Injured', 'Dead')
    ]
    df.columns = pd.MultiIndex.from_tuples(identifier_columns + measure_columns)

In [16]:
# Give each table its two-level header, then move the serial number and
# state name into the row index.
for position, table in enumerate(tables):
    set_Column(table.df, column_header[position])
    table.df.set_index(["Sl. No.", "State/UT"], inplace=True)

In [17]:
# In case we're interested in seeing the scraped tables at this point.
# Commented as it takes too much screen space
# Run if you want
# printnAllTables(tables)

# OR print a single one of your choice (only indices 0 and 1 exist for this PDF)
# tables[0].df

In [18]:
# Take an independent copy of each table's DataFrame so the steps below
# do not touch the camelot tables themselves.
DataFrames = [
    table.df.copy()
    for table in tables
]

In [19]:
# Setting a common Index for concatenation.
# Every table after the first receives a copy of the first table's row index,
# so this works for any number of tables instead of exactly two.
# NOTE: the labels are overwritten, not matched - this assumes all tables list
# the same rows in the same order (pandas raises ValueError if the lengths differ).
reference_index = DataFrames[0].index
for frame in DataFrames[1:]:
    frame.index = reference_index.copy()

In [20]:
# Place the tables side by side (column-wise); their rows line up because
# they share the same index.
final = pd.concat(DataFrames, axis="columns")

In [21]:
# Turn the index levels back into ordinary columns before saving,
# for better formatting in the raw CSV.
output_csv = "Cause - wise Distribution of Railway Accidents during 2019 (State & UT-wise).csv"
final.reset_index().to_csv(output_csv, index=False)