In [1]:
import wget
import ssl
import os
# NOTE(review): this swaps the default HTTPS context factory for the unverified one,
# so every HTTPS request made through it in this kernel skips certificate verification.
# Presumably needed for the download below to succeed -- TODO confirm, and remove if the
# server's certificate validates.
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
# Enter NCRB Individual PDF url
pdf_url = 'https://ncrb.gov.in/sites/default/files/adsi_reports_previous_year/Table-1A.2_2019.pdf'
# Use the last path segment of the URL as the local file name. Splitting on the final '/'
# keeps working when a different report URL is pasted above, unlike a fixed character offset.
fileName = pdf_url.rsplit('/', 1)[-1]
try:
    # Delete any previously downloaded copy so the report is fetched afresh
    os.remove(fileName)
except FileNotFoundError:
    # Nothing to delete on the first run
    pass
# Download report; fileName is rebound to whatever path wget.download returns
fileName = wget.download(pdf_url, fileName)

In [3]:
import numpy as np
import pandas as pd
import camelot

In [4]:
# Show the local file name returned by the download step
fileName

'Table-1A.2_2019.pdf'

In [5]:
# Region of each page to parse, as one comma-separated coordinate string.
table_area = "24.52943457189015,721.8202417037362,574.7583198707594,60.956277691379164"
# Comma-separated x positions passed as column separators (13 values).
column_separators = "47.134991922455576,148.13854604200324,190.6160904684976,220.28394184168016,255.87567043618742,288.5815831987076,321.2874959612278,351.10759289176093,382.8515670436188,413.63360258481424,447.30145395799684,485.77899838449116,528.104297253635"

# Parse every page of the downloaded PDF with the 'stream' flavor, using the same
# area and column separators on each page.
tables = camelot.read_pdf(
    fileName,
    flavor='stream',
    table_areas=[table_area],
    columns=[column_separators],
    split_text=True,
    pages="all",
)

In [6]:
# Show the summary of the extracted table list
tables

<TableList n=2>

In [7]:
def printnAllTables(tables):
    """Print the DataFrame of every table, each under a numbered heading.

    Args:
        tables: Iterable of objects exposing a ``df`` attribute.

    The "Page No" shown is the 1-based position of the table in ``tables``.
    A blank line is printed after each table.
    """
    for page_no, table in enumerate(tables, start=1):
        print("Page No: ", page_no)
        print(table.df)
        print()

In [8]:
# In case we're interested in seeing the original scraped tables.
# Commented as it takes too much screen space
# Run if you want
# printnAllTables(tables)

In [9]:
# Removes Garbage Rows that may have been detected Before the 'STATES' row
# AND
# Removes Garbage Rows that may have been detected after the 'TOTAL (ALL INDIA)' row
def clean(table,
          start_markers=('STATES', 'AGRA'),
          end_markers=('TOTAL (ALL INDIA)', 'TOTAL (CITIES)')):
    """Trim ``table.df`` to the rows between the first start marker and the last end marker.

    The second column (position 1) is scanned. Rows before the first cell equal to
    one of ``start_markers`` are dropped, and rows after the last cell (at or below
    that start row) equal to one of ``end_markers`` are dropped. Both marker rows
    are kept, and the surviving rows keep their original index labels.

    Args:
        table: Object with a ``df`` attribute holding a pandas DataFrame.
        start_markers: Collection of cell values that mark the first row to keep.
        end_markers: Collection of cell values that mark the last row to keep.

    Returns:
        The same ``table`` object; ``table.df`` is reassigned in place, so the
        return value is only a convenience.

    Raises:
        IndexError: If the frame has fewer than two columns, or if no start or
            end marker is found. In that case ``table.df`` is left untouched
            instead of being trimmed down to an empty frame first.
    """
    frame = table.df
    labels = frame.iloc[:, 1].tolist()

    # Locate both boundaries before touching table.df so a failure leaves it intact.
    first = next((pos for pos, label in enumerate(labels) if label in start_markers), None)
    if first is None:
        raise IndexError(
            "clean(): none of the start markers %r found in the second column" % (tuple(start_markers),)
        )

    # Search upwards from the bottom, never going above the start row.
    last = next(
        (pos for pos in range(len(labels) - 1, first - 1, -1) if labels[pos] in end_markers),
        None,
    )
    if last is None:
        raise IndexError(
            "clean(): none of the end markers %r found after the start row" % (tuple(end_markers),)
        )

    table.df = frame.iloc[first:last + 1]
    return table

In [10]:
# Splits the second column into 1st column index and 2nd column text when faulty/required
def split_by_first_space(df, strIndex, stpIndex):
    """Split "<serial> <name>" cells of the second column into the first two columns, in place.

    For every row position from ``strIndex`` to ``stpIndex`` (inclusive), the text
    in column position 1 is split at its first space: the part before the space is
    written to column position 0 and the remainder, without leading spaces, is
    written back to column position 1.

    Cells that are not strings or contain no space are left unchanged.

    Args:
        df: DataFrame to modify in place.
        strIndex: First row position (0-based) to process.
        stpIndex: Last row position (0-based, inclusive) to process.

    Returns:
        None. ``df`` itself is updated.
    """
    # Slicing a range gives the same position semantics as df.iloc[strIndex:stpIndex+1]
    # (out-of-range and negative bounds included) while still writing to df itself
    # rather than to row copies handed out by iterrows().
    for row_pos in range(len(df))[strIndex:stpIndex + 1]:
        text = df.iat[row_pos, 1]
        if not isinstance(text, str):
            continue
        serial, separator, remainder = text.partition(' ')
        if not separator:
            # No space: nothing to split, keep the cell as it is.
            continue
        df.iat[row_pos, 0] = serial
        df.iat[row_pos, 1] = remainder.lstrip()

In [11]:
def mergeRows(df, strIndex, stpIndex):
    """Merge the rows labelled ``strIndex``..``stpIndex`` into the first of them, in place.

    For each column, the cells of the selected rows are converted to strings and
    joined with single spaces (top to bottom), then stripped of leading and
    trailing whitespace. The merged values overwrite the first selected row of
    ``df``. The remaining selected rows are left in the frame.

    Args:
        df: DataFrame to modify in place.
        strIndex: Index label of the first row to merge (label-based, as in ``df.loc``).
        stpIndex: Index label of the last row to merge (inclusive).

    Returns:
        None. ``df`` itself is updated. Nothing happens if the label range selects
        no rows.
    """
    block = df.loc[strIndex:stpIndex]
    if block.empty:
        return

    # Build one merged string per column, addressing columns by position so that
    # non-default or duplicated column labels do not matter.
    merged = [
        ' '.join(str(cell) for cell in block.iloc[:, col_pos]).strip()
        for col_pos in range(block.shape[1])
    ]

    # Write to the caller's frame (not to the slice) and assign by position, so
    # the values are not re-aligned against the column labels.
    df.loc[block.index[0]] = merged

In [12]:
# Trim every extracted table to its marker rows; clean() reassigns each table's .df
for each_table in tables:
    clean(each_table)

In [13]:
# In case we're interested in seeing the scraped tables at this point.
# NOTE: the call below is active and prints every table in full, which takes a lot of screen space.
# Comment it out if you do not want that output.
printnAllTables(tables)

Page No:  1
    0                  1       2       3       4      5     6      7     8   \
1                  STATES                                                     
2    1     ANDHRA PRADESH   20677   24619    7984   1511     1   1519     0   
3    2  ARUNACHAL PRADESH     200     275     109      0     0      0     0   
4    3              ASSAM    8055    7081    3245    591     0    593     5   
5    4              BIHAR   10007    7206    7205   1589     0   1622   241   
6    5       CHHATTISGARH   13899   13089    5003    466     0    466     1   
7    6                GOA    3440    1460     299     50     0     50     0   
8    7            GUJARAT   16503   15976    7428    994     0   1012     0   
9    8            HARYANA   10937    9247    5269   1362   126   1278   104   
10   9   HIMACHAL PRADESH    2896    4740    1124      0     0      0     0   
11  10    JAMMU & KASHMIR    5795    7517    1009     44     0     45     0   
12  11          JHARKHAND    5217    381

In [14]:
def set_Column(df, state):
    """Replace ``df``'s columns with the two-level traffic-accident header, in place.

    The header has 14 columns: a serial-number column, a place column, and then
    Cases / Injured / Died under each of the four accident groups.

    Args:
        df: DataFrame whose ``columns`` attribute is overwritten.
        state: Truthy to label the place column "State/UT", falsy for "City".
    """
    place_label = "State/UT" if state else "City"
    accident_groups = (
        'Road Accidents',
        'Railway Accidents',
        'Railway Crossing Accidents',
        'Total Traffic Accidents',
    )
    measures = ('Cases', 'Injured', 'Died')

    # Two identifying columns with an empty second level, then group x measure.
    header = [("Sl. No.", ''), (place_label, '')]
    header.extend((group, measure) for group in accident_groups for measure in measures)
    df.columns = pd.MultiIndex.from_tuples(header)

In [15]:
# Apply the header to every table. `not i` is True only for i == 0, so the first
# table gets the "State/UT" place column and every later table gets "City".
for i in range(len(tables)):
    state = not i
    set_Column(tables[i].df, state)

In [16]:
# In case we're interested in seeing the scraped tables at this point.
# Commented as it takes too much screen space
# Run if you want
# printnAllTables(tables)

# OR print a single one of your choice
# tables[23].df

In [17]:
# Save each table as CSV. index=False leaves the row index out of the file, for better formatting in RAW CSV.
# tables[0] goes to the State/UT-wise file and tables[1] to the City-wise file.
tables[0].df.to_csv("Cases Reported and Persons Injured and Died due to Traffic Accidents during 2019 (StateUT-wise).csv",index=False)
tables[1].df.to_csv("Cases Reported and Persons Injured and Died due to Traffic Accidents during 2019 (City-wise).csv",index=False)