In [1]:
import wget
import ssl
import os
# WARNING (security): this replaces the ssl module's default HTTPS context
# factory with the unverified one, so HTTPS connections that rely on the default
# context in this kernel skip certificate and hostname verification.
# NOTE(review): presumably added so the report download succeeds despite a
# certificate problem on the source site - confirm it is still needed and
# remove it if the site's certificate validates.
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
# Enter NCRB Individual PDF url
pdf_url = 'https://ncrb.gov.in/sites/default/files/adsi_reports_previous_year/Table-2.13_2019_0.pdf'
# Take the local file name from the last path segment of the URL instead of a
# hard-coded character offset, so pointing pdf_url at another report still
# produces the right name.
fileName = os.path.basename(pdf_url)
try:
    # Delete any earlier copy so the report is downloaded afresh
    os.remove(fileName)
except FileNotFoundError:
    # Nothing to delete on the first run
    pass
# Download report; the name of the saved file is kept for the parsing steps
fileName = wget.download(pdf_url, fileName)

In [3]:
import numpy as np
import pandas as pd
import camelot

In [4]:
# Display the file name returned by the download step
fileName

'Table-2.13_2019_0.pdf'

In [5]:
# Region of each page that holds the table, as one "x1,y1,x2,y2" string.
# NOTE(review): per the camelot docs these are PDF-space coordinates of the
# top-left and bottom-right corners - confirm against the plotted page.
PAGES_1_TO_5_AREA = "25.972342487883687,740.0974110141216,567.5437802907917,64.80410280935504"

# Comma-separated x positions passed as explicit column separators
# (13 separators, matching the 14 columns labelled later).
PAGES_1_TO_5_COLUMNS = "49.0588691437803,148.13854604200324,186.6160904684976,225.09363489499194,257.79954765751216,286.65770597738293,323.21137318255256,359.7650403877222,386.6993214862682,424.2149273021002,455.95890145395805,491.55063004846534,530.0281744749597"

# Parse pages 1-5 with the stream parser using the same area and separators
# for every page.
tables = camelot.read_pdf(
    fileName,
    pages="1-5",
    flavor='stream',
    table_areas=[PAGES_1_TO_5_AREA],
    columns=[PAGES_1_TO_5_COLUMNS],
    split_text=True,
)

In [6]:
# Display the parsed table list (one table per requested page is expected)
tables

<TableList n=5>

In [7]:
def printnAllTables(tables):
    """Print every table's DataFrame, each preceded by a 1-based page label.

    Parameters
    ----------
    tables : iterable
        Objects exposing a ``df`` attribute; they are printed in order.

    Each table is followed by a blank line. Nothing is returned.
    """
    for page_no, table in enumerate(tables, start=1):
        print("Page No: ", page_no)
        print(table.df)
        print()

In [8]:
# In case we're interested in seeing the original scraped tables.
# Commented as it takes too much screen space
# Run if you want
# printnAllTables(tables)

In [9]:
def clean(table, header_marker='STATES', footer_marker='TOTAL (ALL INDIA)'):
    """Trim garbage rows detected above the header row and below the totals row.

    The second column of ``table.df`` is searched for ``header_marker`` (first
    match from the top) and ``footer_marker`` (last match at or below the
    header). ``table.df`` is replaced by the rows between the two, both marker
    rows included; the original row labels are kept.

    Parameters
    ----------
    table : object with a ``df`` attribute holding a pandas DataFrame
        The scraped table; its ``df`` attribute is reassigned.
    header_marker : str, optional
        Exact text of the first row to keep.
    footer_marker : str, optional
        Exact text of the last row to keep.

    Returns
    -------
    The same ``table`` object, for convenience; the change is also visible
    through the caller's own reference.

    Raises
    ------
    ValueError
        If either marker cannot be found, instead of stripping rows one by one
        until the frame is empty and indexing fails.
    """
    labels = table.df.iloc[:, 1].tolist()

    try:
        first = labels.index(header_marker)
    except ValueError:
        raise ValueError(
            "clean(): no row has {!r} in its second column".format(header_marker)
        ) from None

    # Search bottom-up, but never above the header row
    last = next(
        (pos for pos in range(len(labels) - 1, first - 1, -1)
         if labels[pos] == footer_marker),
        None,
    )
    if last is None:
        raise ValueError(
            "clean(): no row at or below the {!r} row has {!r} in its second column".format(
                header_marker, footer_marker
            )
        )

    table.df = table.df.iloc[first:last + 1]
    return table

In [10]:
# Trim every scraped page; clean() reassigns each table's df, so the tables
# held in `tables` are updated and the return value is not needed here.
for each_table in tables:
    clean(each_table)

In [11]:
# Show the scraped tables as they stand after cleaning.
# NOTE: this prints every page in full and takes a lot of screen space;
# comment the call out if the output is not needed.
printnAllTables(tables)

Page No:  1
    0                  1    2    3  4    5     6     7  8     9     10    11  \
4                  STATES                                                      
5    1     ANDHRA PRADESH   19    8  0   27   219   139  0   358    64    61   
6    2  ARUNACHAL PRADESH    0    0  0    0     9     3  0    12     0     0   
7    3              ASSAM    6    5  0   11   207    54  0   261     8    18   
8    4              BIHAR   30   10  0   40    56    17  0    73    12    18   
9    5       CHHATTISGARH    7    0  0    7   238    53  0   291   110   238   
10   6                GOA    1    0  0    1    18    12  0    30     4     5   
11   7            GUJARAT    8    4  0   12   380   135  0   515   130   251   
12   8            HARYANA    0    2  0    2   261    37  0   298    27    20   
13   9   HIMACHAL PRADESH    0    0  0    0     9     4  0    13     4     5   
14  10    JAMMU & KASHMIR    0    0  0    0     4     3  0     7     4     4   
15  11          JHARKHAND   

In [12]:
# Top-level column titles: one inner list per scraped page (pages 1-5),
# three titles per page, in left-to-right order.
# The columns are nested, i.e. they have a multi-level structure in which each
# title spans Male/Female/Transgender/Total; that structure is generated and
# applied by set_Column.
# NOTE(review): 'By Self Inflicting  Injury' contains a double space that ends
# up in the exported header - confirm whether it is intentional.
column_header = [
    ['By Consuming Sleeping Pills','By Drowning','By Fire/Self Immolation'],
    ['By Firearms','By Hanging','By Poison (Total)'],
    ['By Poison (By Consuming Insecticides)','By Poison (By Consuming other Poison)','By Self Inflicting  Injury'],
    ['Jumping (Total)','Jumping (from Building)','Jumping (from Other Sites)'],
    ['Jumping (by Jumping off Moving Trains/Vehicles)','By Coming under Running Vehicles/Trains','By Touching Electric Wire']
]
# Display the number of title groups (it should equal the number of scraped pages)
len(column_header)

5

In [13]:
def set_Column(df, title):
    """Label ``df`` in place with the report's two-level column header.

    The first two columns become ``("Sl. No.", "")`` and ``("State/UT", "")``.
    After them, every entry of ``title`` contributes four columns in the order
    Male, Female, Transgender, Total. The header is built from however many
    titles are supplied, so the same function serves pages with three title
    groups as well as pages with two.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` are replaced; it must have exactly
        ``2 + 4 * len(title)`` columns.
    title : sequence of str
        Top-level titles in left-to-right column order.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        Raised by pandas when the number of generated labels does not match
        the number of columns in ``df``.
    """
    sub_columns = ('Male', 'Female', 'Transgender', 'Total')
    column_list = [("Sl. No.", ''), ("State/UT", '')]
    column_list.extend(
        (heading, sub_column) for heading in title for sub_column in sub_columns
    )
    df.columns = pd.MultiIndex.from_tuples(column_list)

In [14]:
# Give every page its two-level header, then move the two identifying columns
# ("Sl. No." and "State/UT") into the row index.
# NOTE: run once per freshly parsed `tables`; after set_index the frames have
# two fewer columns, so a second pass would no longer match the header length.
for i, page_table in enumerate(tables):
    set_Column(page_table.df, column_header[i])
    page_table.df.set_index(keys=["Sl. No.", "State/UT"], inplace=True)

In [15]:
# In case we're interested in seeing the scraped tables at this point.
# Commented as it takes too much screen space
# Run if you want
# printnAllTables(tables)

# OR print a single one of your choice (valid positions are 0 to len(tables) - 1)
# tables[0].df

In [16]:
# Page 6 has a different layout, so it gets its own table region and its own
# column separators (9 separators, matching the 10 columns labelled later).
# NOTE(review): per the camelot docs the area string is "x1,y1,x2,y2" in PDF
# coordinate space (top-left, bottom-right) - confirm against the plotted page.
PAGE_6_AREA = "25.972342487883687,709.3148100703146,563.6960258481422,83.08127211974046"
PAGE_6_COLUMNS = "49.0588691437803,150.06242326332796,188.5399676898223,229.90332794830374,273.1905654281099,326.0971890145396,393.43289176090474,447.30145395799684,505.9797092084007"

table6 = camelot.read_pdf(
    fileName,
    pages="6",
    flavor='stream',
    table_areas=[PAGE_6_AREA],
    columns=[PAGE_6_COLUMNS],
    split_text=True,
)

In [17]:
def set_Column(df, title):
    """Label ``df`` in place with the report's two-level column header.

    The first two columns become ``("Sl. No.", "")`` and ``("State/UT", "")``.
    After them, every entry of ``title`` contributes four columns in the order
    Male, Female, Transgender, Total. The header is built from however many
    titles are supplied, so this definition works both for the two-group last
    page and for the three-group pages, and re-running an earlier cell after
    this one no longer breaks.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` are replaced; it must have exactly
        ``2 + 4 * len(title)`` columns.
    title : sequence of str
        Top-level titles in left-to-right column order.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        Raised by pandas when the number of generated labels does not match
        the number of columns in ``df``.
    """
    sub_columns = ('Male', 'Female', 'Transgender', 'Total')
    column_list = [("Sl. No.", ''), ("State/UT", '')]
    column_list.extend(
        (heading, sub_column) for heading in title for sub_column in sub_columns
    )
    df.columns = pd.MultiIndex.from_tuples(column_list)

In [18]:
# Page 6 carries two column groups: 'By Other Means' and the overall 'Total'.
# NOTE(review): unlike pages 1-5 this table is never passed through clean();
# presumably its table area was chosen to exclude stray rows - confirm.
last_page_df = table6[0].df
set_Column(last_page_df, ['By Other Means', 'Total'])
last_page_df.set_index(keys=["Sl. No.", "State/UT"], inplace=True)

In [19]:
# Collect an independent copy of every page's DataFrame (pages 1-5 first,
# page 6 last) so later changes do not touch the frames held by the tables.
DataFrames = [page_table.df.copy() for page_table in list(tables) + [table6[0]]]

In [20]:
# Use the first page's index on all DataFrames for uniformity when concatenating.
# The index is assigned by position, so every frame must have the same number
# of rows as page 1; check that explicitly so a mismatch names the offending
# frame instead of surfacing as a bare pandas length error.
index = tables[0].df.index.copy()
for i, each_df in enumerate(DataFrames):
    if len(each_df) != len(index):
        raise ValueError(
            "DataFrames[{}] has {} rows but the first page has {}; "
            "re-check the table area or cleaning for that page".format(
                i, len(each_df), len(index)
            )
        )
    each_df.index = index

In [21]:
# Every frame now carries the same row index, so place them side by side
# to form one wide table.
final = pd.concat(DataFrames, axis="columns")

In [22]:
# Turn the (Sl. No., State/UT) index back into ordinary columns before saving,
# for better formatting in the raw CSV.
output_csv = "Distribution of Suicides by Means Adopted during 2019 (State & UT-wise).csv"
final.reset_index().to_csv(output_csv, index=False)