In [1]:
import wget
import ssl
import os
# WARNING: this swaps the factory used for default HTTPS contexts for the unverified
# one, so TLS certificates are NOT checked for requests made through the default
# context for the rest of this kernel session.
# NOTE(review): presumably a workaround for certificate errors on the download below — confirm it is still needed.
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
# Enter NCRB Individual PDF url
pdf_url = 'https://ncrb.gov.in/sites/default/files/adsi_reports_previous_year/Table-2.7_2019_0.pdf'
# Local file name = last path segment of the URL. Derived by splitting on the final
# '/' instead of a hard-coded character offset, so pointing pdf_url at another
# report (or another directory) still yields the right name.
fileName = pdf_url.rsplit('/', 1)[-1]
try:
    # Delete any previous copy so the report is downloaded again from scratch
    os.remove(fileName)
except FileNotFoundError:
    # No previous copy exists (e.g. first run): nothing to delete
    pass
# Download report; keep the path returned by wget as the file name used by later cells
fileName = wget.download(pdf_url, fileName)

In [3]:
import numpy as np
import pandas as pd
import camelot

In [4]:
# Display the local file name returned by the download step
fileName

'Table-2.7_2019_0.pdf'

In [5]:
# Parse pages 1-8 of the report with Camelot's "stream" flavor.
# A single table area and a single list of column-separator coordinates are passed
# for the whole page range; split_text is enabled for text that spans separators.
tables = camelot.read_pdf(
    fileName,
    pages="1-8",
    flavor='stream',
    split_text=True,
    table_areas=[
        "28.37718901453958,729.5158919396879,568.9866882067852,53.26062745542741",
    ],
    columns=[
        "49.0588691437803,151.98630048465267,187.57802907915996,226.0555735056543,255.87567043618742,289.54352180936996,324.1733117932149,362.6508562197092,392.4709531502424,427.1007431340873,459.8066558966075,498.28420032310186,536.7617447495962",
    ],
)

In [6]:
# Parse page 9 separately with its own table area and column separators
# (it is given five separators, versus thirteen for pages 1-8).
table9 = camelot.read_pdf(
    fileName,
    pages="9",
    flavor='stream',
    split_text=True,
    table_areas=[
        "37.03463651050081,716.0485040267723,558.4053634894992,89.81496607619825",
    ],
    columns=[
        "62.52600969305332,210.66455573505655,278.962197092084,357.84116316639745,438.6440064620356",
    ],
)

In [7]:
# Display the TableList parsed from pages 1-8
tables

<TableList n=8>

In [8]:
def printnAllTables(tables):
    """Print the DataFrame of every table, each under a 1-based "Page No" heading.

    tables: a sequence whose items expose their data as a ``df`` attribute.
    A blank line is printed after each table.
    """
    for page_no, table in enumerate(tables, start=1):
        print("Page No: ", page_no)
        print(table.df)
        print()

In [9]:
# In case we're interested in seeing the original scraped tables.
# Commented as it takes too much screen space
# Run if you want
# printnAllTables(tables)

In [10]:
# Removes garbage rows that may have been detected before the 'STATES' row
# AND
# removes garbage rows that may have been detected after the 'TOTAL (ALL INDIA)' row
def clean(table):
    """Trim ``table.df`` to the rows from 'STATES' through 'TOTAL (ALL INDIA)'.

    The markers are looked up in the second column (position 1). The kept range
    starts at the first 'STATES' row and ends at the last 'TOTAL (ALL INDIA)' row
    that follows it; both marker rows are kept.

    table: object exposing the scraped data as a DataFrame in ``table.df``.

    Returns the same ``table`` object (``table.df`` is reassigned on it).

    Raises IndexError if the frame has no second column or if either marker row
    is missing. In that case ``table.df`` is left untouched, instead of being
    sliced down to an empty frame before failing.
    """
    labels = table.df.iloc[:, 1].tolist()

    try:
        start = labels.index('STATES')
    except ValueError:
        raise IndexError("no 'STATES' row found in the second column") from None

    # Search backwards so trailing garbage rows after the last total row are dropped.
    stop = None
    for position in range(len(labels) - 1, start, -1):
        if labels[position] == 'TOTAL (ALL INDIA)':
            stop = position
            break
    if stop is None:
        raise IndexError(
            "no 'TOTAL (ALL INDIA)' row found after the 'STATES' row in the second column"
        )

    # One slice instead of dropping a row at a time.
    table.df = table.df.iloc[start:stop + 1]
    # Returned for convenience; callers holding `table` already see the change.
    return table

In [11]:
# Splits the second column into 1st column index and 2nd column text when faulty/required
def split_by_first_space(df, strIndex, stpIndex):
    """Split merged "<serial> <name>" cells of the second column, in place.

    For every row position from ``strIndex`` to ``stpIndex`` (inclusive, with the
    same slicing semantics as ``df.iloc[strIndex:stpIndex + 1]``), the text in
    the second column is split at its first space: the part before the space is
    stored in the first column and the part after it in the second column. The
    separating space itself is dropped.

    Rows whose second-column value is not a string, or contains no space, are
    left unchanged.

    df: DataFrame to modify in place.
    strIndex: first row position to process.
    stpIndex: last row position to process (inclusive).

    Returns None.
    """
    # Values are written back through df.iat: assigning to the row objects yielded
    # by DataFrame.iterrows() is not guaranteed to update the frame.
    row_positions = range(*slice(strIndex, stpIndex + 1).indices(len(df)))
    for position in row_positions:
        text = df.iat[position, 1]
        if not isinstance(text, str):
            continue
        head, separator, tail = text.partition(' ')
        if not separator:
            # No space: nothing to split (str.find would return -1 and mangle the row).
            continue
        df.iat[position, 0] = head
        df.iat[position, 1] = tail

In [12]:
# Trim every page table to the rows from 'STATES' through 'TOTAL (ALL INDIA)'
# (clean() reassigns each table's .df, so `tables` itself is updated).
for each_table in tables:
    clean(each_table)

In [13]:
# Print the scraped tables as they stand at this point (after cleaning).
# The output takes a lot of screen space;
# comment the call out if you do not need it.
printnAllTables(tables)

Page No:  1
    0                  1  2      3  4      5      6     7  8      9     10  \
4                  STATES                                                    
5    1     ANDHRA PRADESH  0    609  0    609    367    39  0    406    67   
6    2  ARUNACHAL PRADESH  0      9  0      9      7     0  0      7     7   
7    3              ASSAM  0    378  0    378    178     3  0    181    32   
8    4              BIHAR  0    111  0    111     31    15  0     46     1   
9    5       CHHATTISGARH  0    854  0    854    139    11  0    150    71   
10   6                GOA  0     35  0     35     54     2  0     56     4   
11   7            GUJARAT  0   1689  0   1689    818    28  0    846    44   
12   8            HARYANA  0    402  0    402    432    86  0    518    51   
13   9   HIMACHAL PRADESH  0    142  0    142     95    10  0    105    13   
14  10    JAMMU & KASHMIR  0     42  0     42     16     0  0     16    10   
15  11          JHARKHAND  0    180  0    180    262

In [14]:
# Column header titles: the columns are nested, i.e. they exhibit a multi-level index structure.
# The multi-level structure is generated and applied in the following cells.
# One inner list per page table (column_header[i] is used for tables[i]), three titles each.
column_header = [
    ['House wife','Professionals/Salaried Persons (Total)','Professionals/Salaried Persons [Government Servants (Total)]'],
    ['Professionals/Salaried Persons [Government Servants(Central/UT Govt. Servants)]','Professionals/Salaried Persons [Government Servants(State  Govt. Servants)]','Professionals/Salaried Persons [Government Servants (Other Statutory Body/etc.)] '],
    ['Professionals/Salaried Persons (Private Sector Enterprises)','Professionals/Salaried Persons (Public Sector Undertaking)','Students'],
    ['Unemployed Persons','Self-employed Persons(Total)','Self-employed Persons[Business (Total)]'],
    ['Self-employed Persons[Business(Vendor)]','Self-employed Persons[Business(Tradesmen)]','Self-employed Persons[Business(Other Business)]'],
    ['Other Self-employed Persons','Persons Engaged in Farming Sector (Total)','Persons Engaged in Farming Sector [Farmers/Cultivators (Total)]'],
    ['Farmers/Cultivators(Who Cultivate Their Own Land*)','Farmers/Cultivators(Who Cultivate On Lease Land#)','Persons engaged in Farming Sector(Agricultural Laborers)'],
    ['Daily Wage Earner','Retired Persons','Other Persons']
]
# Sanity check: number of header groups (expected to match the number of page tables)
len(column_header)

8

In [15]:
def set_Column(df, title):
    """Give ``df`` a two-level column header, in place.

    The first two columns become ("Sl. No.", '') and ("State/UT", ''). They are
    followed by Male / Female / Transgender / Total sub-columns under each of
    ``title[0]``, ``title[1]`` and ``title[2]``, for fourteen labels in total.

    df: DataFrame whose ``columns`` attribute is replaced.
    title: indexable holding (at least) three top-level titles.

    Returns None.
    """
    breakdown = ('Male', 'Female', 'Transgender', 'Total')
    labels = [("Sl. No.", ''), ("State/UT", '')]
    for group in (title[0], title[1], title[2]):
        labels.extend((group, part) for part in breakdown)
    df.columns = pd.MultiIndex.from_tuples(labels)

In [16]:
# Label every page table with its three title groups, then move the two identifying
# columns ("Sl. No.", "State/UT") into the row index.
# NOTE: set_index(..., inplace=True) modifies each tables[i].df, so this cell is meant
# to run once per freshly parsed `tables`. Presumably a second run fails inside
# set_Column because the frame no longer has the 14 columns it assigns; verify before re-running.
for i in range(len(tables)):
    set_Column(tables[i].df, column_header[i])
    tables[i].df.set_index(["Sl. No.","State/UT"],inplace=True)

In [17]:
def set_Column_last(df):
    """Give ``df`` the two-level column header of the final totals table, in place.

    The first two columns become ("Sl. No.", '') and ("State/UT", ''); the other
    four become Male / Female / Transgender / Total under the 'Total' group.

    df: DataFrame whose ``columns`` attribute is replaced.

    Returns None.
    """
    identifying = [("Sl. No.", ''), ("State/UT", '')]
    totals = [('Total', part) for part in ('Male', 'Female', 'Transgender', 'Total')]
    df.columns = pd.MultiIndex.from_tuples(identifying + totals)

In [19]:
# Label the page-9 totals table and index it by ("Sl. No.", "State/UT") like the other tables.
# NOTE(review): table9 is not passed through clean() the way `tables` is — verify that
# its rows line up with the cleaned page tables before they are concatenated.
set_Column_last(table9[0].df)
table9[0].df.set_index(["Sl. No.","State/UT"],inplace=True)

In [None]:
# In case we're interested in seeing the scraped tables at this point.
# Commented as it takes too much screen space
# Run if you want
# printnAllTables(tables)

# OR print a single one according to your choice
# tables[23].df

In [20]:
# Build a list of independent DataFrame copies: one per page table,
# followed by the page-9 totals table.
DataFrames = [parsed.df.copy() for parsed in tables]
DataFrames += [table9[0].df.copy()]

In [22]:
# Concatenate all the tables side by side (axis=1), aligning rows on the
# ("Sl. No.", "State/UT") index, which the tables are expected to share.
final = pd.concat(DataFrames,axis=1)

In [23]:
# Reset the index before saving so "Sl. No." and "State/UT" are written as ordinary
# columns, for better formatting in the raw CSV; index=False then omits the new integer index.
final.reset_index().to_csv("Profession - wise Distribution of Suicides during 2019 (State & UT-wise).csv",index=False)