In [1]:
import wget
import ssl
import os
# SECURITY: replaces the ssl module's default HTTPS context factory with the
# unverified one, so HTTPS requests that use the default context in this
# kernel skip certificate verification.
# NOTE(review): presumably a workaround for a certificate problem with the
# download host - confirm it is still needed, and prefer fixing CA trust.
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
# Enter NCRB Individual PDF url
pdf_url = 'https://ncrb.gov.in/sites/default/files/adsi_reports_previous_year/Table-2.9_2019_0.pdf'

# Use the last path segment of the URL as the local file name. This replaces a
# hard-coded character offset that was only valid for this exact URL, so
# pointing pdf_url at another report keeps working.
fileName = pdf_url.rsplit('/', 1)[-1]

# Delete any earlier copy so the report is always downloaded afresh.
# NOTE(review): presumably wget would otherwise save under a different name
# when the target already exists - verify against the wget package.
try:
    os.remove(fileName)
except FileNotFoundError:
    # Nothing to delete on the first run.
    pass

# Download the report; wget returns the name of the file it wrote.
fileName = wget.download(pdf_url, fileName)

In [3]:
import numpy as np
import pandas as pd
import camelot

In [4]:
# Display the name returned by wget.download for the downloaded report.
fileName

'Table-2.9_2019_0.pdf'

In [5]:
# Scrape pages 1-2 with Camelot's "stream" flavor.
# table_areas restricts parsing to one region and columns gives the x-positions
# of the 13 column separators (i.e. 14 columns); both are hand-picked
# coordinates for this particular PDF layout.
tables = camelot.read_pdf(
    fileName,
    pages="1-2",
    flavor='stream',
    split_text=True,
    table_areas=[
        "25.972342487883687,712.2006789087965,569.4676575121164,71.53779676581283",
    ],
    columns=[
        "48.096930533117934,149.1004846526656,185.65415185783525,221.2458804523425,253.9517932148627,288.5815831987076,319.36361873990313,356.8792245557351,386.6993214862682,420.36717285945076,455.95890145395805,492.5125686591277,530.990113085622",
    ],
)

In [6]:
# Scrape page 3 separately: its layout differs from pages 1-2, so it gets its
# own table area and its own 9 column separators (i.e. 10 columns).
table3 = camelot.read_pdf(
    fileName,
    pages="3",
    flavor='stream',
    split_text=True,
    table_areas=[
        "25.972342487883687,713.1306071639701,572.3534733441035,84.97312729783218",
    ],
    columns=[
        "49.0588691437803,145.25273021001618,183.73027463651053,222.20781906300488,260.6853634894992,299.16290791599357,366.49861066235866,429.98655896607437,482.8931825525041",
    ],
)

In [7]:
def printnAllTables(tables):
    """Print the DataFrame of every table, one after another.

    Each table is preceded by a "Page No:" line holding its 1-based position
    in `tables` and followed by a blank line.

    Parameters
    ----------
    tables : iterable of objects exposing a ``.df`` attribute
    """
    for position, table in enumerate(tables, start=1):
        print("Page No: ", position)
        print(table.df)
        print()

In [8]:
# In case we're interested in seeing the original scraped tables.
# Commented as it takes too much screen space
# Run if you want
# printnAllTables(tables)

In [9]:
def clean(table):
    """Trim `table.df` to the rows from 'STATES' through 'TOTAL (ALL INDIA)'.

    Garbage rows detected before the first row whose second column equals
    'STATES', and after the last row whose second column equals
    'TOTAL (ALL INDIA)', are dropped. Both marker rows are kept.

    Both bounds are located before anything is modified, so a table without
    the expected markers is left untouched and reported with a clear error,
    rather than being stripped row by row until indexing fails.

    Parameters
    ----------
    table : object with a settable ``df`` attribute holding a DataFrame
        ``table.df`` is replaced by the trimmed slice.

    Returns
    -------
    The same `table` object, for convenience; the change is already visible
    through the caller's reference.

    Raises
    ------
    ValueError
        If no 'STATES' row exists, or no 'TOTAL (ALL INDIA)' row exists at or
        after it.
    """
    second_col = table.df.iloc[:, 1]

    is_header = (second_col == 'STATES').to_numpy()
    if not is_header.any():
        raise ValueError("clean(): no 'STATES' row found in the second column")
    first = int(is_header.argmax())  # position of the first match

    # Only rows from the header onwards can close the table.
    is_total_tail = (second_col == 'TOTAL (ALL INDIA)').to_numpy()[first:]
    if not is_total_tail.any():
        raise ValueError(
            "clean(): no 'TOTAL (ALL INDIA)' row found after the 'STATES' row"
        )
    # argmax on the reversed tail gives the distance of the last match from the end.
    last = len(second_col) - 1 - int(is_total_tail[::-1].argmax())

    table.df = table.df.iloc[first:last + 1]
    return table

In [10]:
def split_by_first_space(df, strIndex, stpIndex):
    """Split the second column at its first space, in place, for a row range.

    For every row position in ``strIndex..stpIndex`` (inclusive, with the same
    slice semantics as ``df.iloc[strIndex:stpIndex+1]``), the text in the
    second column is split at its first space: the part before the space is
    written to the first column and the part after it to the second column.
    This repairs rows where the serial number and the name were scraped into
    one cell.

    The frame is updated through explicit ``.iat`` writes. Assigning to the
    rows yielded by ``iterrows()`` is not guaranteed to reach the frame.

    Cells that are not strings, or that contain no space, are left unchanged.
    The separating space itself is not kept in either column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to repair; modified in place. Columns are addressed by position
        (0 and 1).
    strIndex : int
        Position of the first row to process.
    stpIndex : int
        Position of the last row to process (inclusive).

    Returns
    -------
    None
    """
    # Slicing a range reproduces iloc's handling of out-of-range/negative bounds.
    for pos in range(len(df))[strIndex:stpIndex + 1]:
        text = df.iat[pos, 1]
        if not isinstance(text, str):
            continue
        head, sep, tail = text.partition(' ')
        if not sep:
            # No space: nothing to split, keep the row as scraped.
            continue
        df.iat[pos, 0] = head
        df.iat[pos, 1] = tail

In [11]:
# Trim each table scraped from pages 1-2 to its 'STATES' .. 'TOTAL (ALL INDIA)'
# rows; clean() updates the table's .df attribute directly.
# NOTE(review): table3 (page 3) is never passed through clean() in this
# notebook - confirm its table area already excludes stray rows.
for scraped_table in tables:
    clean(scraped_table)

In [12]:
# In case we're interested in seeing the scraped tables at this point.
# Commented as it takes too much screen space
# Run if you want
# printnAllTables(tables)

In [13]:
# Top-level column headings, one inner list per scraped table and in the same
# order (column_header[i] is applied to tables[i]).
# The columns are nested, i.e. they have a multi-level structure: every heading
# here spans Male/Female/Transgender/Total sub-columns. That structure is
# generated and applied by set_Column in the following cells.
column_header = [
    ['Un-Married','Married','Widowed/Widower'],
    ['Divorcee','Separated','Others']
]
# Number of heading groups defined above.
len(column_header)

2

In [14]:
def set_Column(df, title):
    """Give `df` a two-level column header built from three group headings.

    The first two columns become ("Sl. No.", '') and ("State/UT", ''). They
    are followed by Male/Female/Transgender/Total sub-columns under each of
    ``title[0]``, ``title[1]`` and ``title[2]``, 14 labels in total.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` attribute is overwritten in place.
    title : sequence
        Group headings; only the first three entries are used.

    Returns
    -------
    None
    """
    sub_levels = ('Male', 'Female', 'Transgender', 'Total')
    groups = (title[0], title[1], title[2])

    labels = [("Sl. No.", ''), ("State/UT", '')]
    labels += [(group, sub) for group in groups for sub in sub_levels]

    df.columns = pd.MultiIndex.from_tuples(labels)

def set_Column_last(df):
    """Give `df` the two-level column header used for the last table.

    The first two columns become ("Sl. No.", '') and ("State/UT", ''). They
    are followed by Male/Female/Transgender/Total sub-columns under
    'Status Not Known' and then under 'Total', 10 labels in total.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` attribute is overwritten in place.

    Returns
    -------
    None
    """
    sub_levels = ('Male', 'Female', 'Transgender', 'Total')

    labels = [("Sl. No.", ''), ("State/UT", '')]
    for group in ('Status Not Known', 'Total'):
        labels.extend((group, sub) for sub in sub_levels)

    df.columns = pd.MultiIndex.from_tuples(labels)

In [15]:
# Label every page-1/page-2 table with its own heading group, then move the
# serial-number and state columns into the row index.
# Both steps modify the frames in place, so this cell is meant to run once per
# fresh scrape.
for i, scraped_table in enumerate(tables):
    set_Column(scraped_table.df, column_header[i])
    scraped_table.df.set_index(["Sl. No.", "State/UT"], inplace=True)

In [16]:
# Apply the header used for the last table to the page-3 frame and index it by
# (Sl. No., State/UT), like the other tables. Both steps work in place on the
# frame held by table3[0].
page3_df = table3[0].df
set_Column_last(page3_df)
page3_df.set_index(keys=["Sl. No.", "State/UT"], inplace=True)

In [18]:
# Collect independent copies of all scraped frames: the page-1/page-2 tables
# first, followed by the page-3 table.
DataFrames = [page_table.df.copy() for page_table in tables] + [table3[0].df.copy()]

In [17]:
# In case we're interested in seeing the scraped tables at this point.
# Commented as it takes too much screen space
# Run if you want
# printnAllTables(tables)

# OR print a single one of your choice
# DataFrames

In [19]:
# Join the frames side by side (column-wise). This relies on every frame
# carrying the same (Sl. No., State/UT) row index.
final = pd.concat(objs=DataFrames, axis="columns")

In [20]:
# Turn the row index back into ordinary columns before saving, so the raw CSV
# has them as plain leading columns; the new integer index is not written.
final.reset_index().to_csv(
    path_or_buf="Social Status - wise Distribution of Suicides during 2019 (State & UT-wise).csv",
    index=False,
)