In [1]:
import wget
import ssl
import os
# WARNING: this swaps the ssl module's default HTTPS context factory for the
# unverified one, so server certificates are no longer validated for any code
# in this kernel that relies on that default.
# NOTE(review): presumably a workaround for certificate errors when downloading
# from ncrb.gov.in -- confirm it is still needed, and avoid reusing this kernel
# for requests to other hosts.
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
# Enter NCRB Individual PDF url
pdf_url = 'https://ncrb.gov.in/sites/default/files/adsi_reports_previous_year/Table-2.3_2019_0.pdf'
# Use the last path segment of the URL as the local file name. Deriving it
# from the URL (instead of a fixed character offset) keeps this cell correct
# when a report URL with a different prefix length is pasted in.
fileName = pdf_url.rsplit('/', 1)[-1]
try:
    # Delete any previous copy so the report is downloaded afresh
    os.remove(fileName)
except FileNotFoundError:
    # Nothing to delete on the first run
    pass
# Download report; wget.download returns the name of the file it wrote
fileName = wget.download(pdf_url, fileName)

In [3]:
import numpy as np
import pandas as pd
import camelot

In [4]:
# Show the file name returned by wget.download in the download cell
fileName

'Table-2.3_2019_0.pdf'

In [5]:
# Page region to parse, passed to camelot as one "x1,y1,x2,y2" string.
# NOTE(review): per the camelot docs these are PDF-space coordinates of the
# top-left and bottom-right corners -- confirm against the source PDF.
table_region = "56.273408723747984,744.9071924115915,540.1285298869144,79.23344700176459"
# X coordinates passed as column separators for that region
# (four separators, matching the five columns seen in the extracted tables).
column_separators = "76.9550888529887,227.01751211631665,296.2770920840065,396.3187075928918"

# Extract the table from every page of the downloaded PDF with the stream parser.
tables = camelot.read_pdf(
    fileName,
    pages="all",
    flavor='stream',
    table_areas=[table_region],
    columns=[column_separators],
    split_text=True,
)

In [6]:
# Show the result of camelot.read_pdf (its repr reports how many tables were found)
tables

<TableList n=2>

In [7]:
def printnAllTables(tables):
    """Print the DataFrame of every table, each followed by a blank line.

    Each table is preceded by a "Page No" label, which is the table's 1-based
    position in ``tables``.
    NOTE(review): that equals the PDF page number only if exactly one table
    was extracted per page -- confirm.

    Args:
        tables: Sequence of objects exposing a ``df`` attribute.
    """
    for position, table in enumerate(tables, start=1):
        print("Page No: ", position)
        print(table.df)
        print()

In [8]:
# In case we're interested in seeing the original scraped tables.
# Commented as it takes too much screen space
# Run if you want
# printnAllTables(tables)

In [9]:
# Removes Garbage Rows that may have been detected Before the 'STATES' row
# AND
# Removes Garbage Rows that may have been detected after the 'TOTAL (ALL INDIA)' row
def clean(table):
    """Trim ``table.df`` to the rows from the header row to the total row.

    The header row is the first row whose second column is 'STATES' or
    'CITIES'. The total row is the last row at or after the header whose
    second column is 'TOTAL (ALL INDIA)' or 'TOTAL (CITIES)'. Rows outside
    that range are dropped. Original index labels are kept.

    Both bounds are located before anything is modified, so ``table.df`` is
    left untouched if either marker is missing.

    Args:
        table: Object with a ``df`` attribute holding a DataFrame with at
            least two columns (e.g. a camelot table).

    Returns:
        The same ``table`` object, with ``table.df`` replaced by the trimmed
        frame. The return value is only a convenience; the change is made on
        the object that was passed in.

    Raises:
        IndexError: If no header row, or no total row after it, is found.
    """
    header_markers = ('STATES', 'CITIES')
    total_markers = ('TOTAL (ALL INDIA)', 'TOTAL (CITIES)')

    # Second column holds the State/City names and the marker labels.
    labels = table.df.iloc[:, 1].tolist()

    first = next(
        (pos for pos, label in enumerate(labels) if label in header_markers),
        None,
    )
    if first is None:
        raise IndexError(
            "clean(): no 'STATES' or 'CITIES' header row found in column 1"
        )

    # Scan backwards so trailing garbage after the total row is discarded.
    last = next(
        (pos for pos in range(len(labels) - 1, first, -1)
         if labels[pos] in total_markers),
        None,
    )
    if last is None:
        raise IndexError(
            "clean(): no 'TOTAL (ALL INDIA)' or 'TOTAL (CITIES)' row found "
            "after the header row in column 1"
        )

    # Single slice instead of dropping one row per iteration.
    table.df = table.df.iloc[first:last + 1]
    return table

In [10]:
# Trim every extracted table in place; clean() reassigns each table's .df,
# so `tables` itself holds the cleaned frames afterwards.
for each_table in tables:
    clean(each_table)

In [11]:
# In case we're interested in seeing the scraped tables at this point.
# NOTE: this call is active (not commented out), so running the cell prints
# every cleaned table and takes a lot of screen space.
# Comment it out if you don't need the output.
printnAllTables(tables)

Page No:  1
     0                  1       2       3       4
4                  STATES                        
5    1     ANDHRA PRADESH    5319    6465    21.5
6    2  ARUNACHAL PRADESH     132     112   -15.2
7    3              ASSAM    2379    2370    -0.4
8    4              BIHAR     443     641    44.7
9    5       CHHATTISGARH    7046    7629     8.3
10   6                GOA     256     259     1.2
11   7            GUJARAT    7793    7655    -1.8
12   8            HARYANA    3547    4191    18.2
13   9   HIMACHAL PRADESH     740     584   -21.1
14  10    JAMMU & KASHMIR     330     284   -13.9
15  11          JHARKHAND    1317    1646    25.0
16  12          KARNATAKA   11561   11288    -2.4
17  13             KERALA    8237    8556     3.9
18  14     MADHYA PRADESH   11775   12457     5.8
19  15        MAHARASHTRA   17972   18916     5.3
20  16            MANIPUR      52      58    11.5
21  17          MEGHALAYA     189     198     4.8
22  18            MIZORAM      79     

In [12]:
def set_Column(df, city):
    """Replace ``df``'s columns, in place, with the report's two-level header.

    Args:
        df: DataFrame with exactly five columns; it is modified in place.
        city: If truthy the second column is labelled "City", otherwise
            "State/UT".

    Returns:
        None.
    """
    region_label = "City" if city else "State/UT"
    top_level = [
        "Sl. No.",
        region_label,
        'Number of Suicides',
        'Number of Suicides',
        'Percentage Variation in 2019 over 2018',
    ]
    # Only the two "Number of Suicides" columns carry a year sub-header.
    sub_level = ['', '', '2018', '2019', '']
    df.columns = pd.MultiIndex.from_tuples(list(zip(top_level, sub_level)))

In [13]:
# Apply the two-level column header to every extracted table, in place.
# Only the table at index 1 gets the "City" label; every other index gets
# "State/UT".
# NOTE(review): this relies on the city-wise table being the second one
# camelot returns -- confirm if the source PDF changes.
for i in range(len(tables)):
    city = i==1
    set_Column(df=tables[i].df, city=city)
    # Disabled: would move the two label columns into the row index.
    # tables[i].df.set_index(["Sl. No.","State/UT" if (not city) else "City"],inplace=True)

In [14]:
# In case we're interested in seeing the scraped tables at this point.
# Commented as it takes too much screen space
# Run if you want
# printnAllTables(tables)

# OR print a single one of your choice
# (this PDF yielded 2 tables, so the valid indexes are 0 and 1)
# tables[0].df

In [15]:
# Convert the list of tables to a list of independent DataFrame copies
DataFrames = [each.df.copy() for each in tables]
# Concatenate all the tables one after the other (original row labels are kept).
# NOTE(review): the state-wise and city-wise frames label their second column
# differently ("State/UT" vs "City"), so the columns do not line up exactly,
# and `final` is not written out by the export cell below -- confirm whether
# this combined frame is still needed.
final = pd.concat(DataFrames)

In [16]:
# Save each table to its own CSV file. index=False leaves the row index out of
# the file (the index is not reset on the DataFrames themselves).
# NOTE(review): both file names contain a space before ".csv" -- confirm this
# is intended before anything downstream depends on the exact names.
tables[0].df.to_csv("Incidence of Suicides during 2018 and 2019 and Their Percentage Variation (StateUT-wise) .csv",index=False)
tables[1].df.to_csv("Incidence of Suicides during 2018 and 2019 and Their Percentage Variation (City-wise) .csv",index=False)