In [1]:
import wget
import ssl
import os
# SECURITY NOTE: this swaps Python's default HTTPS context factory for the
# unverified one, so HTTPS requests that rely on the default context in this
# kernel skip TLS certificate validation.
# Presumably a workaround so the NCRB download in the next cell succeeds --
# TODO(review): confirm it is still required, and keep it out of any code that
# talks to other hosts.
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
# Enter NCRB Individual PDF url
pdf_url = 'https://ncrb.gov.in/sites/default/files/adsi_reports_previous_year/Table-2.11_2019_0.pdf'
# Derive the local file name from the last path segment of the URL rather than
# from a hard-coded character offset, so the cell keeps working when the URL
# (host, folder or report name) is changed.
fileName = pdf_url.rsplit('/', 1)[-1]
try:
    # Delete any previous copy so the report is downloaded afresh.
    os.remove(fileName)
except FileNotFoundError:
    # Nothing to delete on the first run.
    pass
# Download the report; the name returned by wget.download is kept in fileName
# and used by the camelot cells further down.
fileName = wget.download(pdf_url, fileName)

In [3]:
import numpy as np
import pandas as pd
import camelot

In [4]:
# Display the local file name of the downloaded PDF (the value returned by
# wget.download in the download cell).
fileName

'Table-2.11_2019_0.pdf'

In [5]:
# Extract the main table from pages 1-3 of the PDF with camelot's 'stream' flavor.
# - table_areas: a single "x1,y1,x2,y2" region string limiting where the table
#   is looked for; the same region is used for every page in the range.
# - columns: 13 x-coordinates used as column separators, i.e. 14 columns, which
#   matches the 14 headers that set_Column assigns later in the notebook.
# - split_text=True: asks camelot to split text that spans several cells.
# NOTE(review): the coordinates appear to be measured for this specific PDF
# layout -- re-measure them if the source report changes.
tables = camelot.read_pdf(fileName,
                          flavor='stream',
                          table_areas=[
                              "25.972342487883687,738.1734984551337,573.3154119547659,71.53779676581283"
                              ],
                          columns=[
                              "48.096930533117934,152.94823909531505,191.4257835218094,226.0555735056543,260.6853634894992,294.3532148626818,326.0971890145396,364.57473344103397,392.4709531502424,424.2149273021002,459.8066558966075,497.32226171243946,532.9139903069467"
                              ],
                          split_text=True,
                          pages="1-3"
                         )

In [6]:
# Extract the table on page 4 separately because its layout differs from
# pages 1-3: it is read with its own region and only 5 column separators,
# i.e. 6 columns (serial number, state, and four count columns).
# - table_areas: a single "x1,y1,x2,y2" region string limiting where the table
#   is looked for.
# - columns: x-coordinates used as column separators.
# NOTE(review): the coordinates appear to be measured for this specific PDF
# layout -- re-measure them if the source report changes.
table4 = camelot.read_pdf(fileName,
                          flavor='stream',
                          table_areas=[
                              "25.972342487883687,732.0169841376875,533.8759289176091,78.84867036128217"
                              ],
                          columns=[
                              "61.56407108239096,217.39812600969307,280.88607431340876,355.91728594507276,431.91043618739906"
                              ],
                          split_text=True,
                          pages="4"
                         )

In [7]:
def clean(table, start_marker='STATES', end_marker='TOTAL (ALL INDIA)'):
    """Trim garbage rows around the data region of a scraped table, in place.

    Keeps the rows from the first one whose second column equals
    ``start_marker`` through the last one (at or after it) whose second
    column equals ``end_marker``, both inclusive, and assigns the result
    back to ``table.df``. The original row labels are preserved.

    Parameters
    ----------
    table : object with a ``df`` attribute holding a pandas DataFrame
        For example one element of the list returned by ``camelot.read_pdf``.
    start_marker : str, default 'STATES'
        Value in the second column that marks the first row to keep.
    end_marker : str, default 'TOTAL (ALL INDIA)'
        Value in the second column that marks the last row to keep.

    Returns
    -------
    The same ``table`` object. The change is already visible through the
    caller's reference; the return value is only a convenience.

    Raises
    ------
    ValueError
        If the frame has fewer than two columns or a marker row cannot be
        found. ``table.df`` is left untouched in that case, instead of being
        trimmed down to nothing before failing.
    """
    frame = table.df
    if frame.shape[1] < 2:
        raise ValueError("table needs at least two columns to locate the marker rows")

    # Locate both boundaries up front so a missing marker never leaves the
    # table half-trimmed.
    labels = frame.iloc[:, 1].tolist()
    try:
        first = labels.index(start_marker)
    except ValueError:
        raise ValueError(
            f"start marker {start_marker!r} not found in the second column"
        ) from None

    # Scan backwards from the bottom, but never above the start row.
    last = next(
        (pos for pos in range(len(labels) - 1, first - 1, -1)
         if labels[pos] == end_marker),
        None,
    )
    if last is None:
        raise ValueError(
            f"end marker {end_marker!r} not found at or after the "
            f"{start_marker!r} row in the second column"
        )

    table.df = frame.iloc[first:last + 1]
    return table

In [8]:
# Trim the page-4 table down to the rows between 'STATES' and
# 'TOTAL (ALL INDIA)' (clean() updates table4[0].df in place), then display
# the cleaned frame.
clean(table4[0])
table4[0].df

Unnamed: 0,0,1,2,3,4,5
2,,STATES,,,,
3,1.0,ANDHRA PRADESH,4740.0,1725.0,0.0,6465.0
4,2.0,ARUNACHAL PRADESH,81.0,31.0,0.0,112.0
5,3.0,ASSAM,1679.0,691.0,0.0,2370.0
6,4.0,BIHAR,395.0,246.0,0.0,641.0
7,5.0,CHHATTISGARH,5520.0,2109.0,0.0,7629.0
8,6.0,GOA,201.0,58.0,0.0,259.0
9,7.0,GUJARAT,5168.0,2486.0,1.0,7655.0
10,8.0,HARYANA,3297.0,894.0,0.0,4191.0
11,9.0,HIMACHAL PRADESH,377.0,207.0,0.0,584.0


In [9]:
# Display the TableList summary for pages 1-3 to check how many tables were
# extracted (one per page is what the later cells rely on).
tables

<TableList n=3>

In [10]:
def printnAllTables(tables):
    """Print each table's DataFrame under a 1-based "Page No" heading.

    ``tables`` is a sequence of objects exposing a ``df`` attribute; a blank
    line is printed after every table.
    """
    for page_no, table in enumerate(tables, start=1):
        print("Page No: ", page_no)
        print(table.df)
        print()

In [11]:
# In case we're interested in seeing the original scraped tables.
# NOTE: the output takes up a lot of screen space; comment out the call below
# if you do not need it.
printnAllTables(tables)

Page No:  1
     0                              1         2      3      4      5      6   \
0                                       Male \nF  emale         Total   Male   
1                                                     g  ender                 
2   (1)                            (2)       (3)    (4)    (5)    (6)    (7)   
3                               STATES                                         
4     1                 ANDHRA PRADESH      1199    501      0   1700    916   
5     2              ARUNACHAL PRADESH         3      0      0      3      7   
6     3                          ASSAM       338    161      0    499    407   
7     4                          BIHAR        49     45      0     94     27   
8     5                   CHHATTISGARH       538    300      0    838   1005   
9     6                            GOA        14      9      0     23     14   
10    7                        GUJARAT       519    351      0    870    865   
11    8                     

In [12]:
# Trim the garbage rows before 'STATES' and after 'TOTAL (ALL INDIA)' from
# each of the page 1-3 tables; clean() updates each table's .df in place.
for each_table in tables:
    clean(each_table)

In [13]:
# In case we're interested in seeing the scraped tables at this point
# (after the garbage rows have been trimmed).
# NOTE: the output takes up a lot of screen space; comment out the call below
# if you do not need it.
printnAllTables(tables)

Page No:  1
    0                  1      2     3  4      5      6     7  8      9   \
3                  STATES                                                 
4    1     ANDHRA PRADESH   1199   501  0   1700    916   321  0   1237   
5    2  ARUNACHAL PRADESH      3     0  0      3      7     2  0      9   
6    3              ASSAM    338   161  0    499    407   194  0    601   
7    4              BIHAR     49    45  0     94     27    29  0     56   
8    5       CHHATTISGARH    538   300  0    838   1005   382  0   1387   
9    6                GOA     14     9  0     23     14     8  0     22   
10   7            GUJARAT    519   351  0    870    865   523  1   1389   
11   8            HARYANA    453   161  0    614    454   125  0    579   
12   9   HIMACHAL PRADESH     33    15  0     48     42    26  0     68   
13  10    JAMMU & KASHMIR     33    35  0     68     14    14  0     28   
14  11          JHARKHAND     98    73  0    171     95    44  0    139   
15  12       

In [14]:
# Top-level column titles, grouped per extracted table: column_header[i] holds
# the three education-level titles for tables[i]. The columns are nested
# (multi-level): each title spans four sub-columns
# (Male / Female / Transgender / Total). The multi-level structure itself is
# generated and applied by set_Column in the cells that follow.
# NOTE(review): 'Status Not known ' carries a trailing space that ends up in
# the exported CSV header -- confirm whether that is intended.
column_header = [
    ['No Education','Primary (up to class–5th)','Middle (up to class–8th)'],
    ['Matriculate/ Secondary(up to class–10th)','Hr. Secondary/ Intermediate/Pre-University (up to class–12th)','Diploma/Certificate/ ITI'],
    ['Graduate and above','Professionals (MBA etc.)','Status Not known ']
]
# Sanity check: one group of titles per page-1-3 table is expected.
len(column_header)

3

In [15]:
def set_Column(df, title):
    """Replace ``df``'s columns, in place, with a two-level header.

    The first two columns become ("Sl. No.", '') and ("State/UT", '').
    The remaining twelve are ``title[0]``, ``title[1]`` and ``title[2]``,
    each split into Male / Female / Transgender / Total, in that order.
    ``df`` must therefore have exactly 14 columns. Returns None.
    """
    breakdown = ('Male', 'Female', 'Transgender', 'Total')
    header = [("Sl. No.", ''), ("State/UT", '')]
    # Exactly three title groups are consumed, each expanded into its four
    # sub-columns.
    header.extend(
        (title[group], part)
        for group in range(3)
        for part in breakdown
    )
    df.columns = pd.MultiIndex.from_tuples(header)

In [16]:
# Apply the matching group of two-level headers to each page 1-3 table, then
# move the serial number and state name into the row index.
for page_idx, page_table in enumerate(tables):
    set_Column(page_table.df, column_header[page_idx])
    page_table.df.set_index(["Sl. No.", "State/UT"], inplace=True)

In [17]:
# Give the page-4 table its two-level header: serial number, state name, and
# the four 'Total' sub-columns.
table4[0].df.columns = pd.MultiIndex.from_tuples(
    [("Sl. No.", ''), ("State/UT", '')]
    + [('Total', part) for part in ('Male', 'Female', 'Transgender', 'Total')]
)
# Use the first two columns (serial number and state name) as the row index.
table4[0].df.set_index(["Sl. No.", "State/UT"], inplace=True)

In [18]:
# In case we're interested in seeing the scraped tables at this point.
# Commented out because the output takes up too much screen space.
# Uncomment and run if you want to inspect them.
# printnAllTables(tables)

# OR print a single one of your choice (valid indices are 0-2, one per page)
# tables[0].df

In [19]:
# Collect an independent copy of every prepared frame: the page 1-3 tables
# first, followed by the page-4 table.
DataFrames = [each.df.copy() for each in (*tables, table4[0])]

In [20]:
# DataFrames

In [21]:
# Join all the frames side by side. With axis=1 pandas aligns the rows on the
# (Sl. No., State/UT) index, so this relies on every page having identical
# index values.
# NOTE(review): a serial number or state name scraped differently on one page
# would presumably surface as extra rows padded with NaN rather than as an
# error -- consider asserting that the indexes match before concatenating.
final = pd.concat(DataFrames,axis=1)

In [22]:
# Turn the (Sl. No., State/UT) index back into ordinary columns so they are
# written as the first two CSV columns; index=False then omits the fresh
# integer index created by reset_index(). The file is written to the current
# working directory.
final.reset_index().to_csv("Educational Status - wise Distribution of Suicides during 2019 (State & UT-wise).csv",index=False)