In [1]:
import wget
import ssl
import os
# NOTE(review): this swaps the default HTTPS context factory for the unverified one,
# i.e. TLS certificate verification is turned off for code relying on that default.
# Presumably a workaround for certificate errors from the download host - confirm it
# is still needed, and avoid reusing this kernel for other HTTPS traffic.
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
# Enter the URL of the individual NCRB PDF report to download
pdf_url = 'https://ncrb.gov.in/sites/default/files/adsi_reports_previous_year/Table-2.6_2019_0.pdf'
# Use the last path segment of the URL as the local file name. This replaces a
# hard-coded character offset that silently produced a wrong name as soon as the
# URL prefix changed.
fileName = pdf_url.rsplit('/', 1)[-1]
try:
    # Delete any earlier copy so the report is downloaded afresh
    os.remove(fileName)
except FileNotFoundError:
    # No earlier copy on disk, so there is nothing to delete
    pass
# Download the report; keep the name returned by wget for the later cells
fileName = wget.download(pdf_url, fileName)

In [3]:
import numpy as np
import pandas as pd
import camelot

In [4]:
# Display the file name returned by the download step
fileName

'Table-2.6_2019_0.pdf'

In [5]:
# Parse every page of the downloaded PDF with Camelot's text-based ("stream") parser.
tables = camelot.read_pdf(
    fileName,
    pages="all",
    flavor='stream',
    # Bounding box of the table in PDF coordinates (one comma-separated string per area).
    table_areas=["58.82714054927302,690.5591350927103,578.8788368336026,135.89712897573884"],
    # x-coordinates of the column separators (one comma-separated string per area).
    columns=["86.01615508885298,272.8788368336026,324.2907915993538,380.64620355411955,437.0016155088853,503.2439418416801"],
    # Row grouping tolerance and text clean-up options passed through to Camelot.
    row_tol=13,
    split_text=True,
    strip_text='\n',
)

In [6]:
# Display the TableList summary to see how many tables were extracted
tables

<TableList n=1>

In [7]:
def printnAllTables(tables):
    """Print the DataFrame of every extracted table, one after another.

    Each table is preceded by a "Page No:" line carrying its 1-based position
    in ``tables`` and followed by a blank line.

    Args:
        tables: Sequence of objects exposing a ``df`` attribute.
    """
    for position, table in enumerate(tables, start=1):
        print("Page No: ", position)
        print(table.df)
        print()

In [8]:
# In case we're interested in seeing the original scraped tables.
# NOTE: the output takes a lot of screen space;
# comment out the call below if you do not want it.
printnAllTables(tables)

Page No:  1
     0                                                  1      2      3   4  \
0    1                                         House wife      0  21359   0   
1    2             Professionals/Salaried Persons (Total)  11222   1503   0   
2                         2.1 Government Servants (Total)   1428    256   0   
3                         2.1.1 Central/UT Govt. Servants    254     53   0   
4                             2.1.2 State  Govt. Servants    835    133   0   
5                         2.1.3 Other Statutory Body/etc.    339     70   0   
6                          2.2 Private Sector Enterprises   7799    931   0   
7                           2.3 Public Sector Undertaking   1995    316   0   
8    3                                           Students   5562   4772   1   
9    4                                 Unemployed Persons  11599   2416   4   
10   5                      Self-employed Persons (Total)  14319   1777   2   
11                                   5.1

In [9]:
def clean(table):
    """Trim ``table.df`` to the rows between the header and footer markers.

    Rows before the first row whose second column equals 'STATES' are dropped,
    as are rows after the last row whose second column equals
    'TOTAL (ALL INDIA)'. Both marker rows are kept.

    Args:
        table: Object with a ``df`` DataFrame attribute; ``df`` is replaced by
            the trimmed frame.

    Returns:
        The same ``table`` object, for convenience (the change is already
        visible through the caller's reference).

    Raises:
        IndexError: If either marker is missing. ``table.df`` is left untouched
            in that case instead of being sliced down to an empty frame.
    """
    header_marker = 'STATES'
    footer_marker = 'TOTAL (ALL INDIA)'

    labels = table.df.iloc[:, 1].tolist()
    if header_marker not in labels:
        raise IndexError("clean(): no row with %r in the second column" % header_marker)
    start = labels.index(header_marker)

    # Only look for the footer from the header row onwards.
    remainder = labels[start:]
    if footer_marker not in remainder:
        raise IndexError(
            "clean(): no row with %r after the %r row" % (footer_marker, header_marker)
        )
    # Distance of the last footer occurrence from the end gives the exclusive stop.
    stop = len(labels) - remainder[::-1].index(footer_marker)

    table.df = table.df.iloc[start:stop]
    return table

In [10]:
def mergeRows(df, strIndex, stpIndex):
    """Collapse the rows labelled ``strIndex``..``stpIndex`` into row ``strIndex``.

    For every column, the cells of the selected rows are converted to ``str``,
    joined with a single space and stripped of leading/trailing whitespace.
    The result is written into row ``strIndex`` of ``df`` itself; the other
    rows of the range are left in place for the caller to drop.

    Args:
        df: DataFrame to modify in place.
        strIndex: Index label of the first row of the range (receives the result).
        stpIndex: Index label of the last row of the range (inclusive).

    Raises:
        KeyError: If ``strIndex`` is not a row label of ``df`` (writing would
            otherwise append a new row).
    """
    block = df.loc[strIndex:stpIndex]
    if block.empty:
        # Nothing selected (e.g. reversed bounds): leave the frame untouched.
        return
    if strIndex not in df.index:
        raise KeyError(strIndex)

    # Build the merged row column by column, addressed by position so that
    # the column labels do not matter.
    merged = [
        ' '.join(str(cell) for cell in block.iloc[:, col_pos]).strip()
        for col_pos in range(block.shape[1])
    ]
    # Assign through the original frame, not through the slice, so the change
    # is guaranteed to reach the caller's DataFrame.
    df.loc[strIndex] = merged

In [11]:
# Fold the rows labelled 18-19 into row 18 and the rows labelled 20-22 into row 20
# (presumably entries that were split over several text lines - confirm against the PDF).
# The labels refer to the frame as extracted, so this cell must run exactly once,
# before the index is renumbered.
mergeRows(tables[0].df,18,19)
mergeRows(tables[0].df,20,22)
# Drop the continuation rows whose text has been folded into rows 18 and 20
tables[0].df.drop(index=19, inplace=True)
tables[0].df.drop(index=[21,22], inplace=True)

In [12]:
# Renumber the rows 0..n-1 now that some of them have been dropped
tables[0].df.reset_index(drop=True, inplace=True)
tables[0].df

Unnamed: 0,0,1,2,3,4,5,6
0,1.0,House wife,0,21359,0,21359,15.4
1,2.0,Professionals/Salaried Persons (Total),11222,1503,0,12725,9.1
2,,2.1 Government Servants (Total),1428,256,0,1684,1.2
3,,2.1.1 Central/UT Govt. Servants,254,53,0,307,0.2
4,,2.1.2 State Govt. Servants,835,133,0,968,0.7
5,,2.1.3 Other Statutory Body/etc.,339,70,0,409,0.3
6,,2.2 Private Sector Enterprises,7799,931,0,8730,6.3
7,,2.3 Public Sector Undertaking,1995,316,0,2311,1.7
8,3.0,Students,5562,4772,1,10335,7.4
9,4.0,Unemployed Persons,11599,2416,4,14019,10.1


In [13]:
def split_by_first_space(df, strIndex, stpIndex):
    """Split the second column at its first space for a range of rows.

    For each row position from ``strIndex`` to ``stpIndex`` (inclusive), the
    text of the second column before the first space is stored in the first
    column, and the remainder - starting at that space - stays in the second
    column. Rows whose second column contains no space are left unchanged.

    Args:
        df: DataFrame to modify in place; its first two columns are used.
        strIndex: Position of the first row to process.
        stpIndex: Position of the last row to process (inclusive).
    """
    # Resolve the bounds the same way a positional slice would (clamped to the frame).
    first_pos, end_pos, _ = slice(strIndex, stpIndex + 1).indices(len(df))
    for pos in range(first_pos, end_pos):
        text = df.iat[pos, 1]
        head, sep, rest = text.partition(' ')
        if not sep:
            # No space to split on: keep the row as it is.
            continue
        # Write through the frame itself; assigning to the rows yielded by
        # iterrows() is not guaranteed to update the DataFrame.
        df.iat[pos, 0] = head
        df.iat[pos, 1] = sep + rest

In [14]:
# Row ranges (by position, inclusive) whose serial number is still part of the label text
for first_row, last_row in ((2, 7), (11, 15), (17, 20)):
    split_by_first_space(tables[0].df, first_row, last_row)

In [15]:
# In case we're interested in seeing the scraped tables at this point.
# NOTE: the output takes a lot of screen space;
# comment out the call below if you do not want it.
printnAllTables(tables)

Page No:  1
        0                                                  1      2      3  \
0       1                                         House wife      0  21359   
1       2             Professionals/Salaried Persons (Total)  11222   1503   
2     2.1                        Government Servants (Total)   1428    256   
3   2.1.1                          Central/UT Govt. Servants    254     53   
4   2.1.2                              State  Govt. Servants    835    133   
5   2.1.3                          Other Statutory Body/etc.    339     70   
6     2.2                         Private Sector Enterprises   7799    931   
7     2.3                          Public Sector Undertaking   1995    316   
8       3                                           Students   5562   4772   
9       4                                 Unemployed Persons  11599   2416   
10      5                      Self-employed Persons (Total)  14319   1777   
11    5.1                                   Business

In [16]:
def set_Column(df):
    """Label the seven columns of a scraped table in place.

    Args:
        df: DataFrame with exactly seven columns; its ``columns`` attribute is
            overwritten with the report's header names.
    """
    headers = (
        "Sl. No.",
        "Profession",
        "Male",
        "Female",
        "Transgender",
        "Total",
        "Percentage Share",
    )
    df.columns = list(headers)

In [17]:
# Give every extracted table the same column headers
for table in tables:
    set_Column(table.df)

In [18]:
# In case we're interested in seeing the scraped tables at this point.
# Commented out as it takes too much screen space.
# Uncomment and run if you want.
# printnAllTables(tables)

# Or display a single one of your choice
tables[0].df

Unnamed: 0,Sl. No.,Profession,Male,Female,Transgender,Total,Percentage Share
0,1,House wife,0,21359,0,21359,15.4
1,2,Professionals/Salaried Persons (Total),11222,1503,0,12725,9.1
2,2.1,Government Servants (Total),1428,256,0,1684,1.2
3,2.1.1,Central/UT Govt. Servants,254,53,0,307,0.2
4,2.1.2,State Govt. Servants,835,133,0,968,0.7
5,2.1.3,Other Statutory Body/etc.,339,70,0,409,0.3
6,2.2,Private Sector Enterprises,7799,931,0,8730,6.3
7,2.3,Public Sector Undertaking,1995,316,0,2311,1.7
8,3,Students,5562,4772,1,10335,7.4
9,4,Unemployed Persons,11599,2416,4,14019,10.1


In [19]:
# Save the table as CSV; index=False leaves the row index out of the file
# for cleaner formatting of the raw CSV.
tables[0].df.to_csv("Profession - wise Distribution of Suicides during 2019 (All India).csv",index=False)