In [1]:
import wget
import ssl
import os
# SECURITY NOTE: this replaces ssl's default HTTPS context factory with the
# unverified one. The change is process-wide, so every later HTTPS request that
# relies on the default context skips certificate verification, not only the
# download in the next cell.
# Presumably a workaround for a certificate problem on the source site;
# TODO confirm it is still needed and remove it if not.
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
# Enter NCRB Individual PDF url
pdf_url = 'https://ncrb.gov.in/sites/default/files/adsi_reports_previous_year/Table-2.5_2019_0.pdf'
# Use the last path segment of the URL as the local file name. This replaces a
# hard-coded character offset that was only valid for this exact URL.
fileName = pdf_url.rsplit('/', 1)[-1]
try:
    # Delete any earlier copy so the report is downloaded afresh
    os.remove(fileName)
except FileNotFoundError:
    # Nothing to delete on the first run
    pass
# Download report; wget.download returns the name of the file it wrote
fileName = wget.download(pdf_url, fileName)

In [3]:
import numpy as np
import pandas as pd
import camelot

In [4]:
# Display the local file name returned by the download
fileName

'Table-2.5_2019_0.pdf'

In [5]:
# Extract the tables on pages 1-20 with camelot's "stream" flavor.
# `table_areas` and `columns` are hard-coded PDF coordinates: one table region
# and the x-positions of the column separators inside it. They are presumably
# measured for this specific report layout; re-check them if the PDF changes.
# `split_text=True` asks camelot to split text that spans several cells.
tables = camelot.read_pdf(fileName,
                          flavor='stream',
                          table_areas=[
                              "21.643618739903072,734.3256733371578,576.682197092084,57.108452573403284"
                              ],
                          columns=[
                              "49.0588691437803,152.94823909531505,184.69221324717287,221.2458804523425,253.9517932148627,285.69576736672053,319.36361873990313,354.95534733441036,390.54707592891765,423.25298869143785,454.0350242326333,492.5125686591277,530.990113085622"
                              ],
                          split_text=True,
                          pages="1-20"
                         )

In [6]:
# Display the extracted table list (its repr shows how many tables were found)
tables

<TableList n=20>

In [7]:
def printnAllTables(tables):
    """Print every table's DataFrame, each preceded by its 1-based page number.

    Parameters
    ----------
    tables : sequence
        Objects exposing a ``df`` attribute (e.g. the list returned by
        ``camelot.read_pdf``).

    A blank line is printed after each table. Nothing is returned.
    """
    for page_no, table in enumerate(tables, start=1):
        print("Page No: ", page_no)
        print(table.df)
        print()

In [8]:
# In case we're interested in seeing the original scraped tables.
# Commented out as it takes too much screen space.
# Uncomment the call below to run it.
# printnAllTables(tables)

In [9]:
def clean(table,
          start_markers=('STATES', 'AGRA'),
          end_markers=('TOTAL (ALL INDIA)', 'TOTAL (CITIES)')):
    """Trim a scraped table to the rows between its first and last real row.

    Rows detected before the first row whose second column equals one of
    ``start_markers`` are dropped, and so are rows detected after the last row
    whose second column equals one of ``end_markers``.

    Parameters
    ----------
    table : object with a ``df`` attribute (e.g. a camelot table)
        ``table.df`` is replaced by the trimmed DataFrame; the original row
        labels are kept.
    start_markers, end_markers : tuple of str, optional
        Values of the second column that mark the first and last row to keep.
        The defaults are the markers used for the state and city tables.

    Returns
    -------
    The same ``table`` object. The change is made on ``table.df`` itself, so
    callers may ignore the return value.

    Raises
    ------
    ValueError
        If no start marker is found, or no end marker is found at or after the
        start row. ``table.df`` is left untouched in that case, instead of
        being emptied row by row before failing.
    """
    labels = table.df.iloc[:, 1].tolist()

    # First row (from the top) that carries a start marker
    first = next(
        (pos for pos, label in enumerate(labels) if label in start_markers),
        None,
    )
    if first is None:
        raise ValueError(
            "clean(): none of %r found in the second column" % (start_markers,)
        )

    # Last row (from the bottom, not above the start row) that carries an end marker
    last = next(
        (pos for pos in range(len(labels) - 1, first - 1, -1)
         if labels[pos] in end_markers),
        None,
    )
    if last is None:
        raise ValueError(
            "clean(): none of %r found in the second column after the start row"
            % (end_markers,)
        )

    table.df = table.df.iloc[first:last + 1]
    return table

In [10]:
def split_by_first_space(df, strIndex, stpIndex):
    """Split the second column at its first space into serial number and text.

    For every row at positions ``strIndex`` .. ``stpIndex`` (inclusive,
    positional like ``df.iloc``), the part of the second column before the
    first space is moved into the first column and the remainder, without the
    separating space, is left in the second column.

    Cells that are not strings, or that contain no space, are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place.
    strIndex, stpIndex : int
        First and last row position to process.

    Returns
    -------
    The same ``df``, for convenience.
    """
    # Slicing a range resolves negative and out-of-range bounds the same way
    # an iloc slice does.
    for pos in range(len(df))[strIndex:stpIndex + 1]:
        cell = df.iat[pos, 1]
        if not isinstance(cell, str):
            continue
        head, separator, tail = cell.partition(' ')
        if not separator:
            # No space: there is nothing to split off
            continue
        # Write straight into df: rows handed out by iterrows() may be copies,
        # so changes made to them are not guaranteed to reach the frame.
        df.iat[pos, 0] = head
        df.iat[pos, 1] = tail
    return df

In [11]:
def mergeRows(df, strIndex, stpIndex):
    """Merge the rows labelled ``strIndex`` .. ``stpIndex`` into row ``strIndex``.

    For every column, the cells of the selected rows are converted to ``str``,
    joined with single spaces and stripped of surrounding whitespace. The
    result is written to the row labelled ``strIndex``. The other rows are
    left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place.
    strIndex, stpIndex : index labels
        First and last row label (inclusive, as with ``df.loc``).

    Returns
    -------
    The same ``df``, for convenience.
    """
    block = df.loc[strIndex:stpIndex]
    # DataFrame.items() yields (column label, column Series). It replaces
    # Series.iteritems(), which no longer exists in pandas 2.x.
    merged = [
        ' '.join(str(cell) for cell in column).strip()
        for _, column in block.items()
    ]
    # Assign on df itself; assigning on the .loc slice would not reliably
    # change the caller's frame.
    df.loc[strIndex] = merged
    return df

In [12]:
# Trim every extracted table to the rows between its header row and its total
# row; clean() replaces each table's .df, so no result needs to be collected.
for each_table in tables:
    clean(each_table)

In [13]:
# Show the scraped tables at this point (after cleaning).
# This prints every page, so the output is long;
# comment the call out if you do not need it.
printnAllTables(tables)

Page No:  1
    0                  1     2    3  4     5     6     7  8     9     10  \
5                  STATES                                                  
6    1     ANDHRA PRADESH   773   55  0   828    73    78  0   151    19   
7    2  ARUNACHAL PRADESH     0    0  0     0     0     1  0     1     0   
8    3              ASSAM    63   19  0    82   124   118  0   242    80   
9    4              BIHAR     4    2  0     6    44    67  0   111    35   
10   5       CHHATTISGARH     5    0  0     5   228   152  0   380    38   
11   6                GOA     2    0  0     2     0     4  0     4     0   
12   7            GUJARAT    78    3  0    81   179   117  0   296    44   
13   8            HARYANA     4    0  0     4   239    78  0   317    43   
14   9   HIMACHAL PRADESH     2    1  0     3    37    44  0    81    24   
15  10    JAMMU & KASHMIR     0    0  0     0     4     9  0    13     4   
16  11          JHARKHAND    11    2  0    13    62    73  0   135    12   


In [14]:
# Tables at even positions (0, 2, ...) are collected as state tables and those
# at odd positions as city tables. Copies are taken so that later edits do not
# touch the frames held by `tables`.
states = [table.df.copy() for position, table in enumerate(tables) if position % 2 == 0]
cities = [table.df.copy() for position, table in enumerate(tables) if position % 2 == 1]

In [15]:
# Column headers for the extracted tables. The columns are nested, i.e. they
# have a multi-level structure: each inner list holds the three top-level
# titles used for one city/state pair of tables.
# The multi-level structure itself is built and applied in the cells that follow.
column_header = [
    ['Bankruptcy or Indebtedness','Marriage Related Issues (Total)','Marriage Related Issues(Non-Settlement of Marriage)'],
    ['Marriage Related Issues(Dowry Related Issues)','Marriage Related Issues(Extra Marital affairs)','Marriage Related Issues(Divorce)'],
    ['Marriage Related Issues(Others)','Failure in Examination','Impotency/Infertility'],
    ['Family Problems','Illness (Total)','Illness (AIDS/STD)'],
    ['Illness (Cancer)','Illness (Paralysis)','Illness (Insanity/Mental illness)'],
    ['Illness (Other Prolonged Illness)','Death of Dear Person','Drug Abuse/Alcoholic Addiction'],
    ['Fall in Social Reputation','Ideological Causes/Hero Worshipping','Love Affairs'],
    ['Poverty','Unemployment','Property Dispute'],
    ['Suspected/ Illicit Relation(Other than Extra Marital Affairs)','Illegitimate Pregnancy(Other than Extra Marital Affairs)','Physical Abuse (Rape, etc.)'],
    ['Professional/Career Problem','Causes Not Known','Other Causes']
]
# Number of header groups
len(column_header)

10

In [16]:
def set_Column(df, title, city):
    """Give ``df`` the two-level column labels of a cause-wise table.

    The labels are, in order: ``("Sl. No.", '')``, the place column
    (``("State/UT", '')`` when ``city == False``, otherwise ``("City", '')``),
    and then, for each of ``title[0]``, ``title[1]`` and ``title[2]``, the
    four sub-columns Male, Female, Transgender and Total: 14 labels in all.

    Parameters
    ----------
    df : pandas.DataFrame
        Its ``columns`` attribute is replaced in place.
    title : sequence of str
        The first three entries are used as top-level titles.
    city : bool
        Selects the label of the place column.

    Returns nothing.
    """
    if city == False:
        place_label = "State/UT"
    else:
        place_label = "City"

    breakdown = ('Male', 'Female', 'Transgender', 'Total')
    labels = [("Sl. No.", ''), (place_label, '')]
    for cause in (title[0], title[1], title[2]):
        labels.extend((cause, part) for part in breakdown)

    df.columns = pd.MultiIndex.from_tuples(labels)

In [17]:
# For the i-th city/state pair: apply the i-th group of titles from
# column_header as two-level column labels, then move the serial number and
# place name columns into the row index.
# NOTE: both steps change the frames in place, so this cell should only be run
# once per fresh `cities`/`states`; re-run the cell that builds them first.
for index,(city,state) in enumerate(zip(cities,states)):
    set_Column(df=city,title=column_header[index],city=True)
    city.set_index(['Sl. No.','City'],inplace=True)
    set_Column(df=state,title=column_header[index],city=False)
    state.set_index(['Sl. No.','State/UT'],inplace=True)

In [18]:
# Extract the tables on pages 21-22, which are used below as the "Total"
# tables. These pages are read separately, with their own table area and
# column separator positions (hard-coded PDF coordinates, presumably measured
# for this report layout; re-check them if the PDF changes).
# NOTE(review): unlike `tables`, these tables are not passed through clean();
# confirm that the chosen area already excludes header/footer rows.
tables_total = camelot.read_pdf(fileName,
                          flavor='stream',
                          table_areas=[
                              "37.99657512116317,723.7441542627241,556.4814862681745,97.51061631215"
                              ],
                          columns=[
                              "63.487948303715676,204.8929240710824,275.1144426494346,360.72697899838454,435.7581906300485"
                              ],
                          split_text=True,
                          pages="21-22"
                         )

In [19]:
# Work on copies of the two "Total" tables.
# NOTE(review): for pages 1-20 the even positions are treated as state tables
# and the odd positions as city tables. If pages 21-22 follow the same order,
# tables_total[0] is the state table and these two assignments are swapped.
# Confirm against the PDF before relying on the output.
city_total  = tables_total[0].df.copy()
state_total = tables_total[1].df.copy()

In [20]:
def set_Column_last(df, city):
    """Give ``df`` the two-level column labels of a "Total" table.

    The labels are, in order: ``("Sl. No.", '')``, the place column
    (``("State/UT", '')`` when ``city == False``, otherwise ``("City", '')``),
    and the four sub-columns Male, Female, Transgender and Total under the
    top-level title ``"Total"``: 6 labels in all.

    Parameters
    ----------
    df : pandas.DataFrame
        Its ``columns`` attribute is replaced in place.
    city : bool
        Selects the label of the place column.

    Returns nothing.
    """
    if city == False:
        place_label = "State/UT"
    else:
        place_label = "City"

    labels = [("Sl. No.", ''), (place_label, '')]
    labels += [("Total", part) for part in ('Male', 'Female', 'Transgender', 'Total')]

    df.columns = pd.MultiIndex.from_tuples(labels)

In [21]:
# Label the city "Total" table and move serial number and city name into the row index
set_Column_last(df=city_total,city=True)
city_total.set_index(['Sl. No.','City'],inplace=True)

# Same for the state "Total" table
set_Column_last(df=state_total,city=False)
state_total.set_index(['Sl. No.','State/UT'],inplace=True)

In [22]:
# Add the "Total" tables as the last entry of each list.
# NOTE: re-running this cell appends them again; rebuild `cities`/`states` first.
cities.append(city_total)
states.append(state_total)

In [23]:
# In case we're interested in seeing the scraped tables at this point.
# Commented out as it takes too much screen space.
# Uncomment the call below to run it.
# printnAllTables(tables)

# Or print a single one of your choice
# tables[23].df

In [24]:
# Concatenate the tables side by side (axis=1) into one wide frame per kind.
# This assumes every table in a list carries the same (Sl. No., name) row
# index; verify the result for unexpected extra rows or missing values.
City  = pd.concat(cities,axis=1)
State = pd.concat(states,axis=1)

In [26]:
# Turn the row index back into ordinary columns before saving, so the serial
# number and place name appear as regular fields in the raw CSV.
# The files are written to the current working directory.
City.reset_index().to_csv("Causes – wise Distribution of Suicides during 2019 (City).csv", index=False)
State.reset_index().to_csv("Causes – wise Distribution of Suicides during 2019 (StateUT).csv", index=False)