In [None]:
import wget
import ssl
import os
# NOTE(review): this swaps the ssl module's default HTTPS context factory for
# the unverified one, so code relying on that default performs no certificate
# verification for the rest of the session. Presumably a workaround for
# certificate errors when downloading the NCRB report -- confirm it is still needed.
ssl._create_default_https_context = ssl._create_unverified_context

In [None]:
# Enter NCRB Individual PDF url
pdf_url = 'https://ncrb.gov.in/sites/default/files/adsi_reports_previous_year/Table-1.9_2019.pdf'
# Use the last path segment of the URL as the local file name. Unlike a fixed
# character offset, this keeps working when pdf_url points at another report
# or at a different directory on the site.
fileName = pdf_url.rsplit('/', 1)[-1]
try:
    # Delete any earlier copy so the report is downloaded again
    os.remove(fileName)
except FileNotFoundError:
    # No earlier copy exists (e.g. first run) - nothing to delete
    pass
# Download report and keep the file name reported back by wget
fileName = wget.download(pdf_url, fileName)

In [None]:
import numpy as np
import pandas as pd
import camelot

In [None]:
# Show the local file name of the downloaded report
fileName

In [None]:
# Page region handed to camelot as the single entry of `table_areas`.
table_region = "33.776054792655636,721.2102930792192,564.5445283018868,64.47573400376504"
# Column separator positions handed to camelot as the single entry of `columns`.
column_separators = "54.83050080775445,161.60568659127628,203.93098546042006,256.83760904684976,308.7822940226172,340.526268174475,379.00381260096935,430.9484975767367,477.12155088852995,515.5990953150243"

# Parse every page of the downloaded PDF with the stream flavor, using the
# same region and column separators for each page.
tables = camelot.read_pdf(
    fileName,
    pages="all",
    flavor='stream',
    table_areas=[table_region],
    columns=[column_separators],
    split_text=True,
)

In [None]:
# Show the collection returned by camelot.read_pdf
tables

In [None]:
def printnAllTables(tables):
    """Print the DataFrame of every table, each preceded by a 1-based counter.

    Parameters
    ----------
    tables : sequence of objects exposing a ``df`` attribute
        The tables to print, in order.
    """
    for page_no, table in enumerate(tables, start=1):
        print("Page No: ", page_no)
        print(table.df)
        # Blank line between consecutive tables
        print()

In [None]:
# In case we're interested in seeing the original scraped tables.
# Commented as it takes too much screen space
# Run if you want
# printnAllTables(tables)

In [None]:
# Removes garbage rows that may have been detected before the 'STATES' row
# AND
# removes garbage rows that may have been detected after the 'TOTAL (ALL INDIA)' row
def clean(table):
    """Trim ``table.df`` to the rows from 'STATES' through 'TOTAL (ALL INDIA)'.

    The markers are looked up in the second column. The kept range starts at
    the first 'STATES' row and ends at the last 'TOTAL (ALL INDIA)' row found
    at or after it; both marker rows are kept and the original row labels are
    preserved.

    Parameters
    ----------
    table : object exposing a ``df`` attribute holding a pandas DataFrame
        ``table.df`` is reassigned to the trimmed frame.

    Returns
    -------
    The same ``table`` object. Returning it is a convenience only, since the
    trimmed frame is already stored on ``table.df``.

    Raises
    ------
    ValueError
        If either marker row cannot be found. ``table.df`` is left untouched
        in that case.
    """
    labels = table.df.iloc[:, 1].tolist()

    # Locate both markers before touching table.df, so a table without them
    # fails with a clear message instead of being stripped down to nothing.
    try:
        first = labels.index('STATES')
    except ValueError:
        raise ValueError("no 'STATES' row found in the second column") from None

    # Search backwards from the end, but never look above the 'STATES' row.
    last = next(
        (pos for pos in range(len(labels) - 1, first - 1, -1)
         if labels[pos] == 'TOTAL (ALL INDIA)'),
        None,
    )
    if last is None:
        raise ValueError(
            "no 'TOTAL (ALL INDIA)' row found after the 'STATES' row"
        )

    table.df = table.df.iloc[first:last + 1]
    return table

In [None]:
# Splits the second column into 1st column index and 2nd column text when faulty/required
def split_by_first_space(df, strIndex, stpIndex):
    """Split second-column cells at their first space, in place.

    For every row position from ``strIndex`` through ``stpIndex`` (inclusive),
    the part of the second-column text before the first space is written to
    the first column, and the remainder (starting at that space) is written
    back to the second column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    strIndex : int
        Position of the first row to process.
    stpIndex : int
        Position of the last row to process (inclusive).

    Returns
    -------
    None

    Notes
    -----
    Cells that are not strings, or that contain no space, are left untouched.
    """
    # Resolve the positions with normal slice rules (clamping, negatives).
    start, stop, step = slice(strIndex, stpIndex + 1).indices(len(df))
    for pos in range(start, stop, step):
        text = df.iat[pos, 1]
        if not isinstance(text, str):
            continue
        space = text.find(' ')
        if space == -1:
            # find() reports "no space" as -1; slicing with it would cut off
            # the last character instead of splitting, so skip the cell.
            continue
        # Write through df.iat: rows yielded by iterrows() may be copies, so
        # assigning to them is not guaranteed to reach the frame.
        df.iat[pos, 0] = text[:space]
        df.iat[pos, 1] = text[space:]

In [None]:
# Trim the stray rows around the state-wise data on every scraped table.
# clean() stores the trimmed frame on the table itself, so its return value
# is not needed here.
for scraped in tables:
    clean(scraped)

In [None]:
# In case we're interested in seeing the scraped tables at this point.
# Commented as it takes too much screen space
# Run if you want
# printnAllTables(tables)

In [None]:
# Top-level column titles, one per scraped table and in table order.
# The columns are nested, i.e. they exhibit a multi-level/index structure;
# that structure is generated and applied in the next step.
column_header = [
    "Air Crash",
    "Ship Accidents",
    "Collapse of Structure(Total)",
    "Collapse of Structure (Dwelling House/Residential building)",
    "Collapse of Structure (Official/Commercial Building)",
    "Collapse of Structure (Dam)",
    "Collapse of Structure (Bridge)",
    "Collapse of Structure (Others)",
    "Drowning (Total)",
    "Drowning (Boat Capsize)",
    "Drowning (Accidental Falls into Water body)",
    "Drowning (Other Cases)",
    "Electrocution",
    "Accidental Explosion (Total)",
    "Accidental Explosion (Domestic Gas Cylinder)",
    "Accidental Explosion (Industrial Boiler/Gas Cylinder Explosion)",
    "Accidental Explosion (Ammunition Explosion in Armed Forces/Police/CPMF)",
    "Accidental Explosion (Other)",
    "Falls (Total)",
    "Falls (from Height)",
    "Falls (from Vehicles (Automobile like Bus, Trucks, etc.))",
    "Falls (into Manhole)",
    "Falls (into Pit)",
    "Falls (into Borewell)",
    "Falls (Others)",
    "Factory / Machine Accidents",
    "Accidental Fire (Total)",
    # Closing parenthesis was missing here, leaving a malformed CSV header
    "Accidental Fire (Electrical Short Circuit)",
    "Accidental Fire (Riot/Agitation)",
    "Accidental Fire (Fireworks)",
    "Accidental Fire (Cooking Gas Cylinder/Stove Burst)",
    "Accidental Fire (Other Causes)",
    "Firearm",
    "Mines or Quarry Disaster",
    "Traffic Accidents (Total)",
    "Road Accidents",
    "Railway Crossing Accidents",
    "Railway Accidents",
    "Stampede",
    "Sudden Deaths (Total)",
    "Sudden Deaths (Heart Attack)",
    "Sudden Deaths (Others)",
    "Deaths of Women during Pregnancy (Total)",
    "Deaths of Women during Pregnancy (Due to Abortions)",
    "Deaths of Women during Pregnancy (Other than Abortions)",
    "Deaths due to Consumption of Illicit/Spurious Liquor",
    "Killed / Injured by Animals",
    "Poisoning (Total)",
    "Poisoning (Food Poisoning)",
    "Poisoning (Accidental Intake of Insecticides/Pesticides)",
    "Poisoning [Poisonous Gases (Total)]",
    "Poisoning [Poisonous Gases (Carbon Monoxide (CO) Gas)]",
    "Poisoning [Poisonous Gases (Other poisonous Gases)]",
    "Poisoning (Snake Bite)",
    "Poisoning (Animal/Reptiles/Insects Bite)",
    "Poisoning (Other)",
    "Suffocation",
    "Drug Overdose",
    "Other than above Causes",
    "Causes Not Known",
    "Total"
]
# Show how many titles are defined, for comparison with the number of tables
len(column_header)

In [None]:
def set_Column(df, title):
    """Replace the columns of ``df`` with a three-level header, in place.

    The first two columns are labelled "Sl. No." and "State/UT". The
    remaining nine are grouped under ``title``: one "No. of Cases" column,
    followed by Male/Female/Transgender/Total columns for persons injured and
    then for persons died.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` attribute is overwritten; it must have
        eleven columns.
    title : str
        Top-level label shared by the nine data columns.
    """
    breakdown = ('Male', 'Female', 'Transgender', 'Total')

    header = [
        ("Sl. No.", '', ''),
        ("State/UT", '', ''),
        (title, "No. of Cases", ''),
    ]
    # Injured first, then died - each split by the same four categories.
    for outcome in ('No. of Persons Injured', 'No. of Persons Died'):
        header.extend((title, outcome, category) for category in breakdown)

    df.columns = pd.MultiIndex.from_tuples(header)

In [None]:
# Give each table its three-level header, using the title at the same
# position in column_header, then move serial number and state into the
# row index.
for position, table in enumerate(tables):
    set_Column(table.df, column_header[position])
    table.df.set_index(["Sl. No.", "State/UT"], inplace=True)

In [None]:
# In case we're interested in seeing the scraped tables at this point.
# Commented as it takes too much screen space
# Run if you want
# printnAllTables(tables)

# OR print a single one of your choice
# tables[23].df

In [None]:
# Collect an independent copy of every table's DataFrame
DataFrames = []
for table in tables:
    DataFrames.append(table.df.copy())
# Place the frames side by side (column-wise), as they all have the same indexes.
final = pd.concat(DataFrames, axis=1)

In [None]:
# Turn the index back into regular columns before saving, for better
# formatting in the raw CSV; the fresh positional index is not written.
output_csv = "Other Causes-wise Number of Cases, Persons Injured & Persons Died during 2019 (State & UT-wise).csv"
final.reset_index().to_csv(output_csv, index=False)