In [4]:
import wget
import ssl
import os
# Replace the ssl module's default HTTPS context factory with the unverified one, so HTTPS
# requests that rely on the default context skip certificate verification for the rest of
# this session.
# NOTE(review): presumably a workaround for certificate errors when downloading from the
# NCRB site - confirm it is still needed; it weakens security for every later HTTPS request
# that relies on the default context.
ssl._create_default_https_context = ssl._create_unverified_context

In [5]:
# Enter NCRB Individual PDF url
pdf_url = 'https://ncrb.gov.in/sites/default/files/adsi_reports_previous_year/Table-1.7_2019.pdf'
# Take the local file name from the last path segment of the URL rather than a
# hard-coded character offset, so pointing pdf_url at another report still yields
# the right name.
fileName = pdf_url.rsplit('/', 1)[-1]
try:
    # Delete any previous copy so the report is downloaded afresh
    os.remove(fileName)
except FileNotFoundError:
    # Nothing to delete on the first run
    pass
# Download report; wget.download returns the path it saved to
fileName = wget.download(pdf_url, fileName)

## What is the final DataFrame supposed to look like after stitching (concatenating)?

| tableA   | tableB   | tableC   |
| -------- | -------- | -------- |
| Page1    | Page3    | Page5    |
| Page2    | Page4    | Page6    |

In [6]:
import numpy as np
import pandas as pd
import camelot

In [7]:
fileName

'Table-1.7_2019.pdf'

In [163]:
# Scrape pages 1 to 3 with camelot's "stream" parser.
# The coordinate strings below were picked for this specific PDF layout.
tables = camelot.read_pdf(
    fileName,
    pages="1-3",
    flavor='stream',
    # Region of each page to parse, given as "x1,y1,x2,y2"
    table_areas=["21.162649434571893,728.5539356601939,582.9347980613894,115.0160479455716"],
    # x-coordinates of the column separators inside that region
    columns=["44.2491760904685,174.11088852988692,207.7787399030695,246.25628432956384,274.15250403877224,307.8203554119548,346.29789983844915,381.88962843295644,409.7858481421648,447.30145395799684,482.8931825525041,518.4849111470114,546.3811308562198"],
    # Split text that runs across a column separator into the neighbouring cells
    split_text=True,
    # Vertical tolerance used when grouping text into rows
    row_tol=12,
)

In [158]:
tables

<TableList n=3>

In [159]:
def printnAllTables(tables):
    """Print every table's DataFrame, each preceded by a 1-based "Page No" label.

    Parameters
    ----------
    tables : iterable of objects exposing a ``df`` attribute
        The tables to print, in order.
    """
    for position, table in enumerate(tables, start=1):
        print("Page No: ", position)
        # Finish each table with an empty line so consecutive tables stay separated.
        print(table.df, end="\n\n")

In [None]:
# In case we're interested in seeing the original scraped tables.
# Commented out as it takes too much screen space.
# Uncomment to run if you want.
# printnAllTables(tables)

In [160]:
# Removes garbage rows that may have been detected before the 'Air Crash' or 'Sudden Deaths (Total)' row
# AND
# removes garbage rows that may have been detected after the 'Total' or 'Stampede' row
def clean(table):
    """Trim ``table.df`` to the rows between its first and last marker rows.

    The second column of ``table.df`` is scanned for the first row labelled
    'Air Crash' or 'Sudden Deaths (Total)' and, from there on, the last row
    labelled 'Total' or 'Stampede'. Rows outside that span are dropped; the
    remaining rows keep their original index labels.

    Parameters
    ----------
    table : object with a ``df`` attribute holding a pandas DataFrame
        ``table.df`` is re-assigned when rows have to be dropped.

    Returns
    -------
    The same ``table`` object (returned for convenience; the change is already
    visible through the caller's reference).

    Raises
    ------
    IndexError
        If no start marker exists, or no end marker exists at or after the
        start marker. ``table.df`` is left untouched in that case.
    """
    first_row_labels = ['Air Crash', 'Sudden Deaths (Total)']
    last_row_labels = ['Total', 'Stampede']

    labels = table.df.iloc[:, 1].tolist()

    # Locate both boundaries before touching the table, so a table without the
    # expected markers fails with a clear message instead of being stripped empty.
    start = next(
        (pos for pos, label in enumerate(labels) if label in first_row_labels),
        None,
    )
    if start is None:
        raise IndexError(
            "clean(): no row labelled one of %r found in column 1" % (first_row_labels,)
        )

    end = next(
        (pos for pos in range(len(labels) - 1, start - 1, -1)
         if labels[pos] in last_row_labels),
        None,
    )
    if end is None:
        raise IndexError(
            "clean(): no row labelled one of %r found at or after the start row in column 1"
            % (last_row_labels,)
        )

    # Only re-assign when something actually has to be dropped.
    if start > 0 or end < len(labels) - 1:
        table.df = table.df.iloc[start:end + 1]
    return table

In [161]:
# Trim every scraped table; clean() re-assigns each table's .df,
# so its return value is not needed here.
for each_table in tables:
    clean(each_table)

In [None]:
# In case we're interested in seeing the scraped tables at this point.
# Commented as it takes too much screen space
# Run if you want
# printnAllTables(tables)

In [164]:
# Scrape page 4 with camelot's "stream" parser.
# This page gets its own call because its table area and column separators
# differ from the ones used for pages 1 to 3.
table4 = camelot.read_pdf(
    fileName,
    pages="4",
    flavor='stream',
    # Region of the page to parse, given as "x1,y1,x2,y2"
    table_areas=["21.162649434571893,713.1626351882904,582.9347980613894,385.13554388084685"],
    # x-coordinates of the column separators inside that region
    columns=["44.2491760904685,167.37731825525043,204.8929240710824,243.37046849757675,267.4189337641357,305.8964781906301,344.3740226171244,382.8515670436188,409.7858481421648,446.33951534733444,479.04542810985464,517.522972536349,544.4572536348951"],
    # Split text that runs across a column separator into the neighbouring cells
    split_text=True,
    # Vertical tolerance used when grouping text into rows
    row_tol=12,
)

In [165]:
table4[0].df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,14.0,Sudden Deaths (Total),11373,1877,4,13254,14421,2188,2,16611,8323,2182,0,10505
1,,14.1 Heart Attack,6858,893,1,7752,9706,1334,2,11042,5267,1345,0,6612
2,,14.2 Others,4515,984,3,5502,4715,854,0,5569,3056,837,0,3893
3,15.0,Deaths of Women during \nPregnancy (Total),0,304,0,304,0,17,0,17,0,1,0,1
4,,15.1 Due to Abortions,0,34,0,34,0,7,0,7,0,0,0,0
5,,15.2 Other than Abortions,0,270,0,270,0,10,0,10,0,1,0,1
6,16.0,Deaths Due to Consumption of \nIllicit/Spuriou...,498,48,0,546,335,25,0,360,98,5,0,103
7,17.0,Killed / Injured by Animals,292,70,0,362,353,100,0,453,203,73,0,276
8,18.0,Poisoning (Total),4535,1943,0,6478,3046,1324,0,4370,1186,571,1,1758
9,,18.1 Food Poisoning,236,117,0,353,154,88,0,242,38,42,0,80


In [None]:
# Top-level header titles. The columns are nested, i.e. they exhibit a multi-level index structure:
# column_header[i] is passed to set_Column() as the top-level title for tables[i].
# The multi-level structure itself is generated and applied in the following cells.
column_header = [
    "Air Crash",
    "Ship Accidents",
    "Collapse of Structure(Total)",
    "Collapse of Structure (Dwelling House/Residential building)",
    "Collapse of Structure (Official/Commercial Building)",
    "Collapse of Structure (Dam)",
    "Collapse of Structure (Bridge)",
    "Collapse of Structure (Others)",
    "Drowning (Total)",
    "Drowning (Boat Capsize)",
    "Drowning (Accidental Falls into Water body)",
    "Drowning (Other Cases)",
    "Electrocution",
    "Accidental Explosion (Total)",
    "Accidental Explosion (Domestic Gas Cylinder)",
    "Accidental Explosion (Industrial Boiler/Gas Cylinder Explosion)",
    "Accidental Explosion (Ammunition Explosion in Armed Forces/Police/CPMF)",
    "Accidental Explosion (Other)",
    "Falls (Total)",
    "Falls (from Height)",
    "Falls (from Vehicles (Automobile like Bus, Trucks, etc.))",
    "Falls (into Manhole)",
    "Falls (into Pit)",
    "Falls (into Borewell)",
    "Falls (Others)",
    "Factory / Machine Accidents",
    "Accidental Fire (Total)",
    # Closing parenthesis was missing from this label
    "Accidental Fire (Electrical Short Circuit)",
    "Accidental Fire (Riot/Agitation)",
    "Accidental Fire (Fireworks)",
    "Accidental Fire (Cooking Gas Cylinder/Stove Burst)",
    "Accidental Fire (Other Causes)",
    "Firearm",
    "Mines or Quarry Disaster",
    "Traffic Accidents (Total)",
    "Road Accidents",
    "Railway Crossing Accidents",
    "Railway Accidents",
    "Stampede",
    "Sudden Deaths (Total)",
    "Sudden Deaths (Heart Attack)",
    "Sudden Deaths (Others)",
    "Deaths of Women during Pregnancy (Total)",
    "Deaths of Women during Pregnancy (Due to Abortions)",
    "Deaths of Women during Pregnancy (Other than Abortions)",
    "Deaths due to Consumption of Illicit/Spurious Liquor",
    "Killed / Injured by Animals",
    "Poisoning (Total)",
    "Poisoning (Food Poisoning)",
    "Poisoning (Accidental Intake of Insecticides/Pesticides)",
    "Poisoning [Poisonous Gases (Total)]",
    "Poisoning [Poisonous Gases (Carbon Monoxide (CO) Gas)]",
    "Poisoning [Poisonous Gases (Other poisonous Gases)]",
    "Poisoning (Snake Bite)",
    "Poisoning (Animal/Reptiles/Insects Bite)",
    "Poisoning (Other)",
    "Suffocation",
    "Drug Overdose",
    "Other than above Causes",
    "Causes Not Known",
    "Total"
]
# Show how many titles are defined
len(column_header)

In [None]:
def set_Column(df, title):
    """Replace ``df``'s columns with a fixed three-level MultiIndex of 11 entries.

    The first two columns are labelled "Sl. No." and "State/UT"; the other nine
    are grouped under ``title``: one "No. of Cases" column, followed by
    Male/Female/Transgender/Total columns for "No. of Persons Injured" and for
    "No. of Persons Died".

    The assignment happens on ``df`` itself; nothing is returned.

    NOTE(review): the page-4 preview earlier in this notebook shows 14 columns,
    whereas 11 labels are assigned here - confirm this layout matches the
    scraped tables.
    """
    breakdown = ('Male', 'Female', 'Transgender', 'Total')
    measures = ('No. of Persons Injured', 'No. of Persons Died')

    labels = [
        ("Sl. No.", '', ''),
        ("State/UT", '', ''),
        (title, "No. of Cases", ''),
    ]
    labels += [(title, measure, part) for measure in measures for part in breakdown]

    df.columns = pd.MultiIndex.from_tuples(labels)

In [None]:
# Give each table its multi-level header (titled by the matching column_header entry),
# then move the two identifying columns into the row index.
for i, scraped in enumerate(tables):
    set_Column(scraped.df, column_header[i])
    # inplace=True so the change lands on the DataFrame held by the table object
    scraped.df.set_index(["Sl. No.", "State/UT"], inplace=True)

In [None]:
# In case we're interested in seeing the scraped tables at this point.
# Commented as it takes too much screen space
# Run if you want
# printnAllTables(tables)

# Or print a single one of your choice
# tables[23].df

In [None]:
# Convert List of tables to list of DataFrames
DataFrames = [each.df.copy() for each in tables]
# Concat all the tables one after the other, as they all have the same indexes.
final = pd.concat(DataFrames,axis=1)

In [None]:
# Turn the index back into ordinary columns before saving, for better formatting in the raw CSV
output_csv = "Other Causes-wise Number of Cases, Persons Injured & Persons Died during 2019 (State & UT-wise).csv"
final.reset_index().to_csv(output_csv, index=False)