In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
import calendar

In [4]:
import requests
import urllib.parse

In [5]:
from geopy.geocoders import Nominatim
import time
from pprint import pprint

In [6]:
from geopy.geocoders import ArcGIS

In [7]:
# Build the Nominatim geocoder client under the "tutorial" user agent.
geocoder_options = {"user_agent": "tutorial"}
app = Nominatim(**geocoder_options)

In [8]:
# Load the disasters table (with latitude/longitude columns) from disk.
input_csv = "disasters_clean_lat_lon.csv"
df = pd.read_csv(input_csv)

In [9]:
# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None

In [14]:
# Fill gaps across the whole frame with second-order polynomial interpolation.
df = df.interpolate(
    method="polynomial",
    order=2,
)

In [20]:
# Linearly interpolate the numeric columns (forward direction only).
numeric = df[['Start Day', 'Start Month', 'End Day', 'End Month', 'CPI','Total Affected', 'No Homeless', 'No Affected', 'No Injured', 'Total Deaths']]
numeric_columns = numeric.columns

df[numeric_columns] = df[numeric_columns].interpolate(method='linear', limit_direction='forward')

# Columns that receive outlier and non-finite replacement. 'CPI' is
# interpolated above but is intentionally not part of this list.
columns_to_clean = [
    'Start Month', 'Start Day', 'End Month', 'End Day',
    'Total Affected', 'No Homeless', 'No Affected', 'No Injured', 'Total Deaths',
]

for column in columns_to_clean:
    values = df[column]

    # Statistics are taken from the interpolated column, before any replacement.
    median = values.median()
    iqr = values.quantile(0.75) - values.quantile(0.25)
    lower_bound = median - iqr * 1.5
    upper_bound = median + iqr * 1.5

    # Replace values outside median +/- 1.5 * IQR with the median.
    # NaN compares False on both sides, so it survives this step;
    # +/-inf always falls outside the bounds and is replaced here.
    values = values.mask((values > upper_bound) | (values < lower_bound), median)

    # Replace whatever is still non-finite (NaN, +/-inf) with the median so
    # the integer cast below does not fail on these columns.
    df[column] = values.where(np.isfinite(values), median)

# Throw away fractional parts with a cast to int.
# NOTE(review): 'Start Year' and 'End Year' get no median fill above, so this
# cast assumes they contain no missing values -- confirm against the input file.
integer_columns = [
    'Start Year', 'Start Month', 'Start Day',
    'End Year', 'End Month', 'End Day',
    'Total Affected',
    'No Homeless', 'No Affected', 'No Injured',
    'Total Deaths',
]
df[integer_columns] = df[integer_columns].astype(np.int64)

# Check if the day is out of range for the given month, and if it is, increment the month by 1 and set the day to 1
def _roll_invalid_day_forward(row, prefix):
    """Move an out-of-range day to the first day of the following month.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or dict)
        Must expose '<prefix> Year', '<prefix> Month' and '<prefix> Day'.
    prefix : str
        Column prefix, 'Start' or 'End'.

    Returns
    -------
    The same ``row`` object, modified in place only when the day exceeds the
    number of days in the given month.

    Raises
    ------
    Whatever ``calendar.monthrange`` raises for a month outside 1..12.
    """
    year_key = prefix + ' Year'
    month_key = prefix + ' Month'
    day_key = prefix + ' Day'

    # Plain ints keep calendar.monthrange happy regardless of the row's dtype.
    year = int(row[year_key])
    month = int(row[month_key])
    day = int(row[day_key])

    # monthrange knows about leap years, so 29 February is accepted when the
    # year really has one (a fixed 28-day limit would wrongly move it).
    days_in_month = calendar.monthrange(year, month)[1]

    if day > days_in_month:
        row[day_key] = 1
        if month == 12:
            # December rolls over into January of the next year.
            row[month_key] = 1
            row[year_key] = year + 1
        else:
            row[month_key] = month + 1
    return row


def check_start_date_validity(row):
    """Fix an out-of-range start day by moving the start date to the 1st of the next month.

    Reads and (when needed) updates 'Start Year', 'Start Month' and 'Start Day'
    on ``row`` and returns ``row``.
    """
    return _roll_invalid_day_forward(row, 'Start')


def check_end_date_validity(row):
    """Fix an out-of-range end day by moving the end date to the 1st of the next month.

    Reads and (when needed) updates 'End Year', 'End Month' and 'End Day'
    on ``row`` and returns ``row``.
    """
    return _roll_invalid_day_forward(row, 'End')

# Run the row-wise date checks over the start and end date columns.
start_date_columns = ['Start Year', 'Start Month', 'Start Day']
end_date_columns = ['End Year', 'End Month', 'End Day']

df[start_date_columns] = df[start_date_columns].apply(check_start_date_validity, axis=1)
df[end_date_columns] = df[end_date_columns].apply(check_end_date_validity, axis=1)

In [14]:
# Store the injured counts as floating point values.
injured_counts = df["No Injured"]
df["No Injured"] = injured_counts.astype("float64")

In [15]:
# Fill gaps in the injured counts by linear interpolation.
injured_counts = df["No Injured"]
df["No Injured"] = injured_counts.interpolate(method="linear")

In [17]:
# Preview the first 15 rows of the table.
preview = df.head(15)
display(preview)

Unnamed: 0.1,Unnamed: 0,Year,Disaster Subgroup,Disaster Type,Disaster Subtype,Country,ISO,Region,Continent,Location,Origin,Dis Mag Value,Dis Mag Scale,Latitude,Longitude,River Basin,Start Year,Start Month,Start Day,End Year,End Month,End Day,Total Deaths,No Injured,No Affected,No Homeless,Total Affected,Insured Damages ('000 US$),Total Damages ('000 US$),CPI,Total Damages Adjusted ('000 US$)
0,1,1900,Climatological,Drought,Drought,India,IND,Southern Asia,Asia,Bengal,,,Km2,22.351115,78.667743,,1900,,,1900,,,1250000.0,,,,,,,3.221647,
1,2,1902,Geophysical,Earthquake,Ground movement,Guatemala,GTM,Central America,Americas,"Quezaltenango, San Marcos",,8.0,Richter,15.585555,-90.345759,,1902,4.0,18.0,1902,4.0,18.0,2000.0,,,,,,25000.0,3.350513,746154.36
2,3,1902,Geophysical,Volcanic activity,Ash fall,Guatemala,GTM,Central America,Americas,,,,,15.585555,-90.345759,,1902,4.0,8.0,1902,4.0,8.0,1000.0,,,,,,,3.350513,
3,4,1902,Geophysical,Volcanic activity,Ash fall,Guatemala,GTM,Central America,Americas,,,,,15.585555,-90.345759,,1902,10.0,24.0,1902,10.0,24.0,6000.0,,,,,,,3.350513,
4,7,1904,Meteorological,Storm,Tropical cyclone,Bangladesh,BGD,Southern Asia,Asia,Chittagong,,,Kph,24.476929,90.293441,,1904,11.0,,1904,11.0,,,,,,,,,3.479379,
5,9,1905,Geophysical,Earthquake,Ground movement,India,IND,Southern Asia,Asia,Kangra,,8.0,Richter,22.351115,78.667743,,1905,4.0,4.0,1905,4.0,4.0,20000.0,,,,,,25000.0,3.479379,718519.01
6,10,1906,Geophysical,Earthquake,Ground movement,Chile,CHL,South America,Americas,Valparaiso,,8.0,Richter,-31.761336,-71.31877,,1906,8.0,16.0,1906,8.0,16.0,20000.0,,,,,,100000.0,3.479379,2874076.05
7,11,1906,Geophysical,Earthquake,Ground movement,Colombia,COL,South America,Americas,Tumako,,9.0,Richter,4.099917,-72.908813,,1906,1.0,31.0,1906,1.0,31.0,400.0,,,,,,,3.479379,
8,12,1906,Hydrological,Flood,,Belgium,BEL,Western Europe,Europe,Louvain region,,,Km2,50.640281,4.666715,,1906,5.0,14.0,1906,5.0,14.0,6.0,,,,,,,3.479379,
9,13,1906,Hydrological,Flood,,Belgium,BEL,Western Europe,Europe,,,,Km2,50.640281,4.666715,,1906,4.0,,1906,4.0,,,,,,,,,3.479379,


In [22]:
# Write the cleaned table to disk.
# NOTE(review): index=True writes the row index as an extra unnamed column; the
# displayed header already shows 'Unnamed: 0.1' and 'Unnamed: 0', so each
# save/load round trip appears to add another one -- confirm this is intended.
output_csv = "disasters_clean.csv"
df.to_csv(output_csv, index=True)