In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Input/output locations. Both are relative paths, so they resolve against the
# kernel's current working directory.
RAW_FILE_PATH: str = "../data/raw/tsunami-events.tsv"  # tab-separated raw export
PROCESSED_FILE_PATH: str = "../data/processed/tsunami-events.csv"  # cleaned output

### Data Cleaning

In [3]:
# Load the raw tab-separated export; read_table uses "\t" as its default delimiter.
tsunami_df = pd.read_table(RAW_FILE_PATH)
tsunami_df.head(5)

Unnamed: 0,Search Parameters,Year,Mo,Dy,Hr,Mn,Sec,Tsunami Event Validity,Tsunami Cause Code,Earthquake Magnitude,...,Total Missing,Total Missing Description,Total Injuries,Total Injuries Description,Total Damage ($Mil),Total Damage Description,Total Houses Destroyed,Total Houses Destroyed Description,Total Houses Damaged,Total Houses Damaged Description
0,[],,,,,,,,,,...,,,,,,,,,,
1,,-2000.0,,,,,,1.0,1.0,,...,,,,,,4.0,,,,
2,,-1610.0,,,,,,4.0,6.0,,...,,,,,,3.0,,,,
3,,-1365.0,,,,,,1.0,1.0,,...,,,,,,3.0,,,,
4,,-1300.0,,,,,,2.0,0.0,6.0,...,,,,,,,,,,


In [4]:
# List every column name in the raw table (the next cell selects a subset of these).
tsunami_df.columns

Index(['Search Parameters', 'Year', 'Mo', 'Dy', 'Hr', 'Mn', 'Sec',
       'Tsunami Event Validity', 'Tsunami Cause Code', 'Earthquake Magnitude',
       'Vol', 'More Info', 'Deposits', 'Country', 'Location Name', 'Latitude',
       'Longitude', 'Maximum Water Height (m)', 'Number of Runups',
       'Tsunami Magnitude (Abe)', 'Tsunami Magnitude (Iida)',
       'Tsunami Intensity', 'Deaths', 'Death Description', 'Missing',
       'Missing Description', 'Injuries', 'Injuries Description',
       'Damage ($Mil)', 'Damage Description', 'Houses Destroyed',
       'Houses Destroyed Description', 'Houses Damaged',
       'Houses Damaged Description', 'Total Deaths', 'Total Death Description',
       'Total Missing', 'Total Missing Description', 'Total Injuries',
       'Total Injuries Description', 'Total Damage ($Mil)',
       'Total Damage Description', 'Total Houses Destroyed',
       'Total Houses Destroyed Description', 'Total Houses Damaged',
       'Total Houses Damaged Description'],
 

In [5]:
# Columns carried forward into the cleaned dataset.
features_selected = [
    # when
    "Year", "Mo",
    # event characteristics
    "Tsunami Event Validity", "Earthquake Magnitude",
    # where
    "Country", "Location Name", "Latitude", "Longitude",
    # severity and impact
    "Tsunami Intensity", "Total Deaths", "Total Damage ($Mil)",
    "Total Houses Destroyed", "Total Injuries",
]

# Keep only events from the year 1500 onward. Rows with a missing Year compare
# False here, so they are dropped as well.
from_1500_onward = tsunami_df["Year"].ge(1500)
filtered_df = tsunami_df.loc[from_1500_onward, features_selected].reset_index(drop=True)
filtered_df

Unnamed: 0,Year,Mo,Tsunami Event Validity,Earthquake Magnitude,Country,Location Name,Latitude,Longitude,Tsunami Intensity,Total Deaths,Total Damage ($Mil),Total Houses Destroyed,Total Injuries
0,1500.0,7.0,-1.0,,JAPAN,TOKAIDO,,,,,,,
1,1500.0,,2.0,,USA,HAWAII,,,,,,,
2,1504.0,4.0,1.0,,PORTUGAL,LISBON,,,,,,,
3,1508.0,5.0,2.0,7.1,GREECE,ISLAND OF CRETE,35.000,25.500,3.0,,,,
4,1509.0,6.0,2.0,,CHINA,SOUTH CHINA SEA,31.500,121.500,0.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2594,2022.0,1.0,4.0,,TONGA,TONGA ISLANDS,-20.536,-175.382,,,,,
2595,2022.0,1.0,4.0,,TONGA,TONGA ISLANDS,-20.536,-175.382,,,,,
2596,2022.0,1.0,4.0,,TONGA,TONGA ISLANDS,-20.536,-175.382,,6.0,96.9,,20.0
2597,2022.0,1.0,4.0,,TONGA,TONGA ISLANDS,-20.536,-175.382,,,,,


In [6]:
# Normalise the headers to snake_case (lower-case, spaces -> underscores), then
# give the two awkward names ("total_damage_($mil)", "mo") readable replacements.
filtered_df = (
    filtered_df
    .rename(columns=lambda label: label.lower().replace(" ", "_"))
    .rename(columns={
        "total_damage_($mil)": "total_damage_millions",
        "mo": "month",
    })
)

In [7]:
# Turn numeric months into English month names; a missing month stays missing.
# NOTE: this cell is not re-runnable on its own, because the month column no
# longer holds numbers after it has executed once.
month_as_timestamp = pd.to_datetime(filtered_df["month"], format="%m")
filtered_df = filtered_df.assign(
    month=month_as_timestamp.dt.month_name(),
    # The earlier Year >= 1500 filter removed rows without a year, so the
    # integer cast has no NaN values to trip over.
    year=filtered_df["year"].astype(int),
)


In [8]:
# Preview the first rows after the renaming and the month/year conversions.
filtered_df.head()

Unnamed: 0,year,month,tsunami_event_validity,earthquake_magnitude,country,location_name,latitude,longitude,tsunami_intensity,total_deaths,total_damage_millions,total_houses_destroyed,total_injuries
0,1500,July,-1.0,,JAPAN,TOKAIDO,,,,,,,
1,1500,,2.0,,USA,HAWAII,,,,,,,
2,1504,April,1.0,,PORTUGAL,LISBON,,,,,,,
3,1508,May,2.0,7.1,GREECE,ISLAND OF CRETE,35.0,25.5,3.0,,,,
4,1509,June,2.0,,CHINA,SOUTH CHINA SEA,31.5,121.5,0.0,,,,


In [9]:
# Persist the cleaned events to PROCESSED_FILE_PATH.
# Create the destination folder first: to_csv raises OSError when the parent
# directory does not exist, which would break a run on a fresh checkout.
Path(PROCESSED_FILE_PATH).parent.mkdir(parents=True, exist_ok=True)

# The default index=True is kept, so the 0..n-1 row index is written as an
# unnamed first column (read it back with index_col=0).
# NOTE(review): consider index=False if no consumer relies on that column.
filtered_df.to_csv(PROCESSED_FILE_PATH)