# ETL Project

In [30]:
# imports
import pandas as pd
import numpy as np
import requests
from sqlalchemy import create_engine


In [2]:
## Chris Extract/Transform below


In [3]:
## Stephanie Extract/Transform below

# Extract CSVs into DataFrames
    ### AtlanticStorms from https://www.kaggle.com/noaa/hurricane-database
        #### Each date has up to 5 observations per day (but not all days have 5)
        #### Older data appears to use -999 for wind pressure and speed instead of a missing-value marker such as NA
        #### ID: AL = Atlantic, XX = number storm for year, YYYY = year
    ### PacificStorms from https://www.kaggle.com/noaa/hurricane-database
        #### Each date has up to 5 observations per day (but not all days have 5)
        #### Older data appears to use -999 for wind pressure and speed instead of a missing-value marker such as NA
        #### ID: EP = Pacific, XX = number storm for year, YYYY = year
    ### 

In [31]:
# Extract: read the Atlantic storm observations from the CSV in the Data folder

AtlanticStorm = "Data/Atlantic_Storms.csv"
AtlanticStorm_df = pd.read_csv(filepath_or_buffer=AtlanticStorm)
AtlanticStorm_df.head(n=5)

Unnamed: 0,ID,Name,Date,Time,Event,Status,Latitude,Longitude,Maximum Wind,Minimum Pressure,...,Low Wind SW,Low Wind NW,Moderate Wind NE,Moderate Wind SE,Moderate Wind SW,Moderate Wind NW,High Wind NE,High Wind SE,High Wind SW,High Wind NW
0,AL011851,UNNAMED,18510625,0,,HU,28.0N,94.8W,80,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
1,AL011851,UNNAMED,18510625,600,,HU,28.0N,95.4W,80,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
2,AL011851,UNNAMED,18510625,1200,,HU,28.0N,96.0W,80,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
3,AL011851,UNNAMED,18510625,1800,,HU,28.1N,96.5W,80,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
4,AL011851,UNNAMED,18510625,2100,L,HU,28.2N,96.8W,80,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999


In [32]:
# Extract: read the Pacific storm observations from the CSV in the Data folder

PacificStorm = "Data/Pacific_Storms.csv"
PacificStorm_df = pd.read_csv(filepath_or_buffer=PacificStorm)
PacificStorm_df.head(n=5)

Unnamed: 0,ID,Name,Date,Time,Event,Status,Latitude,Longitude,Maximum Wind,Minimum Pressure,...,Low Wind SW,Low Wind NW,Moderate Wind NE,Moderate Wind SE,Moderate Wind SW,Moderate Wind NW,High Wind NE,High Wind SE,High Wind SW,High Wind NW
0,EP011949,UNNAMED,19490611,0,,TS,20.2N,106.3W,45,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
1,EP011949,UNNAMED,19490611,600,,TS,20.2N,106.4W,45,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
2,EP011949,UNNAMED,19490611,1200,,TS,20.2N,106.7W,45,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
3,EP011949,UNNAMED,19490611,1800,,TS,20.3N,107.7W,45,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
4,EP011949,UNNAMED,19490612,0,,TS,20.4N,108.6W,45,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999


In [33]:
# Transform: stack the Atlantic rows followed by the Pacific rows into a single
# frame, renumbering the row index from zero so it is continuous across both sources

AtlPacStorms = [AtlanticStorm_df, PacificStorm_df]
AtlPacStorms_df = pd.concat(AtlPacStorms, ignore_index=True)
AtlPacStorms_df.head()

Unnamed: 0,ID,Name,Date,Time,Event,Status,Latitude,Longitude,Maximum Wind,Minimum Pressure,...,Low Wind SW,Low Wind NW,Moderate Wind NE,Moderate Wind SE,Moderate Wind SW,Moderate Wind NW,High Wind NE,High Wind SE,High Wind SW,High Wind NW
0,AL011851,UNNAMED,18510625,0,,HU,28.0N,94.8W,80,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
1,AL011851,UNNAMED,18510625,600,,HU,28.0N,95.4W,80,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
2,AL011851,UNNAMED,18510625,1200,,HU,28.0N,96.0W,80,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
3,AL011851,UNNAMED,18510625,1800,,HU,28.1N,96.5W,80,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
4,AL011851,UNNAMED,18510625,2100,L,HU,28.2N,96.8W,80,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999


In [34]:
# Sanity check: rows for a known Pacific storm ID (EP011949) must be present
# in the combined frame

is_sample_pacific_storm = AtlPacStorms_df["ID"] == "EP011949"
AtlPacStorms_df[is_sample_pacific_storm]

Unnamed: 0,ID,Name,Date,Time,Event,Status,Latitude,Longitude,Maximum Wind,Minimum Pressure,...,Low Wind SW,Low Wind NW,Moderate Wind NE,Moderate Wind SE,Moderate Wind SW,Moderate Wind NW,High Wind NE,High Wind SE,High Wind SW,High Wind NW
49105,EP011949,UNNAMED,19490611,0,,TS,20.2N,106.3W,45,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
49106,EP011949,UNNAMED,19490611,600,,TS,20.2N,106.4W,45,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
49107,EP011949,UNNAMED,19490611,1200,,TS,20.2N,106.7W,45,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
49108,EP011949,UNNAMED,19490611,1800,,TS,20.3N,107.7W,45,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
49109,EP011949,UNNAMED,19490612,0,,TS,20.4N,108.6W,45,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
49110,EP011949,UNNAMED,19490612,600,,TS,20.5N,109.4W,45,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
49111,EP011949,UNNAMED,19490612,1200,,TS,20.6N,110.2W,45,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999


In [35]:
# Inspect the dtype of every column in the combined frame before transforming it
AtlPacStorms_df.dtypes

ID                  object
Name                object
Date                 int64
Time                 int64
Event               object
Status              object
Latitude            object
Longitude           object
Maximum Wind         int64
Minimum Pressure     int64
Low Wind NE          int64
Low Wind SE          int64
Low Wind SW          int64
Low Wind NW          int64
Moderate Wind NE     int64
Moderate Wind SE     int64
Moderate Wind SW     int64
Moderate Wind NW     int64
High Wind NE         int64
High Wind SE         int64
High Wind SW         int64
High Wind NW         int64
dtype: object

In [36]:
# Split the integer Date column (YYYYMMDD) into text Year / Month / Day columns

# Text form of every date, held in a local Series instead of a scratch column
date_text = AtlPacStorms_df['Date'].apply(str)

# Fixed-width slices of the text: YYYY | MM | DD
AtlPacStorms_df['Year'] = date_text.str[:4]
AtlPacStorms_df['Month'] = date_text.str[4:6]
AtlPacStorms_df['Day'] = date_text.str[6:8]

# Preview the frame with the three new columns appended
AtlPacStorms_df.head()

Unnamed: 0,ID,Name,Date,Time,Event,Status,Latitude,Longitude,Maximum Wind,Minimum Pressure,...,Moderate Wind SE,Moderate Wind SW,Moderate Wind NW,High Wind NE,High Wind SE,High Wind SW,High Wind NW,Year,Month,Day
0,AL011851,UNNAMED,18510625,0,,HU,28.0N,94.8W,80,-999,...,-999,-999,-999,-999,-999,-999,-999,1851,6,25
1,AL011851,UNNAMED,18510625,600,,HU,28.0N,95.4W,80,-999,...,-999,-999,-999,-999,-999,-999,-999,1851,6,25
2,AL011851,UNNAMED,18510625,1200,,HU,28.0N,96.0W,80,-999,...,-999,-999,-999,-999,-999,-999,-999,1851,6,25
3,AL011851,UNNAMED,18510625,1800,,HU,28.1N,96.5W,80,-999,...,-999,-999,-999,-999,-999,-999,-999,1851,6,25
4,AL011851,UNNAMED,18510625,2100,L,HU,28.2N,96.8W,80,-999,...,-999,-999,-999,-999,-999,-999,-999,1851,6,25


In [37]:
# Narrow the frame to the date parts, storm ID, status, time and position

columns_to_keep = ["Year", "Month", "Day", "ID", "Status", "Time", "Latitude", "Longitude"]
AtlPacStorms_df = AtlPacStorms_df[columns_to_keep]
AtlPacStorms_df.head()


Unnamed: 0,Year,Month,Day,ID,Status,Time,Latitude,Longitude
0,1851,6,25,AL011851,HU,0,28.0N,94.8W
1,1851,6,25,AL011851,HU,600,28.0N,95.4W
2,1851,6,25,AL011851,HU,1200,28.0N,96.0W
3,1851,6,25,AL011851,HU,1800,28.1N,96.5W
4,1851,6,25,AL011851,HU,2100,28.2N,96.8W


In [59]:
# Cast Status to str via DataFrame.astype, which returns a new frame.
# Assigning into a single column of a frame produced by a column subset
# can trigger pandas' SettingWithCopyWarning; rebinding the name avoids that.
AtlPacStorms_df = AtlPacStorms_df.astype({"Status": str})
AtlPacStorms_df.dtypes

Year         object
Month        object
Day          object
ID           object
Status       object
Time          int64
Latitude     object
Longitude    object
dtype: object

In [64]:
# We are only interested in hurricanes, so keep only rows whose Status is HU.
# Status values in the raw data appear to carry a leading space (" HU"), so
# strip surrounding whitespace before comparing; this matches " HU" and "HU" alike.

is_hurricane = AtlPacStorms_df["Status"].str.strip() == "HU"
AtlPacStorms_df = AtlPacStorms_df.loc[is_hurricane]
AtlPacStorms_df.head()

Unnamed: 0,Year,Month,Day,ID,Status,Time,Latitude,Longitude
0,1851,6,25,AL011851,HU,0,28.0N,94.8W
1,1851,6,25,AL011851,HU,600,28.0N,95.4W
2,1851,6,25,AL011851,HU,1200,28.0N,96.0W
3,1851,6,25,AL011851,HU,1800,28.1N,96.5W
4,1851,6,25,AL011851,HU,2100,28.2N,96.8W


In [69]:
# Keep only the first remaining observation of each unique storm ID.
# The ID column is referenced by name rather than by position so the
# de-duplication key does not silently change if the column order does.

Hurricane_df = AtlPacStorms_df.drop_duplicates(subset=["ID"], keep="first")
Hurricane_df.head()

Unnamed: 0,Year,Month,Day,ID,Status,Time,Latitude,Longitude
0,1851,6,25,AL011851,HU,0,28.0N,94.8W
14,1851,7,5,AL021851,HU,1200,22.2N,97.6W
22,1851,8,17,AL041851,HU,1200,15.9N,58.5W
102,1852,8,20,AL011852,HU,0,21.2N,70.6W
143,1852,9,5,AL021852,HU,0,17.0N,64.1W


In [74]:
# Discard Status and Time by keeping only the date, ID and position columns,
# then renumber the rows from zero

hurricane_columns = ["Year", "Month", "Day", "ID", "Latitude", "Longitude"]
Hurricane_df = Hurricane_df[hurricane_columns].reset_index(drop=True)
Hurricane_df.head()


Unnamed: 0,Year,Month,Day,ID,Latitude,Longitude
0,1851,6,25,AL011851,28.0N,94.8W
1,1851,7,5,AL021851,22.2N,97.6W
2,1851,8,17,AL041851,15.9N,58.5W
3,1852,8,20,AL011852,21.2N,70.6W
4,1852,9,5,AL021852,17.0N,64.1W


In [4]:
## Chris Load below

In [5]:
## Stephanie Load below