# NOAA Weather Data (Obtaining, Scrubbing, and Exploring)

## Obtaining the Data

In [1]:
#Importing libraries needed
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import pyplot
%matplotlib inline
import numpy as np
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

## Data Understanding 

NOAA Data Column Descriptions   
| Column Name | Description |
|---|---|
| NAME | Name of the station (usually the city/airport name). |
| LATITUDE | Latitude (decimal degrees; northern hemisphere values > 0, southern hemisphere values < 0) |
| LONGITUDE | Longitude (decimal degrees; western hemisphere values < 0, eastern hemisphere values > 0) |
| DATE | Date of the record: year (4 digits) followed by month (2 digits) and day (2 digits). |
| AWND | Average daily wind speed (meters per second) |
| WSF2 | Fastest 2-minute wind speed (meters per second) |
| WSF5 | Fastest 5-second wind speed (meters per second) |

In [2]:
#opening the datasets
#forward slashes keep these relative paths valid on Windows, macOS and Linux
#(a backslash is only treated as a path separator on Windows)
matthew = pd.read_csv('data/matthew.csv')
irma = pd.read_csv('data/irma.csv')
michael = pd.read_csv('data/michael.csv')
charley = pd.read_csv('data/charley.csv')
dennis = pd.read_csv('data/dennis.csv')

In [3]:
#previewing the first five rows of the Dennis data
dennis.head(n=5)

Unnamed: 0,NAME,LATITUDE,LONGITUDE,DATE,AWND,WSF2,WSF5
0,"FORT PIERCE, FL US",27.4419,-80.3508,7/10/2005,,,
1,"FORT PIERCE, FL US",27.4419,-80.3508,7/11/2005,,,
2,"FORT PIERCE ARC, FL US",27.4272,-80.4053,7/10/2005,,,
3,"FORT PIERCE ARC, FL US",27.4272,-80.4053,7/11/2005,,,
4,"BIG CYPRESS, FL US",26.3283,-80.9958,7/10/2005,,,


## Data Engineering 
We want to compare data from all 5 hurricanes against home values, so we are going to concatenate them into one dataframe.

### Concatenating the Hurricane Dataframes

In [4]:
#tagging every row with a code for the storm it belongs to,
#so the source hurricane is still known once the frames are combined
#codes: 1 = charley, 2 = dennis, 3 = matthew, 4 = irma, 5 = michael
#(the codes are stored as strings, not integers)
for storm_code, storm_frame in enumerate(
    (charley, dennis, matthew, irma, michael), start=1
):
    storm_frame['HurricaneName'] = str(storm_code)

In [5]:
#previewing the first five rows of the Charley data, now with its HurricaneName code
charley.head(n=5)

Unnamed: 0,NAME,LATITUDE,LONGITUDE,DATE,AWND,WSF2,WSF5,HurricaneName
0,"FORT PIERCE, FL US",27.4419,-80.3508,8/13/2004,,,,1
1,"FORT PIERCE, FL US",27.4419,-80.3508,8/14/2004,,,,1
2,"FORT PIERCE ARC, FL US",27.4272,-80.4053,8/13/2004,,,,1
3,"FORT PIERCE ARC, FL US",27.4272,-80.4053,8/14/2004,,,,1
4,"BIG CYPRESS, FL US",26.3283,-80.9958,8/13/2004,,,,1


In [6]:
#stacking the five hurricane dataframes into one
#ignore_index=True gives the combined frame a fresh 0..n-1 row index
hurricane = pd.concat(
    objs=[charley, dennis, matthew, irma, michael],
    ignore_index=True,
)
hurricane.head(n=5)

Unnamed: 0,NAME,LATITUDE,LONGITUDE,DATE,AWND,WSF2,WSF5,HurricaneName
0,"FORT PIERCE, FL US",27.4419,-80.3508,8/13/2004,,,,1
1,"FORT PIERCE, FL US",27.4419,-80.3508,8/14/2004,,,,1
2,"FORT PIERCE ARC, FL US",27.4272,-80.4053,8/13/2004,,,,1
3,"FORT PIERCE ARC, FL US",27.4272,-80.4053,8/14/2004,,,,1
4,"BIG CYPRESS, FL US",26.3283,-80.9958,8/13/2004,,,,1


In [7]:
#data types are object and float
#currently have 5571 entries
#AWND, WSF2 and WSF5 are each missing most of their values
#(only about 500 of the 5571 rows have a wind reading)
hurricane.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5571 entries, 0 to 5570
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   NAME           5571 non-null   object 
 1   LATITUDE       5571 non-null   float64
 2   LONGITUDE      5571 non-null   float64
 3   DATE           5571 non-null   object 
 4   AWND           501 non-null    float64
 5   WSF2           508 non-null    float64
 6   WSF5           500 non-null    float64
 7   HurricaneName  5571 non-null   object 
dtypes: float64(5), object(3)
memory usage: 348.3+ KB


### Scrubbing the Data

In [8]:
#counting the missing values in each column
#AWND is missing 5070 values (WSF2 and WSF5 are similarly sparse)
#plan: drop every row with a missing wind speed rather than imputing it,
#because a mean would not accurately reflect wind speed in cities hit by the hurricane
hurricane.isna().sum()

NAME                0
LATITUDE            0
LONGITUDE           0
DATE                0
AWND             5070
WSF2             5063
WSF5             5071
HurricaneName       0
dtype: int64

In [9]:
#removing, in place, every row that has at least one missing value
hurricane.dropna(axis=0, how='any', inplace=True)

In [10]:
#confirming no missing values remain (every count should be 0)
hurricane.isna().sum()

NAME             0
LATITUDE         0
LONGITUDE        0
DATE             0
AWND             0
WSF2             0
WSF5             0
HurricaneName    0
dtype: int64

### Data Engineering

#### Getting Coordinates

In [11]:
#engineering a COORD column of (latitude, longitude) tuples
#the geopy reverse lookup used later takes the coordinates as one pair
hurricane['COORD'] = [
    (lat, lon)
    for lat, lon in zip(hurricane['LATITUDE'], hurricane['LONGITUDE'])
]

#the separate latitude/longitude columns are no longer needed
#note: once they are dropped this cell cannot be re-run without rebuilding `hurricane`
hurricane.drop(columns=['LATITUDE', 'LONGITUDE'], inplace=True)

hurricane.head(n=5)

Unnamed: 0,NAME,DATE,AWND,WSF2,WSF5,HurricaneName,COORD
10,"JACKSONVILLE INTERNATIONAL AIRPORT, FL US",8/13/2004,5.59,21.0,28.0,1,"(30.49529, -81.69374)"
11,"JACKSONVILLE INTERNATIONAL AIRPORT, FL US",8/14/2004,6.49,19.9,23.0,1,"(30.49529, -81.69374)"
18,"CRESTVIEW FAA AP, FL US",8/13/2004,7.16,13.0,17.0,1,"(30.77715, -86.51938)"
19,"CRESTVIEW FAA AP, FL US",8/14/2004,5.59,13.0,16.1,1,"(30.77715, -86.51938)"
30,"MARIANNA MUNICIPAL AIRPORT, FL US",8/13/2004,6.93,13.0,16.1,1,"(30.83696, -85.18352)"


#### Using Geopy to Get Cities
Credit to: https://www.geeksforgeeks.org/get-the-city-state-and-country-names-from-latitude-and-longitude-using-python/

In order to join the hurricane dataframe to the housing dataframe we will need to know the city names. Using the coordinates provided by the NOAA dataset we can use geopy to reverse geolocate the city names. 

Because this relies on an external API, not all requests could be completed, and some city names had to be annotated by hand in Excel.

In [12]:
#importing libraries
from tkinter import *
from geopy.geocoders import Nominatim, Photon

#NOTE(review): none of the geocoding cells shown here use this tkinter window,
#and creating it presumably requires a graphical display -- confirm it is still needed

#opening a tkinter root window
win = Tk()

#setting the window geometry to "700x350"
win.geometry("700x350")

''

In [13]:
#creating a function 
def get_city(coords, geolocator=None):
    """Reverse-geocode a coordinate pair and return the name of its city.

    Parameters
    ----------
    coords : tuple
        ``(latitude, longitude)`` pair, as stored in the COORD column.
    geolocator : object, optional
        Any object exposing ``reverse(coords)``. When omitted, a new
        ``Nominatim(user_agent="MyApp")`` client is created for the call.
        Pass one in to reuse a single client across many lookups.

    Returns
    -------
    str
        The ``'city'`` entry of the returned address, or ``''`` when the
        lookup gives no result, the result has no address, or the address
        has no ``'city'`` entry.
    """
    #instantiate the Nominatim API unless the caller supplied a client
    if geolocator is None:
        geolocator = Nominatim(user_agent="MyApp")

    #get the location for the coordinates
    location = geolocator.reverse(coords)

    #reverse() yields None when nothing is found at the coordinates;
    #treat that as "no city" instead of failing on `.raw`
    if location is None:
        return ''

    #a result without address details is also treated as "no city"
    address = location.raw.get('address') or {}

    #return the city ('' when the address has no 'city' entry)
    return address.get('city', '')

In [14]:
#applying function to dataframe
#the same station coordinates repeat across rows, so each distinct pair is
#looked up once and the result reused -- far fewer requests to the geocoding API
city_by_coord = {coord: get_city(coord) for coord in set(hurricane['COORD'])}
hurricane['City'] = hurricane['COORD'].map(city_by_coord.get)

In [15]:
#checking for missing values after the city lookup
#note: get_city returns '' when no city is found, and an empty string
#is not counted as missing here, so a 0 for City does not mean every lookup succeeded
hurricane.isna().sum()

NAME             0
DATE             0
AWND             0
WSF2             0
WSF5             0
HurricaneName    0
COORD            0
City             0
dtype: int64

#### Saving the Dataframe 

In [17]:
#forward slash keeps the relative output path valid on Windows, macOS and Linux
hurricane.to_csv('data/hurricane.csv', index=False)

# Zillow top-tier and bottom-tier ZHVI 

Zillow publishes top-tier ZHVI ($, typical value for homes within the 65th to 95th percentile range for a given region) and bottom-tier ZHVI ($, typical value for homes within the 5th to 35th percentile range for a given region).

I am interested in looking at this data as well because often, during a hurricane, insurance companies force homeowners to upgrade their existing features. I would like to see how this impacts home value.

In [None]:
## Zillow Bottom Tier