# Bonus: Temperature Analysis I

In [1]:
import pandas as pd
from datetime import datetime as dt

In [2]:
# "tobs" is "temperature observations"
# Read the raw measurements CSV and preview its first five rows
df = pd.read_csv(filepath_or_buffer='../Resources/hawaii_measurements.csv')
df.head(n=5)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [3]:
# Parse the 'date' strings into datetime values, keeping the column in place
df = df.assign(date=pd.to_datetime(df['date']))

In [4]:
# Promote the 'date' column to be the row index of the frame
df = df.set_index(keys='date')

In [5]:
# Remove the 'station' column; it is not used in the analysis below
df = df.drop(columns=['station'])

In [6]:
# Preview the frame now that 'date' is the index and 'station' is gone
df.head(n=5)

Unnamed: 0_level_0,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-01,0.08,65
2010-01-02,0.0,63
2010-01-03,0.0,74
2010-01-04,0.0,76
2010-01-06,,73


### Compare June and December data across all years 

In [7]:
from scipy import stats

In [8]:
# Split the frame into June-only and December-only rows using the index month
june_df = df.loc[df.index.month == 6]
dec_df = df.loc[df.index.month == 12]

In [16]:
# Mean of every remaining column (prcp, tobs) per calendar year, June rows only
avg_jun = june_df.groupby(by=june_df.index.year).mean()
avg_jun

Unnamed: 0_level_0,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010,0.042241,74.92562
2011,0.240142,73.938326
2012,0.097062,74.0
2013,0.144195,74.599078
2014,0.124372,75.027907
2015,0.12516,74.990148
2016,0.212312,75.175258
2017,0.12,77.219895


In [17]:
# Mean of every remaining column (prcp, tobs) per calendar year, December rows only
avg_dec = dec_df.groupby(by=dec_df.index.year).mean()
avg_dec

Unnamed: 0_level_0,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010,0.459087,70.208511
2011,0.201581,70.820628
2012,0.089604,71.188073
2013,0.169014,71.094017
2014,0.188439,69.896861
2015,0.169506,73.423913
2016,0.199494,71.13


In [21]:
# Line up the December and June yearly means side by side on the year index.
# The outer join keeps years that appear in only one of the two frames;
# December columns get the '_x' suffix and June columns get '_y'.
all_temps = avg_dec.join(avg_jun, how='outer', lsuffix='_x', rsuffix='_y')
all_temps

Unnamed: 0_level_0,prcp_x,tobs_x,prcp_y,tobs_y
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010,0.459087,70.208511,0.042241,74.92562
2011,0.201581,70.820628,0.240142,73.938326
2012,0.089604,71.188073,0.097062,74.0
2013,0.169014,71.094017,0.144195,74.599078
2014,0.188439,69.896861,0.124372,75.027907
2015,0.169506,73.423913,0.12516,74.990148
2016,0.199494,71.13,0.212312,75.175258
2017,,,0.12,77.219895


In [24]:
# Discard both precipitation columns so that only the two tobs columns remain
all_temps_drop = all_temps.drop(columns=['prcp_x', 'prcp_y'])

In [26]:
# Give the tobs columns month-specific names:
# '_x' came from the December frame, '_y' from the June frame
all_temps_clen = all_temps_drop.rename(
    {'tobs_x': 'tobs_dec', 'tobs_y': 'tobs_jun'},
    axis='columns',
)
all_temps_clen

Unnamed: 0_level_0,tobs_dec,tobs_jun
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010,70.208511,74.92562
2011,70.820628,73.938326
2012,71.188073,74.0
2013,71.094017,74.599078
2014,69.896861,75.027907
2015,73.423913,74.990148
2016,71.13,75.175258
2017,,77.219895


* Since the June and December samples are independent of each other, we use the unpaired t-test

In [33]:
# Independent two-sample t-test on the December vs. June yearly mean temperatures;
# NaN entries are omitted from the calculation
unpair_t_test = stats.ttest_ind(
    all_temps_clen['tobs_dec'],
    all_temps_clen['tobs_jun'],
    nan_policy='omit',
)
p_value = unpair_t_test.pvalue
p_value

9.681543069110466e-06

### Analysis

Since our p-value of 9.681543069110466e-06 is very low (well below the conventional 0.05 threshold), we can reject the null hypothesis, which means that there is a statistically significant difference between the average temperature in June and December