# Bonus: Temperature Analysis I

In [2]:
import pandas as pd
from datetime import datetime as dt

In [3]:
# "tobs" is "temperature observations"
# Load the raw station measurements and preview the first five rows
df = pd.read_csv(filepath_or_buffer='Resources/hawaii_measurements.csv')
df.head(n=5)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [4]:
# Parse the "YYYY-MM-DD" strings in "date" into a new datetime column
df["date_obj"] = pd.to_datetime(arg=df["date"], format="%Y-%m-%d")
df.date_obj

0       2010-01-01
1       2010-01-02
2       2010-01-03
3       2010-01-04
4       2010-01-06
           ...    
19545   2017-08-19
19546   2017-08-20
19547   2017-08-21
19548   2017-08-22
19549   2017-08-23
Name: date_obj, Length: 19550, dtype: datetime64[ns]

In [5]:
# Promote the parsed datetime column to the row index (returns a new frame;
# `df` itself is left untouched)
df_reindex = df.set_index(keys="date_obj")
df_reindex

Unnamed: 0_level_0,station,date,prcp,tobs
date_obj,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-01,USC00519397,2010-01-01,0.08,65
2010-01-02,USC00519397,2010-01-02,0.00,63
2010-01-03,USC00519397,2010-01-03,0.00,74
2010-01-04,USC00519397,2010-01-04,0.00,76
2010-01-06,USC00519397,2010-01-06,,73
...,...,...,...,...
2017-08-19,USC00516128,2017-08-19,0.09,71
2017-08-20,USC00516128,2017-08-20,,78
2017-08-21,USC00516128,2017-08-21,0.56,76
2017-08-22,USC00516128,2017-08-22,0.50,76


In [6]:
# Drop the string "date" column; the datetime index now carries that information.
# The frame is rebuilt from `df` with a non-mutating drop so this cell is
# idempotent: an in-place drop raises KeyError when the cell is run a second
# time because "date" has already been removed.
df_reindex = df.set_index("date_obj").drop(columns="date")
df_reindex

Unnamed: 0_level_0,station,prcp,tobs
date_obj,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-02,USC00519397,0.00,63
2010-01-03,USC00519397,0.00,74
2010-01-04,USC00519397,0.00,76
2010-01-06,USC00519397,,73
...,...,...,...
2017-08-19,USC00516128,0.09,71
2017-08-20,USC00516128,,78
2017-08-21,USC00516128,0.56,76
2017-08-22,USC00516128,0.50,76


### Compare June and December data across all years 

In [7]:
from scipy import stats

In [21]:
# Keep only the June (6) and December (12) observations, across every year
df_2mo_only = df_reindex.loc[df_reindex.index.month.isin([6, 12])]
df_2mo_only

Unnamed: 0_level_0,station,prcp,tobs
date_obj,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-06-01,USC00519397,0.00,78
2010-06-02,USC00519397,0.01,76
2010-06-03,USC00519397,0.00,78
2010-06-04,USC00519397,0.00,76
2010-06-05,USC00519397,0.00,77
...,...,...,...
2017-06-26,USC00516128,0.02,79
2017-06-27,USC00516128,0.10,74
2017-06-28,USC00516128,0.02,74
2017-06-29,USC00516128,0.04,76


In [67]:
# Identify the average temperature for June
# `index.month` holds integers, so the month is matched against 6 (not the
# string "06"). A single month needs no groupby: take the column mean directly.
df_jun_only = df_2mo_only.loc[df_2mo_only.index.month == 6, ["tobs"]]
round(df_jun_only["tobs"].mean(), 2)

74.94

In [68]:
# Identify the average temperature for December
# `index.month` holds integers, so the month is matched against 12 (not the
# string "12"). A single month needs no groupby: take the column mean directly.
df_dec_only = df_2mo_only.loc[df_2mo_only.index.month == 12, ["tobs"]]
round(df_dec_only["tobs"].mean(), 2)

71.04

In [69]:
# Extract the June and December temperature observations as plain Python lists
jun_list, dec_list = (
    month_frame["tobs"].to_list() for month_frame in (df_jun_only, df_dec_only)
)


In [77]:
# Run an independent (unpaired) two-sample t-test comparing the June and
# December temperature observations; `ttest_ind` does not pair the samples.
t_stat, p_value = stats.ttest_ind(a=jun_list, b=dec_list)
print(f'T-stat: {t_stat}', f'P-value: {p_value}', sep="\n")

T-stat: 31.60372399000329
P-value: 3.9025129038616655e-191


### Analysis

In [80]:
# Summarize the t-test result. The months compared above are June and December.
print('With a p-value of near zero, the null hypothesis that the differences between the average temperatures between June and December is due to error, can be rejected. This implies that some specific factor has a direct effect on creating this difference (i.e. the season)')

With a p-value of near zero, the null hypothesis that the differences between the average temperatures between June and December is due to error, can be rejected. This implies that some specific factor has a direct effect on creating this difference (i.e. the season)
