# Bonus: Temperature Analysis I

In [1]:
import pandas as pd
from datetime import datetime as dt

In [2]:
# "tobs" is "temperature observations"
df = pd.read_csv('hawaii_measurements.csv')
df.head()

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [3]:
# Convert the date column format from string to datetime
df['date'] = pd.to_datetime(df['date'])

In [4]:
# Set the date column as the DataFrame index
df.set_index('date')

Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-02,USC00519397,0.00,63
2010-01-03,USC00519397,0.00,74
2010-01-04,USC00519397,0.00,76
2010-01-06,USC00519397,,73
...,...,...,...
2017-08-19,USC00516128,0.09,71
2017-08-20,USC00516128,,78
2017-08-21,USC00516128,0.56,76
2017-08-22,USC00516128,0.50,76


In [5]:
# Drop the date column
df.drop(['date'], axis=1)

Unnamed: 0,station,prcp,tobs
0,USC00519397,0.08,65
1,USC00519397,0.00,63
2,USC00519397,0.00,74
3,USC00519397,0.00,76
4,USC00519397,,73
...,...,...,...
19545,USC00516128,0.09,71
19546,USC00516128,,78
19547,USC00516128,0.56,76
19548,USC00516128,0.50,76


### Compare June and December data across all years 

In [6]:
from scipy import stats

In [7]:
# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect

# create engine to hawaii.sqlite
engine = create_engine("sqlite:///hawaii.sqlite")

# reflect an existing database into a new model
Base = automap_base()

# reflect the tables
Base.prepare(engine, reflect=True)

# View all of the classes that automap found
Base.classes.keys()

# Save references to each table
Measurement = Base.classes.measurement
Station = Base.classes.station

# Create our session (link) from Python to the DB
session = Session(engine)

In [8]:
# Filter data for desired months
june_temps = session.query(Measurement.tobs).filter(func.strftime("%m", Measurement.date) == "06").all()
june_temps

dec_temps = session.query(Measurement.tobs).filter(func.strftime("%m", Measurement.date) == "12").all()
dec_temps

desired_months_data_df = pd.DataFrame(list(zip(june_temps, dec_temps)),
              columns=['june_temps','dec_temps'])
desired_months_data_df

Unnamed: 0,june_temps,dec_temps
0,(78.0),(76.0)
1,(76.0),(74.0)
2,(78.0),(74.0)
3,(76.0),(64.0)
4,(77.0),(64.0)
...,...,...
1512,(71.0),(71.0)
1513,(73.0),(71.0)
1514,(72.0),(69.0)
1515,(74.0),(65.0)


In [9]:
# Identify the average temperature for June

june_temps = session.query(Measurement.tobs).\
        filter(Measurement.date.like("%-06-%")).all()
june_avg_temp = session.query(func.avg(Measurement.tobs)).\
        filter(Measurement.date.like("%-06-%")).scalar()
print(f"Avg June temp for all stations and all years is: {round(june_avg_temp,2)} F")

Avg June temp for all stations and all years is: 74.94 F


In [10]:
# Identify the average temperature for December

dec_temps = session.query(Measurement.tobs).\
        filter(Measurement.date.like("%-12-%")).all()
dec_avg_temp = session.query(func.avg(Measurement.tobs)).\
        filter(Measurement.date.like("%-12-%")).scalar()
print(f"Avg December temp for all stations and all years is: {round(dec_avg_temp,2)} F")

Avg December temp for all stations and all years is: 71.04 F


In [11]:
# Create collections of temperature data

In [12]:
# Run unpaired t-test
t_test_up = stats.ttest_ind(june_temps,dec_temps, equal_var = False)
t_test_up

Ttest_indResult(statistic=array([31.35503692]), pvalue=array([4.19352984e-187]))

In [13]:
# Close Session
session.close()

### Analysis

Unpaired t-test is used because there are two independent samples. Since the p-value is small and less than 0.05, the difference in means is statistically significant.