## Import the MN Counties and create reporting tables with calculations by county and aggregate rollup

In [234]:
# Dependencies and Setup
import pandas as pd


pd.options.display.max_columns = None


from sqlalchemy import create_engine
from sqlalchemy import inspect
import psycopg2
from config import db_password 

### Connect to DB, create county table if it doesn't exist, load DFs with Observation & Station data

In [235]:
# Connect to PostgreSQL movie_data DB
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/last_freeze_analysis"

# Create the database engine with the following line 
engine = create_engine(db_string)

In [236]:
# Check if the table exists.  If it doesn't create the county table
hasCountyTable = inspect(engine).has_table("county_lookup")
if  hasCountyTable == False:
    # Load County Files
    county_file = "../Resources/mn_county_ref.csv"
    # Read the County Data and store into a Pandas DataFrame
    county_list_df = pd.read_csv(county_file)
    county_list_df.to_sql(name="county_lookup", con=engine, index=False)
else:
    county_list_df = pd.read_sql("SELECT * FROM county_lookup",
                      con=engine)   



In [237]:
# pull in observations table with county name joined data using sql query

observations_df = pd.read_sql(
    "SELECT c.county_name, s.county, o.* \
        FROM observation o \
        INNER JOIN station s \
        ON s.station_uid = o.station_uid  \
        INNER JOIN county_lookup c \
        ON c.county_code = s.county",
    con=engine
)


In [238]:
county_df = pd.read_sql(
    "SELECT c.county_name, s.county, count(distinct(o.station_uid)) as station_count \
        FROM observation o \
        INNER JOIN station s \
        ON s.station_uid = o.station_uid  \
        INNER JOIN county_lookup c \
        ON c.county_code = s.county \
        GROUP BY county_name, s.county",
    con=engine)


In [239]:
station_yearly_metrics_df = pd.read_sql(
    "SELECT station_uid, obs_year, last_freeze_dayofyear, last_freeze_date \
        FROM station_yearly s ",
    con=engine)

station_yearly_metrics_df.head(10)

Unnamed: 0,station_uid,obs_year,last_freeze_dayofyear,last_freeze_date
0,10395,2002,125.0,2002-05-05
1,10395,2003,112.0,2003-04-22
2,10395,2004,136.0,2004-05-15
3,10395,2005,123.0,2005-05-03
4,10395,2006,99.0,2006-04-09
5,10395,2007,105.0,2007-04-15
6,10395,2008,121.0,2008-04-30
7,10395,2009,102.0,2009-04-12
8,10395,2010,130.0,2010-05-10
9,10395,2011,123.0,2011-05-03


## Create the county DF and Rollup DF with the calculations

### County Yearly

In [240]:
# create a dataframe to store yearly summary info by county, we'll use years_df for the aggregate rollup dataset
years = pd.to_datetime(observations_df['date']).dt.year.unique()
years_df = pd.DataFrame(years,columns=['obs_year'])

# County
county_yearly_metrics_df = pd.merge(county_df, years_df, how='cross')
county_yearly_metrics_df = county_yearly_metrics_df.set_index(['county','obs_year'])


In [241]:
# get the last frost date of each county for each year of data
last_freeze_df = observations_df.loc[(observations_df['freeze_day']==1)  & (observations_df['obs_dayofyear'] < 180),['county','date','obs_year','obs_dayofyear'] ]. \
        groupby(["county","obs_year"])[['date','obs_dayofyear']].max().rename(columns={'date':'last_freeze_date','obs_dayofyear':'last_freeze_dayofyear'})

# get the first freeze in the fall
first_freeze_df = observations_df.loc[(observations_df['freeze_day']==1)  & (observations_df['obs_dayofyear'] >= 180),['county','date','obs_year','obs_dayofyear'] ]. \
        groupby(["county","obs_year"])[['date','obs_dayofyear']].min().rename(columns={'date':'first_freeze_date','obs_dayofyear':'first_freeze_dayofyear'})

# Count to Determine if we have a complete set of observations for april to may for each county
april_to_may_days_recorderd_df = pd.DataFrame(observations_df.loc[(observations_df['obs_month']>=4 )&(observations_df['obs_month'] <= 6),['county','obs_year','mint']]\
        .groupby(['county','obs_year'])['mint'].count()).rename(columns={'mint':'observations_recorded_april_to_may'})

In [242]:
# # Get the coldest day of the year
coldest_day_of_year = observations_df.groupby(["county","obs_year"])[['mint']].min().rename(columns={'mint':'coldest_day'})
coldest_day_df = pd.merge(coldest_day_of_year, observations_df, how='left', left_on=['county', 'obs_year','coldest_day'], right_on = ["county","obs_year","mint"])
coldest_day_of_year_df = pd.DataFrame(coldest_day_df.groupby(["county","obs_year",'coldest_day'])['obs_dayofyear'].max())
coldest_day_of_year_df.rename(columns={'obs_dayofyear':'coldest_dayofyear'},inplace=True)
coldest_day_of_year_df = coldest_day_of_year_df.reset_index()
coldest_day_of_year_df = coldest_day_of_year_df.set_index(keys=['county','obs_year'])
county_yearly_metrics_df = pd.merge(county_yearly_metrics_df, coldest_day_of_year_df, how='left', left_index=True, right_index =True)

In [243]:
# Get the hottest day of the year, if there are multiple days with the temperature, use the latest one in the year (the one closest to the next last freeze date the next spring)
hottest_day_of_year = observations_df.groupby(["county","obs_year"])[['maxt']].max().rename(columns={'maxt':'hottest_day'})
hottest_day_df = pd.merge(hottest_day_of_year, observations_df, how='left', left_on=['county', 'obs_year','hottest_day'], right_on = ["county","obs_year","maxt"])
hottest_day_of_year_df = pd.DataFrame(hottest_day_df.groupby(["county","obs_year",'hottest_day'])['obs_dayofyear'].max())
hottest_day_of_year_df.rename(columns={'obs_dayofyear':'hottest_dayofyear'},inplace=True)
hottest_day_of_year_df = hottest_day_of_year_df.reset_index()
hottest_day_of_year_df = hottest_day_of_year_df.set_index(keys=['county','obs_year'])
county_yearly_metrics_df = pd.merge(county_yearly_metrics_df, hottest_day_of_year_df, how='left', left_index=True, right_index =True)
#hottest_day_of_year_df.head()

In [244]:
# merge all the yearly data 
county_yearly_metrics_df = pd.merge(county_yearly_metrics_df, last_freeze_df, how='left', left_on=['county','obs_year'], right_index=True) #,  left_on=['station_uid', 'year'], right_on = ["station_uid","obs_year"])
county_yearly_metrics_df = pd.merge(county_yearly_metrics_df, first_freeze_df, how='left', left_on=['county','obs_year'], right_index=True) # ,  left_on=['station_uid', 'year'], right_on = ["station_uid","obs_year"])
county_yearly_metrics_df = pd.merge(county_yearly_metrics_df, april_to_may_days_recorderd_df, how='left', left_on=['county','obs_year'], right_index=True) # ,  left_on=['station_uid', 'year'], right_on = ["station_uid","obs_year"])

In [245]:
county_yearly_metrics_df.head(100)

Unnamed: 0_level_0,Unnamed: 1_level_0,county_name,station_count,coldest_day,coldest_dayofyear,hottest_day,hottest_dayofyear,last_freeze_date,last_freeze_dayofyear,first_freeze_date,first_freeze_dayofyear,observations_recorded_april_to_may
county,obs_year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
27003,2002,Anoka County,4,-8.0,62.0,92.0,181.0,2002-05-20,140.0,2002-10-07,280.0,89.0
27003,2003,Anoka County,4,-19.0,38.0,96.0,236.0,2003-04-22,112.0,2003-09-29,272.0,104.0
27003,2004,Anoka County,4,-29.0,30.0,96.0,203.0,2004-05-15,136.0,2004-09-07,251.0,181.0
27003,2005,Anoka County,4,-22.0,17.0,96.0,174.0,2005-05-16,136.0,2005-10-07,280.0,182.0
27003,2006,Anoka County,4,-17.0,49.0,100.0,212.0,2006-05-22,142.0,2006-10-05,278.0,92.0
...,...,...,...,...,...,...,...,...,...,...,...,...
27123,2013,Ramsey County,5,-18.0,364.0,97.0,238.0,2013-05-23,143.0,2013-10-20,293.0,364.0
27123,2014,Ramsey County,5,-24.0,6.0,91.0,188.0,2014-04-25,115.0,2014-10-09,282.0,364.0
27123,2015,Ramsey County,5,-17.0,13.0,94.0,226.0,2015-04-24,114.0,2015-10-16,289.0,364.0
27123,2016,Ramsey County,5,-20.0,354.0,96.0,204.0,2016-05-15,136.0,2016-10-24,298.0,364.0


In [246]:
# Move our data columns out of the index
county_yearly_metrics_df.reset_index(inplace=True)

## Start of County Metrics

In [247]:
# Determine the mean/average last freeze date for a station 
avg_last_freeze_df = pd.DataFrame(county_yearly_metrics_df.groupby(['county'])['last_freeze_dayofyear'].mean().round(0)).rename(columns={'last_freeze_dayofyear':'avg_last_freeze_dayofyear'})

# Convert the day of year to a string value for mm/dd
avg_last_freeze_df["avg_last_freeze_mm_dd"] = pd.to_datetime(avg_last_freeze_df["avg_last_freeze_dayofyear"],format='%j').dt.strftime('%m/%d')

# determine the mean, get the string value
median_last_freeze_df = pd.DataFrame(county_yearly_metrics_df.groupby(['county'])['last_freeze_dayofyear'].median().round(0)).rename(columns={'last_freeze_dayofyear':'median_last_freeze_dayofyear'})
median_last_freeze_df["median_last_freeze_mm_dd"] = pd.to_datetime(median_last_freeze_df["median_last_freeze_dayofyear"],format='%j').dt.strftime('%m/%d')

# Merge the values into a single table
county_metrics_df = pd.merge(county_df, avg_last_freeze_df, left_on=['county'], right_on = ['county'])
county_metrics_df = pd.merge(county_metrics_df, median_last_freeze_df, left_on=['county'], right_on = ['county'])

In [248]:
# merge the station metrics and station/yearly DF to determine metrics for each station
merged_county_and_yearly_df = pd.merge(county_yearly_metrics_df, county_metrics_df, how="left", on=["county", "county"])

In [249]:
merged_county_and_yearly_df.head()

Unnamed: 0,county,obs_year,county_name_x,station_count_x,coldest_day,coldest_dayofyear,hottest_day,hottest_dayofyear,last_freeze_date,last_freeze_dayofyear,first_freeze_date,first_freeze_dayofyear,observations_recorded_april_to_may,county_name_y,station_count_y,avg_last_freeze_dayofyear,avg_last_freeze_mm_dd,median_last_freeze_dayofyear,median_last_freeze_mm_dd
0,27003,2002,Anoka County,4,-8.0,62.0,92.0,181.0,2002-05-20,140.0,2002-10-07,280.0,89.0,Anoka County,4,131.0,05/11,136.0,05/16
1,27003,2003,Anoka County,4,-19.0,38.0,96.0,236.0,2003-04-22,112.0,2003-09-29,272.0,104.0,Anoka County,4,131.0,05/11,136.0,05/16
2,27003,2004,Anoka County,4,-29.0,30.0,96.0,203.0,2004-05-15,136.0,2004-09-07,251.0,181.0,Anoka County,4,131.0,05/11,136.0,05/16
3,27003,2005,Anoka County,4,-22.0,17.0,96.0,174.0,2005-05-16,136.0,2005-10-07,280.0,182.0,Anoka County,4,131.0,05/11,136.0,05/16
4,27003,2006,Anoka County,4,-17.0,49.0,100.0,212.0,2006-05-22,142.0,2006-10-05,278.0,92.0,Anoka County,4,131.0,05/11,136.0,05/16


In [250]:
# Get a count of how many years the station is in the dataset
county_years_count = merged_county_and_yearly_df.groupby("county").count()["obs_year"]

county_years_count.head()

county
27003    21
27019    21
27037    21
27053    21
27123    21
Name: obs_year, dtype: int64

In [251]:
# Calculate the number of years where the last freeze was before or on the average date
county_count_at_or_before_avg_last_freeze = merged_county_and_yearly_df[(merged_county_and_yearly_df["last_freeze_dayofyear"] <= merged_county_and_yearly_df['avg_last_freeze_dayofyear'])]
county_count_at_or_before_avg_last_freeze = county_count_at_or_before_avg_last_freeze.groupby("county").count()["obs_year"]

In [252]:
# Calculate the number of years where the last freeze was after the average date
county_count_later_than_avg_last_freeze = merged_county_and_yearly_df[(merged_county_and_yearly_df["last_freeze_dayofyear"] > merged_county_and_yearly_df['avg_last_freeze_dayofyear'])]
county_count_later_than_avg_last_freeze = county_count_later_than_avg_last_freeze.groupby("county").count()["obs_year"]

In [253]:
county_calc_values_df = pd.DataFrame(
          {"years_included": county_years_count,
          "count_at_or_before_avg_last_freeze": county_count_at_or_before_avg_last_freeze, 
          "count_later_than_avg_last_freeze": county_count_later_than_avg_last_freeze})
county_calc_values_df.head(20)

Unnamed: 0_level_0,years_included,count_at_or_before_avg_last_freeze,count_later_than_avg_last_freeze
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
27003,21,9,12
27019,21,10,11
27037,21,11,10
27053,21,12,9
27123,21,12,9
27139,21,7,13
27163,21,9,12


In [254]:
county_metrics_full_df = pd.merge(county_metrics_df, county_calc_values_df, how="left", on=["county", "county"])

In [255]:
county_metrics_full_df.head(20)

Unnamed: 0,county_name,county,station_count,avg_last_freeze_dayofyear,avg_last_freeze_mm_dd,median_last_freeze_dayofyear,median_last_freeze_mm_dd,years_included,count_at_or_before_avg_last_freeze,count_later_than_avg_last_freeze
0,Anoka County,27003,4,131.0,05/11,136.0,05/16,21,9,12
1,Carver County,27019,4,122.0,05/02,123.0,05/03,21,10,11
2,Dakota County,27037,4,123.0,05/03,123.0,05/03,21,11,10
3,Hennepin County,27053,7,124.0,05/04,121.0,05/01,21,12,9
4,Ramsey County,27123,5,120.0,04/30,118.0,04/28,21,12,9
5,Scott County,27139,1,118.0,04/28,122.0,05/02,21,7,13
6,Washington County,27163,2,114.0,04/24,118.0,04/28,21,9,12


## Start of Aggregate Rollup

### Yearly

In [256]:
agg_years_df = years_df.set_index(['obs_year'])

# get the last frost date of each county for each year of data
last_freeze_df = merged_county_and_yearly_df.groupby(["obs_year"])[['last_freeze_date','last_freeze_dayofyear']].max()

# # get the first freeze in the fall
first_freeze_df = merged_county_and_yearly_df.groupby(["obs_year"])[['first_freeze_date','first_freeze_dayofyear']].min()

# # Count to Determine if we have a complete set of observations for april to may for each county
april_to_may_days_recorderd_df = merged_county_and_yearly_df.groupby(['obs_year'])[['observations_recorded_april_to_may']].sum()

In [257]:
# # Get the coldest day of the year
coldest_day_of_year = merged_county_and_yearly_df.groupby(["obs_year"])[['coldest_day']].min()
coldest_day_df = pd.merge(coldest_day_of_year, merged_county_and_yearly_df, how='left', left_on=['obs_year','coldest_day'], right_on = ["obs_year","coldest_day"])
coldest_day_of_year_df = pd.DataFrame(coldest_day_df.groupby(["obs_year"])['coldest_dayofyear'].max())
coldest_day_of_year_df = coldest_day_of_year_df.reset_index()
coldest_day_of_year_df = coldest_day_of_year_df.set_index(keys=['obs_year'])
agg_years_df = pd.merge(agg_years_df, coldest_day_of_year_df, how='left', left_index=True, right_index =True)

agg_years_df.head(30)

Unnamed: 0_level_0,coldest_dayofyear
obs_year,Unnamed: 1_level_1
2002,35.0
2003,38.0
2004,35.0
2005,353.0
2006,21.0
2007,36.0
2008,358.0
2009,16.0
2010,4.0
2011,21.0


In [258]:
# # Get the hottest day of the year
hottest_day_of_year = merged_county_and_yearly_df.groupby(["obs_year"])[['hottest_day']].max()
hottest_day_df = pd.merge(hottest_day_of_year, merged_county_and_yearly_df, how='left', left_on=['obs_year','hottest_day'], right_on = ["obs_year","hottest_day"])
hottest_day_of_year_df = pd.DataFrame(coldest_day_df.groupby(["obs_year"])['hottest_dayofyear'].min())
hottest_day_of_year_df = hottest_day_of_year_df.reset_index()
hottest_day_of_year_df = hottest_day_of_year_df.set_index(keys=['obs_year'])
agg_years_df = pd.merge(agg_years_df, hottest_day_of_year_df, how='left', left_index=True, right_index =True)

agg_years_df.head(30)


Unnamed: 0_level_0,coldest_dayofyear,hottest_dayofyear
obs_year,Unnamed: 1_level_1,Unnamed: 2_level_1
2002,35.0,182.0
2003,38.0,237.0
2004,35.0,203.0
2005,353.0,174.0
2006,21.0,212.0
2007,36.0,207.0
2008,358.0,194.0
2009,16.0,140.0
2010,4.0,144.0
2011,21.0,159.0


In [259]:
# merge all the yearly data 
agg_years_df = pd.merge(agg_years_df, last_freeze_df, how='left', left_on=['obs_year'], right_index=True) #,  left_on=['station_uid', 'year'], right_on = ["station_uid","obs_year"])
agg_years_df = pd.merge(agg_years_df, first_freeze_df, how='left', left_on=['obs_year'], right_index=True) # ,  left_on=['station_uid', 'year'], right_on = ["station_uid","obs_year"])
agg_years_df = pd.merge(agg_years_df, april_to_may_days_recorderd_df, how='left', left_on=['obs_year'], right_index=True) # ,  left_on=['station_uid', 'year'], right_on = ["station_uid","obs_year"])

In [260]:
agg_years_df.head(100)

Unnamed: 0_level_0,coldest_dayofyear,hottest_dayofyear,last_freeze_date,last_freeze_dayofyear,first_freeze_date,first_freeze_dayofyear,observations_recorded_april_to_may
obs_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2002,35.0,182.0,2002-05-20,140.0,2002-09-24,267.0,1463.0
2003,38.0,237.0,2003-04-22,112.0,2003-09-25,268.0,1653.0
2004,35.0,203.0,2004-05-15,136.0,2004-09-07,251.0,1606.0
2005,353.0,174.0,2005-05-16,136.0,2005-10-07,280.0,1539.0
2006,21.0,212.0,2006-05-22,142.0,2006-10-05,278.0,1483.0
2007,36.0,207.0,2007-04-19,109.0,2007-09-12,255.0,1648.0
2008,358.0,194.0,2008-05-22,143.0,2008-10-01,275.0,1678.0
2009,16.0,140.0,2009-05-18,138.0,2009-09-29,272.0,1658.0
2010,4.0,144.0,2010-05-10,130.0,2010-09-26,269.0,1704.0
2011,21.0,159.0,2011-05-05,125.0,2011-09-15,258.0,1630.0


In [261]:
# Determine the mean/average last freeze date for a station 
avg_last_freeze_df = pd.DataFrame(station_yearly_metrics_df.groupby(['obs_year'])['last_freeze_dayofyear'].mean().round(0))\
                        .rename(columns={'last_freeze_dayofyear':'avg_last_freeze_dayofyear'})

# Convert the day of year
avg_last_freeze_df["avg_last_freeze_date"] = pd.to_datetime(pd.to_datetime(avg_last_freeze_df["avg_last_freeze_dayofyear"],format='%j')\
                                             .dt.strftime('%m/%d').astype(str) + '/' + avg_last_freeze_df.index.astype(str))

# median_last_freeze_df = pd.DataFrame(station_yearly_metrics_df.groupby(['obs_year'])['last_freeze_dayofyear'].median().round(0)).rename(columns={'last_freeze_dayofyear':'median_last_freeze_dayofyear'})
# median_last_freeze_df["median_last_freeze_mm_dd"] = pd.to_datetime(median_last_freeze_df["median_last_freeze_dayofyear"],format='%j').dt.strftime('%m/%d')

median_last_freeze_df = pd.DataFrame(station_yearly_metrics_df.groupby(['obs_year'])['last_freeze_dayofyear'].median().round(0)).\
                           rename(columns={'last_freeze_dayofyear':'median_last_freeze_dayofyear'})

# # Convert the day of year
median_last_freeze_df["median_last_freeze_date"] = pd.to_datetime(pd.to_datetime(median_last_freeze_df["median_last_freeze_dayofyear"],format='%j')\
                                                     .dt.strftime('%m/%d').astype(str) + '/' + median_last_freeze_df.index.astype(str))




In [262]:
station_count_df = pd.DataFrame(station_yearly_metrics_df.groupby(['obs_year'])['station_uid'].nunique()).rename(columns={'station_uid':'station_count'})


In [263]:
# Merge the values into a single table
agg_yearly_metrics_df = pd.merge(agg_years_df, avg_last_freeze_df, how='left', left_index=True, right_index=True)
agg_yearly_metrics_df = pd.merge(agg_yearly_metrics_df, median_last_freeze_df, how='left', left_index=True, right_index=True)
agg_yearly_metrics_df = pd.merge(agg_yearly_metrics_df, station_count_df, how='left', left_index=True, right_index=True)


In [264]:
# Move our data columns out of the index
agg_yearly_metrics_df.reset_index(inplace=True)

In [265]:
agg_yearly_metrics_df.head(100)

Unnamed: 0,obs_year,coldest_dayofyear,hottest_dayofyear,last_freeze_date,last_freeze_dayofyear,first_freeze_date,first_freeze_dayofyear,observations_recorded_april_to_may,avg_last_freeze_dayofyear,avg_last_freeze_date,median_last_freeze_dayofyear,median_last_freeze_date,station_count
0,2002,35.0,182.0,2002-05-20,140.0,2002-09-24,267.0,1463.0,130.0,2002-05-10,125.0,2002-05-05,27
1,2003,38.0,237.0,2003-04-22,112.0,2003-09-25,268.0,1653.0,105.0,2003-04-15,107.0,2003-04-17,27
2,2004,35.0,203.0,2004-05-15,136.0,2004-09-07,251.0,1606.0,122.0,2004-05-02,118.0,2004-04-28,27
3,2005,353.0,174.0,2005-05-16,136.0,2005-10-07,280.0,1539.0,115.0,2005-04-25,124.0,2005-05-04,27
4,2006,21.0,212.0,2006-05-22,142.0,2006-10-05,278.0,1483.0,101.0,2006-04-11,99.0,2006-04-09,27
5,2007,36.0,207.0,2007-04-19,109.0,2007-09-12,255.0,1648.0,104.0,2007-04-14,105.0,2007-04-15,27
6,2008,358.0,194.0,2008-05-22,143.0,2008-10-01,275.0,1678.0,121.0,2008-05-01,120.0,2008-04-30,27
7,2009,16.0,140.0,2009-05-18,138.0,2009-09-29,272.0,1658.0,105.0,2009-04-15,101.0,2009-04-11,27
8,2010,4.0,144.0,2010-05-10,130.0,2010-09-26,269.0,1704.0,122.0,2010-05-02,129.0,2010-05-09,27
9,2011,21.0,159.0,2011-05-05,125.0,2011-09-15,258.0,1630.0,114.0,2011-04-24,123.0,2011-05-03,27


In [266]:
# Create the Output file (CSV)
output_county_yearly_file = "../Resources/county_yearly_metrics_data.csv"
ouptut_county_file = "../Resources/county_metrics_data.csv"
output_county_lookup = "../Resources/county_lookup.csv"
output_all_stations_yearly_file = "../Resources/all_stations_yearly_metrics_data.csv"


county_yearly_metrics_df.to_csv(output_county_yearly_file, index=False)
county_metrics_full_df.to_csv(ouptut_county_file, index=False)
county_list_df.to_csv(output_county_lookup, index=False)
agg_yearly_metrics_df.to_csv(output_all_stations_yearly_file, index=False)

In [267]:
# RECREATE THE TABLES WITH DATA.  CREATE IN THIS ORDER TO DEAL WITH FK's
# Save the observations DataFrame to a SQL table "observations"- Replace the table if it already exists

county_yearly_metrics_df.to_sql(name='county_yearly_metrics', con=engine, if_exists='replace', index=False)   

with engine.connect() as con:
    con.execute("ALTER TABLE county_yearly_metrics ADD PRIMARY KEY (county,obs_year);")


county_metrics_full_df.to_sql(name='county_metrics', con=engine, if_exists='replace', index=False)   

with engine.connect() as con:
    con.execute("ALTER TABLE county_metrics ADD PRIMARY KEY (county);")

agg_yearly_metrics_df.to_sql(name='all_stations_yearly_metrics', con=engine, if_exists='replace', index=False)   

with engine.connect() as con:
    con.execute("ALTER TABLE all_stations_yearly_metrics ADD PRIMARY KEY (obs_year);")