In [581]:
import json
import pandas as pd
import numpy as np
from pprint import pprint
from datetime import datetime, timedelta


import re
from ast import literal_eval

from sqlalchemy import create_engine
import psycopg2

import time

from config import db_password 

In [582]:
file_dir = "Resources"
# Observation data
weather_file = f'../{file_dir}/example_data.json'

#Parameter File
params_file = f'../{file_dir}/params_list.json'

In [583]:

# Open the read the Observation data JSON file.
with open(f'{weather_file}', mode='r') as file:
    weather_raw = json.load(file)  # Load the file into a list of Dictionaries NOT RIGHT TO JSON

with open(f'{params_file}', mode='r') as file:
    params_data = json.load(file)  # Load the file into a list of Dictionaries NOT RIGHT TO JSON



In [584]:
weather_df = pd.DataFrame(weather_raw)

In [585]:
# weather_df.head()

In [586]:
# Move the API parameters we used into a dataframe
# We need the parameters to know the date range and the values in the observation Array
params_df = pd.DataFrame(params_data)

In [587]:
# params_df.head()

In [588]:
# Parse out the JSON data into a Stations Dataframe and an Observations Dataframe
counter = 0
stations_list =[]
observations_list = []

# Convert the start/end date used in the API call stored in the params file
startDate = pd.to_datetime(params_df['sdate'][0])
endDate = pd.to_datetime(params_df['edate'][0])

# Loop through the upper most level of the JSON file called "data"
for observation in weather_raw['data']:

    # Create a list of stations
    stations_list.append(observation["meta"])
        
    # Start the current date at the first date in the range for each stations observations
    currentDate = startDate
    
    # Loop through the stations observations and create a list of observations
    for observation_data in observation["data"]:
        observations_list.append({"station_uid":observation["meta"]['uid'],"date":currentDate, "data":observation_data})

        # Increment the date by one day
        currentDate = currentDate + timedelta(days=1)

        # Increment our counter (for debugging use)
        counter = counter + 1


In [589]:

# Convert our lists to dataframes
stations_df = pd.DataFrame(stations_list)
observations_df = pd.DataFrame(observations_list)


In [590]:
observations_df.head()

Unnamed: 0,station_uid,date,data
0,10373,2002-01-01,"[M, M, M, M, M, M]"
1,10373,2002-01-02,"[M, M, M, M, M, M]"
2,10373,2002-01-03,"[M, M, M, M, M, M]"
3,10373,2002-01-04,"[M, M, M, M, M, M]"
4,10373,2002-01-05,"[M, M, M, M, M, M]"


In [591]:
stations_df.head()

Unnamed: 0,valid_daterange,name,ll,sids,county,state,elev,climdiv,uid,sid_dates
0,"[[1893-01-01, 1893-05-31], [1893-01-01, 1893-0...",NORTHFIELD 2NNE,"[-93.1486, 44.4753]","[215987 2, USC00215987 6, NFDM5 7, NRFM5 7]",27037,MN,890.0,MN08,10373,"[[215987 2, 1941-12-02, 9999-12-31], [215987 2..."
1,"[[], [], [1990-04-01, 2015-07-18], [1990-04-01...",ST FRANCIS,"[-93.3591, 45.3878]","[217309 2, USC00217309 6, SFSM5 7]",27003,MN,900.0,MN06,10674,"[[217309 2, 1990-03-01, 2013-09-01], [USC00217..."
2,"[[1943-05-01, 2022-05-17], [1943-02-06, 2022-0...",ROSEMOUNT RESEARCH AND OUTREACH CENTER,"[-93.09798, 44.71673]","[217107 2, USC00217107 6, RSMM5 7]",27037,MN,945.0,MN09,10395,"[[217107 2, 1950-12-01, 9999-12-31], [217107 2..."
3,"[[1892-04-25, 2022-05-16], [1892-04-26, 2022-0...",FARMINGTON 3NW,"[-93.17559, 44.666]","[212737 2, USC00212737 6, FRMM5 7]",27037,MN,960.0,MN09,10392,"[[212737 2, 2020-11-18, 9999-12-31], [212737 2..."
4,"[[2002-09-01, 2013-04-28], [2002-09-01, 2013-0...",SPRING PARK,"[-93.6275, 44.9346]","[217935 2, USC00217935 6, SPKM5 7]",27053,MN,1016.0,MN06,31444,"[[217935 2, 2000-07-25, 2013-05-01], [USC00217..."


In [592]:
# split the ll list into Latitude and longitude 
split_df = pd.DataFrame(stations_df['ll'].to_list(), columns = ['latitude','longitude'])
# concat df and split_df
stations_df = pd.concat([stations_df, split_df], axis=1)
# Drop ll column.
stations_df = stations_df.drop(columns=["ll"])
# # display df
# stations_df.head()

In [593]:
# create sids columns for the array sids
sids = []
sid_string = ""
counter = 1
for x in range(stations_df.sids.map(len).max()):
  sids.append(f"sid_{counter}")
  sid_string = f"{sid_string}, 'sid_{counter}'"
  counter = counter + 1

# split the array into separate columns
split_df = pd.DataFrame(stations_df['sids'].to_list(), columns = sids)

# concat df and split_df
stations_df = pd.concat([stations_df, split_df], axis=1)

# Drop names from columns.
stations_df = stations_df.drop(columns=["sids"])


In [594]:
# Reorder columns in the DF - Remove the valid_daterange and sid_dates since they are not necessary
column_list = f"['uid','name','county','state','latitude','longitude','climdiv'{sid_string}]"  #'valid_daterange','sid_dates'

# change the string into an array for the reordering
columnArray = literal_eval(column_list)

stations_df= stations_df.loc[:, columnArray]
stations_df.rename(columns={'uid':'station_uid'},inplace=True)


In [595]:
stations_df.head()

Unnamed: 0,station_uid,name,county,state,latitude,longitude,climdiv,sid_1,sid_2,sid_3,sid_4,sid_5,sid_6,sid_7,sid_8
0,10373,NORTHFIELD 2NNE,27037,MN,-93.1486,44.4753,MN08,215987 2,USC00215987 6,NFDM5 7,NRFM5 7,,,,
1,10674,ST FRANCIS,27003,MN,-93.3591,45.3878,MN06,217309 2,USC00217309 6,SFSM5 7,,,,,
2,10395,ROSEMOUNT RESEARCH AND OUTREACH CENTER,27037,MN,-93.09798,44.71673,MN09,217107 2,USC00217107 6,RSMM5 7,,,,,
3,10392,FARMINGTON 3NW,27037,MN,-93.17559,44.666,MN09,212737 2,USC00212737 6,FRMM5 7,,,,,
4,31444,SPRING PARK,27053,MN,-93.6275,44.9346,MN06,217935 2,USC00217935 6,SPKM5 7,,,,,


In [596]:
# Separate the data columns into the elems requested
# create sids columns for the array sids
elems = str(params_df["elems"][0])

elems = elems.replace(",","','")

# convert the elems from the params list to a string for splitting apart the Array 
column_list = f"['{elems}']"
columnArray = literal_eval(column_list)

# split the array into separate columns]
split_df = pd.DataFrame(observations_df['data'].to_list(), columns = columnArray)


In [597]:
# Replace any of the "M" values with NaN in the data.  This is a Missing Value
split_df.replace("M",np.NaN, inplace=True)

In [598]:
# split up the data returned based on the Elements stored in the params json file

# concat df and split_df
observations_df = pd.concat([observations_df, split_df], axis=1)
# Drop data column.
observations_df = observations_df.drop(columns=["data"])

In [599]:
observations_df.count()

station_uid    3266599
date           3266599
maxt            126519
mint            126458
pcpn            525180
snow            325993
snwd            135178
avgt            126064
dtype: int64

In [600]:
# Drop any observations where it's missing a min temperature
observations_df = observations_df.dropna(subset=['mint'])

In [601]:
observations_df.count()

station_uid    126458
date           126458
maxt           126064
mint           126458
pcpn           115504
snow            91125
snwd            85397
avgt           126064
dtype: int64

In [602]:
# change the datatypes of the columns to numeric
observations_df[columnArray] = observations_df[columnArray].apply(pd.to_numeric, errors='coerce', axis=1)

In [603]:
stations_df.count()

station_uid    439
name           439
county         439
state          439
latitude       439
longitude      439
climdiv        439
sid_1          439
sid_2          438
sid_3           29
sid_4            9
sid_5            5
sid_6            4
sid_7            2
sid_8            1
dtype: int64

In [604]:
stations_df = stations_df[stations_df.station_uid.isin(observations_df['station_uid'])]

In [605]:
stations_df.count()

station_uid    27
name           27
county         27
state          27
latitude       27
longitude      27
climdiv        27
sid_1          27
sid_2          26
sid_3          25
sid_4           8
sid_5           5
sid_6           4
sid_7           2
sid_8           1
dtype: int64

In [606]:
# Create a freeze_day column based on min temperature
observations_df['freeze_day'] = np.where(observations_df['mint'] <= 32, 1, 0)

# Flag if it reaches above freezing on the day
observations_df['above_freezing'] = np.where(observations_df['maxt'] > 32, 1, 0)

# Create columns for the different date parts to make processing easier
observations_df['obs_year'] = pd.to_datetime(observations_df['date']).dt.year
observations_df['obs_month'] = pd.to_datetime(observations_df['date']).dt.month
observations_df['obs_day'] = pd.to_datetime(observations_df['date']).dt.day
observations_df['obs_dayofyear'] = pd.to_datetime(observations_df['date']).dt.dayofyear

In [607]:
# create a dataframe to store yearly summary info by station ID
years = pd.to_datetime(observations_df['date']).dt.year.unique()
years_df = pd.DataFrame(years,columns=['year'])
station_yearly_metrics_df = pd.merge(stations_df['station_uid'], years_df, how='cross')
station_yearly_metrics_df.set_index(['station_uid','year'])


# get the last frost date of each station for each year of data
last_freeze_df = observations_df.loc[(observations_df['freeze_day']==1)  & (observations_df['obs_dayofyear'] < 180),['station_uid','date','obs_year','obs_dayofyear'] ]. \
        groupby(["station_uid","obs_year"])[['date','obs_dayofyear']].max().rename(columns={'date':'last_freeze_date','obs_dayofyear':'last_freeze_dayofyear'})
#last_freeze_df.rename(columns={'date':'last_freeze_date','obs_dayofyear':'last_freeze_dayofyear'},inplace=True)

# get the first freeze in the fall
first_freeze_df = observations_df.loc[(observations_df['freeze_day']==1)  & (observations_df['obs_dayofyear'] >= 180),['station_uid','date','obs_year','obs_dayofyear'] ]. \
        groupby(["station_uid","obs_year"])[['date','obs_dayofyear']].min().rename(columns={'date':'first_freeze_date','obs_dayofyear':'first_freeze_dayofyear'})
# first_freeze_df.rename(columns={'date':'first_freeze_date','obs_dayofyear':'first_freeze_dayofyear'},inplace=True)

# Determine if we have a complete set of observations for april to may for each station/year
april_to_may_days_recorderd_df = pd.DataFrame(observations_df.loc[(observations_df['obs_month']>=4 )&(observations_df['obs_month'] <= 6),['station_uid','obs_year','mint']]\
        .groupby(['station_uid','obs_year'])['mint'].count()).rename(columns={'mint':'observations_recorded_april_to_may'})
# april_to_may_days_recorderd_df.rename(columns={'mint':'observations_recorded_april_to_may'},inplace=True)



In [608]:
# merge all the yearly data 
station_yearly_metrics_df = pd.merge(station_yearly_metrics_df, last_freeze_df, how='left', left_on=['station_uid', 'year'], right_on = ["station_uid","obs_year"])
station_yearly_metrics_df = pd.merge(station_yearly_metrics_df, first_freeze_df, how='left', left_on=['station_uid', 'year'], right_on = ["station_uid","obs_year"])
station_yearly_metrics_df = pd.merge(station_yearly_metrics_df, april_to_may_days_recorderd_df, left_on=['station_uid', 'year'], right_on = ["station_uid","obs_year"])

In [609]:
# Determine the mean/average last freeze date for a station 
avg_last_freeze_df = pd.DataFrame(station_yearly_metrics_df.groupby(['station_uid'])['last_freeze_dayofyear'].mean().round(0)).rename(columns={'last_freeze_dayofyear':'avg_last_freeze_dayofyear'})
# Convert the day of year to a string value for mm/dd
avg_last_freeze_df["avg_last_freeze_mm_dd"] = pd.to_datetime(avg_last_freeze_df["avg_last_freeze_dayofyear"],format='%j').dt.strftime('%m/%d')

# determine the mean, get the string value
median_last_freeze_df = pd.DataFrame(station_yearly_metrics_df.groupby(['station_uid'])['last_freeze_dayofyear'].median().round(0)).rename(columns={'last_freeze_dayofyear':'median_last_freeze_dayofyear'})
median_last_freeze_df["median_last_freeze_mm_dd"] = pd.to_datetime(median_last_freeze_df["median_last_freeze_dayofyear"],format='%j').dt.strftime('%m/%d')

# Merge the values into a single table
station_metrics_df = pd.merge(stations_df, avg_last_freeze_df, left_on=['station_uid'], right_on = ['station_uid'])
station_metrics_df = pd.merge(station_metrics_df, median_last_freeze_df, left_on=['station_uid'], right_on = ['station_uid'])

In [610]:
merged_station_and_yearly_df = pd.merge(station_metrics_df, station_yearly_metrics_df, how="left", on=["station_uid", "station_uid"])

In [611]:
stations_years_count = merged_station_and_yearly_df.groupby("station_uid").count()["year"]

# Calculate the number of years where the last freeze was before or on the average date
stations_count_at_or_before_avg_last_freeze = merged_station_and_yearly_df[(merged_station_and_yearly_df["last_freeze_dayofyear"] <= merged_station_and_yearly_df['avg_last_freeze_dayofyear'])]
stations_count_at_or_before_avg_last_freeze = stations_count_at_or_before_avg_last_freeze.groupby("station_uid").count()["year"]

# Calculate the number of years where the last freeze was after the average date
stations_count_later_than_avg_last_freeze = merged_station_and_yearly_df[(merged_station_and_yearly_df["last_freeze_dayofyear"] > merged_station_and_yearly_df['avg_last_freeze_dayofyear'])]
stations_count_later_than_avg_last_freeze = stations_count_later_than_avg_last_freeze.groupby("station_uid").count()["year"]



In [612]:
station_calc_values_df = pd.DataFrame(
          {"years_included": stations_years_count,
          "count_at_or_before_avg_last_freeze": stations_count_at_or_before_avg_last_freeze, 
          "count_later_than_avg_last_freeze": stations_count_later_than_avg_last_freeze})
station_calc_values_df.head()

Unnamed: 0_level_0,years_included,count_at_or_before_avg_last_freeze,count_later_than_avg_last_freeze
station_uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10392,10,5.0,5.0
10393,20,7.0,13.0
10395,21,10.0,11.0
10398,21,12.0,9.0
10401,1,1.0,


In [613]:
station_metrics_full_df = pd.merge(station_metrics_df, station_calc_values_df, how="left", on=["station_uid", "station_uid"])
station_metrics_full_df.head(5)

Unnamed: 0,station_uid,name,county,state,latitude,longitude,climdiv,sid_1,sid_2,sid_3,...,sid_6,sid_7,sid_8,avg_last_freeze_dayofyear,avg_last_freeze_mm_dd,median_last_freeze_dayofyear,median_last_freeze_mm_dd,years_included,count_at_or_before_avg_last_freeze,count_later_than_avg_last_freeze
0,10395,ROSEMOUNT RESEARCH AND OUTREACH CENTER,27037,MN,-93.09798,44.71673,MN09,217107 2,USC00217107 6,RSMM5 7,...,,,,122.0,05/02,123.0,05/03,21,10.0,11.0
1,10392,FARMINGTON 3NW,27037,MN,-93.17559,44.666,MN09,212737 2,USC00212737 6,FRMM5 7,...,,,,117.0,04/27,118.0,04/28,10,5.0,5.0
2,31444,SPRING PARK,27053,MN,-93.6275,44.9346,MN06,217935 2,USC00217935 6,SPKM5 7,...,,,,108.0,04/18,104.0,04/14,11,7.0,4.0
3,10398,HASTINGS DAM 2,27037,MN,-92.8689,44.7597,MN09,213567 2,USC00213567 6,HSTM5 7,...,,,,109.0,04/19,108.0,04/18,21,12.0,9.0
4,10676,MOUND,27053,MN,-93.65,44.95,MN06,215665 2,USC00215665 6,MOUM5 7,...,,,,117.0,04/27,117.0,04/27,2,1.0,1.0


In [615]:
# Create the Output file (CSV
output_station_file = "../Resources/station_df.csv"
output_station_year_file = "../Resources/station_year_data.csv"
output_observation_file = "../Resources/observation_data.csv"

station_metrics_full_df.to_csv(output_station_file, index=False)
station_yearly_metrics_df.to_csv(output_station_year_file, index=False)
observations_df.to_csv(output_observation_file, index=False)



In [None]:
# # Connect to PostgreSQL movie_data DB
# db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/last_freeze_analysis"

# # Create the database engine with the following line 
# engine = create_engine(db_string)

# # Save the stations DataFrame to a SQL table "stations"- Replace the table if it already exists
# stations_df.to_sql(name='stations', con=engine, if_exists='replace')   

# # Save the observations DataFrame to a SQL table "observations"- Replace the table if it already exists
# observations_df.to_sql(name='observations', con=engine, if_exists='replace')   


In [None]:
# Write to a CSV file