In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from sklearn import metrics

In [3]:
data_dir = "../data/"

In [4]:
fish_data = pd.read_csv(f'{data_dir}BroughtonSeaLice_fishData.csv', encoding='ISO-8859-1', low_memory=False)

In [5]:
## Constants and helpers
analysis_years = list(range(2003, 2018))

analysis_months = list(range(1, 7))

dow_dict = {
    1: 'MON',
    2: 'TUE',
    3: 'WED',
    4: 'THU', 
    5: 'FRI', 
    6: 'SAT', 
    7: 'SUN'
}

def get_dow(dt_obj):
    dow_text = dt_obj.isoweekday()
    return(dow_dict[dow_text])

In [6]:
adult = fish_data[['Lep_PAmale', 'Lep_PAfemale', 
                   'Lep_male', 'Lep_gravid',
                   'Lep_nongravid', 'unid_PA',
                   'unid_adult']].sum(axis=1)

fish_data_date = pd.to_datetime(fish_data[['year', 'day', 'month']])

response = pd.DataFrame({'count':adult.values, 
                         'location':fish_data['location'].values,
                         'datetime': fish_data_date})

response_glacier = response[response['location'] == 'Glacier']



year_df_list = []
for year in analysis_years:
    subset = response_glacier[response_glacier['datetime'].dt.year == year]
    subset.loc[0] = np.nan
    subset.loc[0, 'datetime'] = datetime.datetime(year, 1, 1)
    subset.loc[1] = np.nan
    subset.loc[1, 'datetime'] = datetime.datetime(year, 12, 31)
    subset.sort_values('datetime', inplace=True)
    subset_resampled = subset.resample(f'W-{get_dow(datetime.datetime(year, 1, 1))}',
                                       on='datetime', label='left').mean().interpolate(methods='linear')
    year_df_list.append(subset_resampled)
Y_glacier = pd.concat(year_df_list).reset_index().set_index('datetime')
Y_glacier = Y_glacier[Y_glacier.index.year != 2002]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [65]:
def seasonal_bayes_model(Y):
    Y = Y_glacier.copy().reset_index()
    Y['year'] = Y['datetime'].dt.year
    Y = Y[~Y['year'].isin([2002, 2004])]
    dummy_date_filter = (Y['datetime'].dt.month == 12) & (Y['datetime'].dt.day == 25)
    leap_date_filter = (Y['datetime'].dt.month == 12) & (Y['datetime'].dt.day == 30)
    Y = Y[~(dummy_date_filter | leap_date_filter)]

    dummy_year_dates = Y[Y['year'] == 2003]['datetime']
    Y['datetime'] = pd.concat([dummy_year_dates] * Y['year'].nunique(), axis=0).values
    return(Y[(3 <= Y['datetime'].dt.month) & (7 >= Y['datetime'].dt.month)].groupby('datetime')['count'].mean().fillna(0))

In [66]:
dummy_dates = Y_glacier[Y_glacier.index.year==2003].reset_index()['datetime'].iloc[:-1]

def get_train_values(train, dummy_year_dates=dummy_dates):
    Y = train.copy().reset_index()
    Y['year'] = Y['datetime'].dt.year
    Y = Y[~Y['year'].isin([2002, 2004])]
    dummy_date_filter = (Y['datetime'].dt.month == 12) & (Y['datetime'].dt.day == 25)
    leap_date_filter = (Y['datetime'].dt.month == 12) & (Y['datetime'].dt.day == 30)
    Y = Y[~(dummy_date_filter | leap_date_filter)]
    Y['datetime'] = pd.concat([dummy_year_dates] * Y['year'].nunique(), axis=0).values
    Y = Y[(3 <= Y['datetime'].dt.month) & (7 >= Y['datetime'].dt.month)]
    
    list_of_years = []
    for year in Y['year'].unique():
        list_of_years.append(Y[Y['year'] == year]['count'].fillna(0).values)
    return(np.array(list_of_years))

In [67]:
def make_train_prediction_assessment_array(preds, is_04=False):
    list_of_years = []
    upper = 13
    if is_04:
        upper = 14
    for year in range(0, upper):
        list_of_years.append(preds.values)
    return(np.array(list_of_years))

In [68]:
bayes_info = {}

for year in analysis_years:
    train_data = Y_glacier[~Y_glacier.index.year.isin([year, 2004])]
    
    test_data = Y_glacier[(3 <= Y_glacier.index.month) & (7 >= Y_glacier.index.month) & (Y_glacier.index.year == year)]['count'].fillna(0)
    
    preds = seasonal_bayes_model(train_data)
    
    # Train MSE
    train_values = get_train_values(train_data)
    train_predictions = make_train_prediction_assessment_array(preds, (year==2004))
    train_mse = metrics.mean_squared_error(train_values, train_predictions)
    train_mae = metrics.mean_absolute_error(train_values, train_predictions)

    # Test MSE
    test_mse = metrics.mean_squared_error(test_data, preds)
    test_mae = metrics.mean_absolute_error(test_data, preds)
    
    year_info = {
        'train_mse': train_mse,
        'test_mse': test_mse,
        'train_mae': train_mae,
        'test_mae': test_mae,
        'test_predictions': preds
    }
    
    bayes_info[year] = year_info
    

In [69]:
for key in bayes_info.keys(): 
    bayes_info[key]['test_predictions'] = bayes_info[key]['test_predictions'].values.tolist()

In [70]:
# save to json
import json

with open('sufficient_info.json', 'w') as fp:
    json.dump(bayes_info, fp)

In [71]:
actual_info = {}

for year in analysis_years:
    test_data = Y_glacier[(3 <= Y_glacier.index.month) & (7 >= Y_glacier.index.month) & (Y_glacier.index.year == year)]['count'].fillna(0)
    actual_info[year] = test_data.values.tolist()

In [72]:
with open('actual_values.json', 'w') as fp:
    json.dump(actual_info, fp)