## Data imports

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime

In [4]:
import os

In [5]:
data_dir = "../data/"

In [6]:
fish_data = pd.read_csv(f'{data_dir}BroughtonSeaLice_fishData.csv', encoding='ISO-8859-1', low_memory=False)
site_data = pd.read_csv(f'{data_dir}BroughtonSeaLice_siteData.csv', encoding='ISO-8859-1', low_memory=False)
industry_data = pd.read_csv(f'{data_dir}IndustrySeaLice_Data.csv', encoding='ISO-8859-1', low_memory=False)

In [7]:
## Constants and helpers
analysis_years = list(range(2003, 2018))

analysis_months = list(range(1, 7))

dow_dict = {
    1: 'MON',
    2: 'TUE',
    3: 'WED',
    4: 'THU', 
    5: 'FRI', 
    6: 'SAT', 
    7: 'SUN'
}

month_map = {
    'January': 1,
    'February': 2,
    'March': 3,
    'April': 4,
    'May': 5,
    'June': 6,
    'July': 7,
    'August': 8,
    'September': 9,
    'October': 10,
    'November': 11,
    'December': 12
}

def get_dow(dt_obj):
    dow_text = dt_obj.isoweekday()
    return(dow_dict[dow_text])

wild_locations = site_data['location'].unique()
wild_locations = ['Glacier']

def split_last_n_by_grain(df, n):
    df_grouped = df.sort_values('datetime').groupby('year', group_keys=False)
    df_head = df_grouped.apply(lambda dfg: dfg.iloc[:-n])
    df_tail = df_grouped.apply(lambda dfg: dfg.iloc[-n:])
    return df_head, df_tail

In [8]:
## Unified adult count

In [9]:
adult = fish_data[['Lep_PAmale', 'Lep_PAfemale', 
                   'Lep_male', 'Lep_gravid',
                   'Lep_nongravid', 'unid_PA',
                   'unid_adult']].sum(axis=1)

fish_data_date = pd.to_datetime(fish_data[['year', 'day', 'month']])

response = pd.DataFrame({'count':adult.values, 
                         'location':fish_data['location'].values,
                         'datetime': fish_data_date})

response_glacier = response[response['location'] == 'Glacier']



year_df_list = []
for year in analysis_years:
    subset = response_glacier[response_glacier['datetime'].dt.year == year]
    subset.loc[0] = np.nan
    subset.loc[0, 'datetime'] = datetime.datetime(year, 1, 1)
    subset.loc[1] = np.nan
    subset.loc[1, 'datetime'] = datetime.datetime(year, 12, 31)
    subset.sort_values('datetime', inplace=True)
    subset_resampled = subset.resample(f'W-{get_dow(datetime.datetime(year, 1, 1))}',
                                       on='datetime', label='left').mean().interpolate(methods='linear')
    year_df_list.append(subset_resampled)
Y_glacier = pd.concat(year_df_list).reset_index().set_index('datetime')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [10]:
## Non-motile lice
juvenile = pd.DataFrame(fish_data[['Lep_cope', 'chalA',
                      'chalB', 'Caligus_cope',
                      'unid_cope', 'chal_unid']].sum(axis=1)).rename({0: 'count'}, axis=1)
juvenile['datetime'] = fish_data_date
juvenile['location'] = fish_data['location']

juvenile = juvenile[juvenile['location'] == 'Glacier']

year_juv_list = []
for year in analysis_years:
    subset = juvenile[juvenile['datetime'].dt.year == year]
    for loc in wild_locations:
        subset = subset.append({
            'datetime': datetime.datetime(year, 1 , 1),
            'location': loc,
            'count': np.nan
        }, ignore_index=True)
        subset = subset.append({
            'datetime': datetime.datetime(year, 12 , 31),
            'location': loc,
            'count': np.nan
        }, ignore_index=True)
    subset.sort_values('datetime', inplace=True)
    subset_resample = subset.groupby('location').resample(f'W-{get_dow(datetime.datetime(year, 1, 1))}',
                                                          on='datetime', label='left').mean().interpolate(methods='linear')
    year_juv_list.append(subset_resample)
X_wild_juv = pd.concat(year_juv_list).reset_index().set_index('datetime')

In [11]:
## Temperature
site_data = site_data[site_data['location'] == 'Glacier']

site_data['datetime'] = pd.to_datetime(site_data[['year', 'month', 'day']])

year_temp_list = []
for year in analysis_years:
    subset = site_data.loc[(site_data['datetime'].dt.year == year), ['datetime', 'temp', 'location']]
    for loc in wild_locations:
        subset = subset.append({
            'datetime': datetime.datetime(year, 1 , 1),
            'location': loc,
            'temp': np.nan
        }, ignore_index=True)
        subset = subset.append({
            'datetime': datetime.datetime(year, 12 , 31),
            'location': loc,
            'temp': np.nan
        }, ignore_index=True)
    subset.sort_values('datetime', inplace=True)
    subset.sort_values('datetime', inplace=True)
    subset_resample = subset.groupby('location').resample(f'W-{get_dow(datetime.datetime(year, 1, 1))}',
                                                          on='datetime', label='left').mean().interpolate(methods='linear')
    year_temp_list.append(subset_resample)
X_wild_temp = pd.concat(year_temp_list).reset_index().set_index('datetime')

In [12]:
## Salinity
year_sal_list = []
for year in analysis_years:
    subset = site_data.loc[(site_data['datetime'].dt.year == year), ['datetime', 'salt', 'location']]
    for loc in wild_locations:
        subset = subset.append({
            'datetime': datetime.datetime(year, 1 , 1),
            'location': loc,
            'salt': np.nan
        }, ignore_index=True)
        subset = subset.append({
            'datetime': datetime.datetime(year, 12 , 31),
            'location': loc,
            'salt': np.nan
        }, ignore_index=True)
    subset.sort_values('datetime', inplace=True)
    subset_resample = subset.groupby('location').resample(f'W-{get_dow(datetime.datetime(year, 1, 1))}',
                                                          on='datetime', label='left').mean().interpolate(method='linear')
    year_sal_list.append(subset_resample)
X_wild_sal = pd.concat(year_sal_list).reset_index().set_index('datetime')

In [13]:
## Farm data
relevant_farms_iterable = ['Sargeaunt Pass',
                           'Doctor Islets',
                           'Humphrey Rock',
                           'Burdwood',
                           'Glacier Falls',
                           'Sir Edmund Bay',
                           'Wicklow Point'
                          ]

relevant_farm_data = industry_data[industry_data['Site Common Name'].str.contains('|'.join(relevant_farms_iterable))]

relevant_farm_data['Day'] = 1
relevant_farm_data['month'] = relevant_farm_data['Month'].map(month_map)
relevant_farm_data['datetime'] = pd.to_datetime(relevant_farm_data[['Year', 'month', 'Day']])

relevant_farm_data = relevant_farm_data[relevant_farm_data['datetime'].dt.year.isin(analysis_years)]

year_industry_list = []
for year in analysis_years:
    subset = relevant_farm_data.loc[(relevant_farm_data['datetime'].dt.year == year), 
                                   ['datetime', 'Site Common Name', 'Average L. salmonis motiles per fish']]
    
    for i, farm in enumerate(relevant_farms_iterable):
        subset = subset.append({
            'datetime': datetime.datetime(year, 1 , 1),
            'Site Common Name': farm,
            'Average L. salmonis motiles per fish': np.nan
        }, ignore_index=True)
        subset = subset.append({
            'datetime': datetime.datetime(year, 12 , 31),
            'Site Common Name': farm,
            'Average L. salmonis motiles per fish': np.nan
        }, ignore_index=True)
            
    subset.sort_values('datetime', inplace=True)
    subset_resample = subset.groupby('Site Common Name').resample(f'W-{get_dow(datetime.datetime(year, 1, 1))}',
                                                                   on='datetime', label='left').mean().interpolate(methods='linear')

    year_industry_list.append(subset_resample)
X_industry = pd.DataFrame(pd.concat(year_industry_list).reset_index().groupby('datetime')['Average L. salmonis motiles per fish'].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [14]:
## Weather data
station_dir = '../station_data/'

station_files = os.listdir(station_dir)
station_files = [file for file in station_files if 'en_climate_daily_BC' in file]

stations_to_concat = []
for file in station_files:
    stations_to_concat.append(pd.read_csv(f'{station_dir}{file}'))
station_df = pd.concat(stations_to_concat)
station_df['datetime'] = pd.to_datetime(station_df['Date/Time'])

relevant_station_df = station_df[station_df['datetime'].dt.year.isin(analysis_years)]
year_station_list = []

for year in analysis_years:
    subset = relevant_station_df.loc[(relevant_station_df['datetime'].dt.year == year),
                                    ['datetime', 'Mean Temp (°C)']].rename({'Mean Temp (°C)': 'temp'})
    subset.sort_values('datetime', inplace=True)
    subset_resample = subset.resample(f'W-{get_dow(datetime.datetime(year, 1, 1))}',
                                      on='datetime', label='left').apply(np.nanmean).interpolate(methods='linear')
    year_station_list.append(subset_resample)
X_station = pd.concat(year_station_list).reset_index().set_index('datetime')


In [15]:
## Unified X
X_values = pd.concat([X_industry, X_wild_juv, X_wild_sal, X_wild_temp, X_station], axis=1).drop('location', axis=1)

In [16]:
column_name_dict = {'Average L. salmonis motiles per fish': 'industry', 'count': 'nonmotiles', 'salt': 'salt', 'temp': 'temp', 'Mean Temp (°C)': 'station_temp'}

In [17]:
X_values = X_values.rename(column_name_dict, axis=1)

In [18]:
## Overall dataset
data = pd.concat([X_values, Y_glacier], axis=1).rename({'count': 'motiles'}, axis=1).reset_index()
data['year'] = data['datetime'].dt.year
data = data[data['year'] != 2002] # remove dummy year
dummy_date_filter = (data['datetime'].dt.month == 12) & (data['datetime'].dt.day == 25) # have to filter out dummy dates
leap_date_filter = (data['datetime'].dt.month == 12) & (data['datetime'].dt.day == 30) # have to filter out artificial extra week from leap year
data = data[~(dummy_date_filter | leap_date_filter) ]

In [19]:
## Create dummy dates as year is just the sample
dummy_year_dates = data[data['year'] == 2003]['datetime']
data_test = data.copy()

data_test['datetime'] = pd.concat([dummy_year_dates] * data_test['year'].nunique(), axis=0).values
# data_test['week_of_year'] = list(range(1, 53)) * data_test['year'].nunique()
data_test['week_of_year'] = data_test['datetime'].dt.week

### Test different data input

In [22]:
fish_data_g = fish_data[fish_data['location'] == 'Glacier']

nonmotile_fish_data = fish_data_g[['Lep_cope', 'chalA', 'chalB', 'Caligus_cope', 'unid_cope', 'chal_unid']].sum(axis=1)
motile_fish_data = fish_data_g[['Lep_PAmale', 'Lep_PAfemale', 'Lep_male', 'Lep_gravid', 'Lep_nongravid', 'unid_PA', 'unid_adult']].sum(axis=1)
fish_data_dates = pd.to_datetime(fish_data_g[['day', 'month', 'year']])


fish_data_inputs = pd.concat([fish_data_dates, nonmotile_fish_data, motile_fish_data], axis=1).rename({0:'datetime', 1:'nonmotiles', 2:'motiles'}, axis=1).set_index('datetime')

In [23]:
site_data_g = site_data[site_data['location'] == 'Glacier']

site_data_dates = pd.to_datetime(site_data_g[['day', 'month', 'year']])
site_data_temp = site_data_g['temp']
site_data_salt = site_data_g['salt']


site_data_inputs = pd.concat([site_data_dates, site_data_temp, site_data_salt], axis=1).rename({0: 'datetime'}, axis=1).set_index('datetime')

In [24]:
relevant_farm_data = industry_data[industry_data['Site Common Name'].str.contains('|'.join(relevant_farms_iterable))]

relevant_farm_data['Day'] = 1
month_map = {
    'January': 1,
    'February': 2,
    'March': 3,
    'April': 4,
    'May': 5,
    'June': 6,
    'July': 7,
    'August': 8,
    'September': 9,
    'October': 10,
    'November': 11,
    'December': 12
}
relevant_farm_data['month'] = relevant_farm_data['Month'].map(month_map)
relevant_farm_data['datetime'] = pd.to_datetime(relevant_farm_data[['Year', 'month', 'Day']])

farm_data_inputs = relevant_farm_data[['datetime', 'Site Common Name', 'Average L. salmonis motiles per fish']].groupby('datetime').mean()
farm_data_inputs = farm_data_inputs.rename({'Average L. salmonis motiles per fish': 'agg_farm_motiles'}, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [25]:
station_dir = 'weather_data/'

station_files = os.listdir(station_dir)
station_files = [file for file in station_files if 'en_climate_daily_BC' in file]

stations_to_concat = []
for file in station_files:
    stations_to_concat.append(pd.read_csv(f'{station_dir}{file}'))
station_df = pd.concat(stations_to_concat)
station_df['datetime'] = pd.to_datetime(station_df['Date/Time'])

relevant_station_df = station_df[station_df['datetime'].dt.year.isin(analysis_years)]
weather_station_inputs = relevant_station_df[['datetime', 'Mean Temp (°C)']].rename({'Mean Temp (°C)': 'station_temp'}, axis=1).set_index('datetime')

In [26]:
all_input_data = weather_station_inputs.merge(fish_data_inputs, left_index=True, right_index=True, how='left')
all_input_data = all_input_data.merge(site_data_inputs, left_index=True, right_index=True, how='left')
all_input_data = all_input_data.merge(farm_data_inputs, left_index=True, right_index=True, how='left')

all_input_data = all_input_data[all_input_data.index.year.isin(analysis_years)]
input_data = all_input_data.resample('W').mean().reset_index()
# input_data['datetime'] = pd.to_datetime(input_data['datetime'])
# input_data = input_data.rename({'datetime': 'week_datetime'}, axis=1)
input_data['week_of_year'] = input_data['datetime'].dt.week
# input_data['year'] = input_data['year'].astype(int)
# input_data['week_of_year'] = input_data['week_of_year'].astype(int)

In [27]:
input_data = input_data.loc[:, input_data.columns != 'year']

## AutoML

Setup AutomL

In [19]:
# Y column
label = 'motiles'

In [25]:
from sklearn import metrics
from azureml.core.workspace import Workspace
from azureml.core.experiment import Experiment
from azureml.train.automl import AutoMLConfig
import logging

In [26]:
data_test = data_test[data_test['datetime'].dt.month <= 8]

In [27]:
locally_stored_runs = {}

In [None]:
automl_settings = {
    "iteration_timeout_minutes": 4,
    "experiment_timeout_minutes": 30,
    "enable_early_stopping": True,
    "primary_metric": 'normalized_mean_absolute_error',
    "featurization": 'auto',
    "verbosity": logging.INFO,
    "blacklist_models": ['LightGBM'],
    "n_cross_validations": 5,
}


for year in analysis_years:
    train = data_test[~(data_test['year'] == year)].sort_values('datetime').reset_index(drop=True)
    test = data_test[data_test['year'] == year].sort_values('datetime').reset_index(drop=True)
    
    automl_config = AutoMLConfig(task='regression',
                             training_data=train,
                             label_column_name=label,
                             **automl_settings)

    ws = Workspace.from_config()
    experiment = Experiment(ws, "EEB498")
    local_run = experiment.submit(automl_config, show_output=True)
    
    best_run, fitted_model = local_run.get_output()
    
    snapshot_directory='models/'
    os.rename('model.pkl', f'model_{year}.pkl')
    
    preds = fitted_model.predict(test.loc[test['year']==year, test.columns != 'motiles'])
    
    locally_stored_runs[year] = {
        'mse': metrics.mean_squared_error(test.loc[test['year']==year, 'motiles'].fillna(0), preds),
        'mae': metrics.mean_absolute_error(test.loc[test['year']==year, 'motiles'].fillna(0), preds),
        'predictions': preds,
        'best_run': best_run,
        'model': fitted_model,
    }
    
    test_data = test_predictions_given_first_n_data(test.loc[test['year']==year, test.columns != 'motiles'], 25)
    fig, ax = plt.subplots()
    ax.plot(fitted_model.predict(test_data))
    ax.plot(test.loc[test['year']==2008, 'motiles'])

Running on local machine
Parent Run ID: AutoML_6a3e2a0e-6140-4a37-b53b-759db2793b2e

Current status: DatasetFeaturization. Beginning to featurize the dataset.
Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturizationCompleted. Completed featurizing the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Missing values imputation
STATUS:       FIXED
DESCRIPTION:  The training data had the following missing values which were resolved. Please review your data source for data quality issues and possibly filter out the rows with these missing values. If the missing values are expected, you can either accept the above imputation, or implement your own custom imputation that may be more appropriate


****************************************************************************************************
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
ITERATION: The iteration being evaluated.
PIPELINE: A summary description of the pipeline being evaluated.
DURATION: Time taken for the current iteration.
METRIC: The result of computing score on the fitted pipeline.
BEST: The best observed score thus far.
****************************************************************************************************

 ITERATION   PIPELINE                                       DURATION      METRIC      BEST
         0   StandardScalerWrapper ElasticNet               0:00:15       0.0471    0.0471
         1   StandardScalerWrapper ElasticNet               0:00:14       0.0470    0.0470
         2   StandardScalerWrapper ElasticNet               0:00:14       0.0492    0.0470
         3   S

        27   StandardScalerWrapper ElasticNet               0:00:17       0.0472    0.0124
        28   StandardScalerWrapper ElasticNet               0:00:19       0.0499    0.0124
        29   MaxAbsScaler ElasticNet                        0:00:16       0.0614    0.0124
        30   VotingEnsemble                                 0:00:26       0.0085    0.0085
        31   StackEnsemble                                  0:00:55       0.0097    0.0085
Stopping criteria reached at iteration 31. Ending experiment.
Running on local machine
Parent Run ID: AutoML_c147ab41-c143-4fa0-873e-74ebf9543940

Current status: DatasetFeaturization. Beginning to featurize the dataset.
Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturizationCompleted. Completed featurizing the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.

************

PARAMETERS:   Column name : industry, Imputation type : mean
              Column name : temp, Imputation type : mean
              
TYPE:         High cardinality feature detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and no high cardinality features were detected.

****************************************************************************************************
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
ITERATION: The iteration being evaluated.
PIPELINE: A summary description of the pipeline being evaluated.
DURATION: Time taken for the current iteration.
METRIC: The result of computing score on the fitted pipeline.
BEST: The best observed score thus far.
****************************************************************************************************

 ITERATION   PIPELINE                                       DURATION      METRIC      

        24   StandardScalerWrapper ElasticNet               

In [33]:
best_run, fitted_model = local_run.get_output()

In [47]:
test = fitted_model

In [55]:
def test_predictions_given_first_n_data(data, n):
    test_data = data.copy()
    test_data.iloc[25:, 1:6] = np.nan
    return test_data

In [1]:
automl_info_raw = {2003: {'mse': 0.022721197546207945,
  'mae': 0.11669384249363644,
  'predictions': [0.03640573, 0.03640573, 0.04797311, 0.04602518, 0.08560173,
         0.03640573, 0.04797311, 0.04797311, 0.00575422, 0.00569935,
         0.00511387, 0.00611025, 0.00592927, 0.00318569, 0.00519563,
         0.01978225, 0.04945153, 0.10346317, 0.27016504, 0.27624634,
         0.48736403, 0.29852694, 0.29688699, 0.29698551, 0.30099863,
         0.30097915, 0.30090223, 0.30082596, 0.30082596, 0.30082596,
         0.30072661, 0.30082596, 0.30082596, 0.30090223, 0.30076532],
    'train_mae': 0.0100
  },
 2004: {'mse': 9.53395073912808,
  'mae': 2.0537649368805493,
  'predictions': [0.01169341, 0.03676928, 0.01169341, 0.01169341, 0.01169341,
         0.01169341, 0.01169341, 0.01169341, 0.01169341, 0.        ,
         0.00474074, 0.00607407, 0.01766141, 0.00111111, 0.01026754,
         0.06895492, 0.07699537, 0.14971893, 0.22905149, 0.2150387 ,
         0.20940291, 0.22938707, 0.35869642, 0.27367115, 0.27367115,
         0.28414198, 0.28745541, 0.28777879, 0.28777879, 0.28777879,
         0.28777879, 0.28777879, 0.28777879, 0.28777879, 0.28777879],
  'train_mae': 0.0258
  },
 2005: {'mse': 0.0995597420352555,
  'mae': 0.23824512858026395,
  'predictions': [0.04112419, 0.04112419, 0.04112419, 0.07702824, 0.08074587,
         0.04112419, 0.04112419, 0.04112419, 0.04112419, 0.00553086,
         0.00600315, 0.13667372, 0.13667372, 0.13667372, 0.00600315,
         0.108798  , 0.19628494, 0.10761902, 0.16583831, 0.23864664,
         0.11213282, 0.11819458, 0.12688071, 0.12887638, 0.12887638,
         0.12887638, 0.23744059, 0.23744059, 0.23744059, 0.23744059,
         0.23744059, 0.23744059, 0.23744059, 0.23744059, 0.23744059],
  'train_mae': 0.0087
  },
 2006: {'mse': 0.008534451730800157,
  'mae': 0.07047807606265853,
  'predictions': [0.02793655, 0.02793655, 0.03144768, 0.03101417, 0.03175807,
         0.02793655, 0.03144768, 0.03144768, 0.03101417, 0.00933691,
         0.00056012, 0.00149959, 0.00055419, 0.00450837, 0.00542421,
         0.00089625, 0.00192171, 0.01426724, 0.02381679, 0.11087303,
         0.23891878, 0.38120575, 0.42819914, 0.39909043, 0.37124681,
         0.43349284, 0.40507504, 0.43195341, 0.41041036, 0.40265457,
         0.40235544, 0.40727347, 0.41032887, 0.41108709, 0.42282299],
  'train_mae': 0.0100
  },
 2007: {'mse': 0.002753848817817705,
  'mae': 0.04301749948252178,
  'predictions': [0.0807715 , 0.0807715 , 0.0807715 , 0.0807715 , 0.0807715 ,
         0.0807715 , 0.0807715 , 0.0807715 , 0.0807715 , 0.0807715 ,
         0.0807715 , 0.0025641 , 0.        , 0.00459399, 0.03341766,
         0.01181381, 0.02483105, 0.04993599, 0.05421498, 0.13639709,
         0.23423848, 0.27381651, 0.23350108, 0.25184023, 0.27381651,
         0.27646671, 0.29406847, 0.29406847, 0.27646671, 0.27646671,
         0.27646671, 0.29406847, 0.27646671, 0.27646671, 0.27646671],
  'train_mae': 0.0085
  },
 2008: {'mse': 0.12654851789455543,
  'mae': 0.2300374734342253,
  'predictions': [0.02756532, 0.02756532, 0.02756532, 0.02553229, 0.02553229,
         0.02756532, 0.02756532, 0.02756532, 0.02553229, 0.02756532,
         0.02756532, 0.02756532, 0.        , 0.        , 0.0025641 ,
         0.03233333, 0.0701189 , 0.08104409, 0.04019049, 0.86228449,
         1.03298801, 0.15724201, 0.09796512, 0.10048856, 0.10180116,
         0.09371599, 0.10823446, 0.06505036, 0.07240196, 0.08203229,
         0.08335383, 0.0830614 , 0.0858371 , 0.08238119, 0.08335383],
  'train_mae': 0.0098
  },
 2009: {'mse': 0.04727254083834774,
  'mae': 0.16443828106075478,
  'predictions': [0.0446403 , 0.0446403 , 0.0446403 , 0.0446403 , 0.0446403 ,
         0.0446403 , 0.0446403 , 0.0446403 , 0.0446403 , 0.0446403 ,
         0.0446403 , 0.0446403 , 0.0446403 , 0.0446403 , 0.0446403 ,
         0.07145043, 0.07145043, 0.10353403, 0.08411589, 0.13215198,
         0.12601038, 0.41015909, 0.5653654 , 0.53092705, 0.52005395,
         0.32957553, 0.3596909 , 0.3596909 , 0.3596909 , 0.32946057,
         0.32946057, 0.3596909 , 0.3596909 , 0.32946057, 0.32946057],
  'train_mae': 0.0095
  },
 2010: {'mse': 0.08107453762989339,
  'mae': 0.19721581805594535,
  'predictions': [0.01195157, 0.01278982, 0.02477208, 0.02477208, 0.02833271,
         0.01335508, 0.01195157, 0.02477208, 0.02617559, 0.01195157,
         0.01195157, 0.05482199, 0.        , 0.00231481, 0.00804725,
         0.08325746, 0.12934278, 0.07593148, 0.10088178, 0.26642866,
         0.24017608, 0.32613383, 0.51234308, 0.54139618, 0.53934487,
         0.54070324, 0.5208475 , 0.51748235, 0.51686834, 0.53754881,
         0.51359242, 0.51748235, 0.51748235, 0.53754881, 0.53754881],
  'train_mae': 0.0108
  },
 2011: {'mse': 0.0012851783600649122,
  'mae': 0.02150651338380984,
  'predictions': [0.00035389, 0.00035389, 0.00035389, 0.00035389, 0.00035389,
         0.00035389, 0.00035389, 0.00035389, 0.00035389, 0.00035389,
         0.00035389, 0.00035389, 0.        , 0.        , 0.        ,
         0.00226786, 0.03251735, 0.03033721, 0.08706567, 0.15978542,
         0.09121399, 0.03612368, 0.06456116, 0.06456116, 0.07692788,
         0.06541403, 0.05085621, 0.05085621, 0.05085621, 0.05085621,
         0.05085621, 0.05085621, 0.05085621, 0.05085621, 0.05085621],
  'train_mae': 0.0101
  },
 2012: {'mse': 0.01218804595330489,
  'mae': 0.10000213591803112,
  'predictions': [0.05019397, 0.05019397, 0.08116963, 0.08116963, 0.08116963,
         0.05019397, 0.08116963, 0.08116963, 0.08116963, 0.05019397,
         0.08116963, 0.08116963, 0.08116963, 0.02742857, 0.00028571,
         0.08666125, 0.08666125, 0.04630621, 0.00918793, 0.02147259,
         0.0346419 , 0.07571412, 0.01490221, 0.01894466, 0.01323386,
         0.04756922, 0.04756922, 0.04205274, 0.04205274, 0.04205274,
         0.04205274, 0.04205274, 0.04205274, 0.04205274, 0.04756922],
  'train_mae': 0.0099
  },
 2013: {'mse': 0.006581112013569391,
  'mae': 0.05170209983648096,
  'predictions':[7.47836172e-03, 7.47836172e-03, 7.09454504e-03, 7.09454504e-03,
         7.09454504e-03, 7.47836172e-03, 7.47836172e-03, 7.09454504e-03,
         7.09454504e-03, 7.47836172e-03, 7.47836172e-03, 7.00672302e-03,
         7.00672302e-03, 1.14587161e-03, 4.17320774e-04, 2.59811617e-04,
         5.21453935e-02, 2.37334530e-02, 8.37937944e-02, 2.36763241e-01,
         6.37776207e-02, 1.04482998e-01, 5.66379382e-02, 1.17893687e-01,
         1.17477777e-01, 1.53282482e-01, 1.51077189e-01, 1.47524788e-01,
         1.47524788e-01, 1.47524788e-01, 2.94062008e-01, 1.87770336e-01,
         1.87770336e-01, 1.65023737e-01, 1.65023737e-01],
  'train_mae': 0.0089
  },
 2014: {'mse': 0.007299844314156494,
  'mae': 0.06327621598572274,
  'predictions': [0.01113127, 0.01113127, 0.01222254, 0.01222254, 0.01230587,
         0.02332571, 0.02441698, 0.02441698, 0.02499114, 0.02389987,
         0.03323646, 0.03323646, 0.03321733, 0.02632829, 0.02966163,
         0.0255937 , 0.0255021 , 0.0543577 , 0.07846889, 0.06443977,
         0.06479549, 0.07233251, 0.02984042, 0.04310162, 0.02959191,
         0.04672425, 0.02658566, 0.02658566, 0.02001161, 0.0199005 ,
         0.02170474, 0.02060804, 0.02060804, 0.02179171, 0.02179171],
  'train_mae': 0.0091
  },
 2015: {'mse': 0.4046104943945638,
  'mae': 0.4272875017691028,
  'predictions': [0.08519016, 0.08519016, 0.08519016, 0.08519016, 0.08519016,
         0.08519016, 0.08519016, 0.08519016, 0.08519016, 0.08519016,
         0.08519016, 0.08519016, 0.08519016, 0.08850768, 0.10685682,
         0.11436559, 0.09932872, 0.14588505, 0.29183207, 0.29183207,
         0.29183207, 0.18152179, 0.19215904, 0.05088115, 0.04855987,
         0.06355432, 0.05924528, 0.05924528, 0.05924528, 0.05924528,
         0.05924528, 0.05924528, 0.05924528, 0.05924528, 0.07438911],
  'train_mae': 0.0110
  },
 2016: {'mse': 0.005374535087745641,
  'mae': 0.05260032285635656,
  'predictions': [0.01379038, 0.01461153, 0.01424419, 0.01424419, 0.01397097,
         0.01393853, 0.01397097, 0.01424419, 0.01397097, 0.01311035,
         0.01396394, 0.01358663, 0.01444117, 0.0166659 , 0.00451518,
         0.08573733, 0.03842886, 0.02198434, 0.02451663, 0.05329424,
         0.03659526, 0.03756506, 0.0316501 , 0.05083847, 0.03291896,
         0.04692465, 0.07526995, 0.04317287, 0.07541578, 0.07618588,
         0.04255767, 0.04178757, 0.04178757, 0.04178757, 0.04289074],
  'train_mae': 0.0093
  },
 2017: {'mse': 0.007286320159917405,
  'mae': 0.0660832121643928,
  'predictions': [0.02001213, 0.02001213, 0.01596451, 0.01596451, 0.01596451,
         0.02001213, 0.0266788 , 0.02263118, 0.02263118, 0.0266788 ,
         0.0266788 , 0.02263118, 0.02263118, 0.00666667, 0.00763889,
         0.02701867, 0.04080843, 0.06240109, 0.1189604 , 0.1259293 ,
         0.15054271, 0.16950948, 0.18371927, 0.28778439, 0.22520711,
         0.10197703, 0.09556783, 0.09556783, 0.09623449, 0.09623449,
         0.09623449, 0.09556783, 0.09556783, 0.09623449, 0.09623449],
  'train_mae': 0.0109
  }}

In [2]:
# save to json
import json

with open('sufficient_info.json', 'w') as fp:
    json.dump(automl_info_raw, fp)