In [None]:
from azure.ai.ml import MLClient
from azure.ai.ml import command, Input, Output
from azure.identity import DefaultAzureCredential
from azureml.core import Workspace, Dataset, Datastore
import os
import pandas as pd
import numpy as np

# GLOBAL FUNCTIONS
import sys
sys.path.append("../../../")
from utils.data_processing import add_day_ahead_column
from utils.error_metrics import _calc_mae, _calc_mse, _calc_rmse, _calc_nrmse, _calc_mape, _calc_mase, _calc_msse, _seas_naive_fcst, _calc_metrics


#PLOTTING
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt

# Global matplotlib styling: serif 12pt text and the Stanford colour palette
# as the default colour cycle for every subsequent figure.
stanford_colors = [
    '#8C1515', '#175E54', '#279989', '#8F993E', '#6FA287',
    '#4298B5', '#007C92', '#E98300', '#E04F39', '#FEDD5C',
    '#620059', '#651C32', '#5D4B3C', '#7F7776', '#DAD7CB',
]
plt.rcParams.update({
    'font.family': 'serif',
    'font.size': 12,
    'axes.prop_cycle': plt.cycler(color=stanford_colors),
})

# Authenticate with whatever credential source DefaultAzureCredential resolves.
credential = DefaultAzureCredential()

# Workspace coordinates live in the local config module, not in the notebook.
from config import subscription_id, resource_group, workspace_name

# SDK v2 client handle for the workspace.
ml_client = MLClient(
    credential=credential,
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace_name,
)

# SDK v1 workspace handle and its blob datastore; the data-loading cell reads
# the input file through `datastore`.
workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)
datastore = Datastore.get(workspace, datastore_name="workspaceblobstore")

# Test notebook for KNN

## 1. Load Data - NEW

In [None]:
# path to input file
# Only the part of this URI from 'LocalUpload' onwards is used when the file is
# loaded (the loading cell rebuilds a datastore-relative path from it).
# NOTE(review): the URI looks truncated ("subscriptions/orkspaceblobstore") —
# presumably redacted; confirm the full azureml:// URI before relying on it.
data_path= "azureml://subscriptions/orkspaceblobstore/paths/LocalUpload/00_load_country.csv"

In [None]:
# Load the CSV from the datastore and normalise it to the columns used below.
# The datastore-relative path is the 'LocalUpload/...' tail of `data_path`.
short_path = f"LocalUpload{data_path.split('LocalUpload')[1]}"
dataset = Dataset.Tabular.from_delimited_files(path=(datastore, short_path))

df = (
    dataset.to_pandas_dataframe()
    .rename(columns={'country': 'ID'})
    .assign(
        # Parse timestamps; non-numeric load / temperature entries become NaN.
        ds=lambda frame: pd.to_datetime(frame['ds']),
        y=lambda frame: pd.to_numeric(frame['y'], errors='coerce'),
        temp=lambda frame: pd.to_numeric(frame['temp'], errors='coerce'),
    )
    .loc[:, ['ds', 'ID', 'temp', 'y']]
)

## 2. Implement kNN

### 2.1 Set parameters

In [None]:
# Forecast configuration
k = 3        # number of most similar days to use
n = 24       # hours of 'y' history in each feature vector
n_temp = 33  # hours of 'temp' history in each feature vector
cut_off = 14 # hour of day (2pm) at which the next day is forecast

# Rows strictly before this timestamp are training data, the rest is test data.
train_test_split = pd.Timestamp('2014-01-01 00:00:00')

# One model is fitted per distinct ID.
IDs = pd.unique(df['ID'])

### 2.2 Feature Vector Formation
Training Data: For each day in the training data, create a feature vector from the last `n` hourly 'y' values and the last `n_temp` hourly temperature values available before the 2 pm cut-off. The target is the 24 hourly 'y' values of the following day.
Testing Data: For each day in the testing data (all rows from `train_test_split` onwards), create a feature vector using the same criteria.

In [None]:
# form feature vectors
def form_feature_vectors(df, n_lags=None, n_temp_lags=None, cutoff_hour=None):
    """Build day-ahead feature vectors and targets from a single-ID frame.

    For every calendar day D between the first and last timestamp in ``df``:

    * features = the last ``n_lags`` 'y' values followed by the last
      ``n_temp_lags`` 'temp' values among the rows strictly before
      D ``cutoff_hour``:00;
    * target   = the 'y' values with timestamps from D+1 00:00 to D+1 23:00.

    A day is skipped (and listed in the printed ``skipped_days``) unless it
    yields exactly ``n_lags`` + ``n_temp_lags`` feature values and 24 target
    values.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'ds' (datetime), 'y' and 'temp'. ``tail`` is used to pick
        the most recent rows, so the frame is assumed to be sorted by 'ds'.
    n_lags, n_temp_lags, cutoff_hour : int, optional
        Overrides for the notebook-level ``n``, ``n_temp`` and ``cut_off``;
        when left as ``None`` the notebook-level values are used.

    Returns
    -------
    (X, y) : tuple of lists
        ``X[i]`` is a 1-D array of length ``n_lags + n_temp_lags`` and
        ``y[i]`` the matching 1-D array of 24 target values.
    """
    # Fall back to the notebook parameters when no explicit override is given.
    n_lags = n if n_lags is None else n_lags
    n_temp_lags = n_temp if n_temp_lags is None else n_temp_lags
    cutoff_hour = cut_off if cutoff_hour is None else cutoff_hour

    X = []
    y = []
    skipped_days = []

    # Nothing to iterate over: min()/max() of an empty column are NaT.
    if df.empty:
        print(skipped_days)
        return X, y

    first_day = df['ds'].min().normalize()
    last_day = df['ds'].max().normalize()

    for day in pd.date_range(start=first_day, end=last_day, freq='D'):
        # Timestamp arithmetic instead of string concatenation: str() of a
        # midnight Timestamp already contains '00:00:00', so appending another
        # time-of-day produced a string with two time components.
        cutoff_ts = day + pd.Timedelta(hours=cutoff_hour)
        target_start = day + pd.Timedelta(days=1)
        target_end = target_start + pd.Timedelta(hours=23)

        # History available when the forecast is made (filtered once, reused).
        history = df[df['ds'] < cutoff_ts]
        X_slice_value = history['y'].tail(n_lags).values
        X_slice_temp = history['temp'].tail(n_temp_lags).values

        # Values of the next day (midnight to 23:00).
        in_target_day = (df['ds'] >= target_start) & (df['ds'] <= target_end)
        y_slice_value = df[in_target_day]['y'].values

        # Debug output for the first day only.
        if day == first_day:
            print('X_slice_value', X_slice_value)
            print('X_slice_temp', X_slice_temp)
            print('y_slice_value', y_slice_value)

        # Skip days without a complete history window or a complete target day.
        if (len(X_slice_value) != n_lags
                or len(X_slice_temp) != n_temp_lags
                or len(y_slice_value) != 24):
            skipped_days.append(day)
            continue

        X.append(np.append(X_slice_value, X_slice_temp))
        y.append(y_slice_value)

    print(skipped_days)
    return X, y

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Fit one kNN regressor per ID on the training period and write its day-ahead
# predictions into a 'y_pred' column of that ID's test frame.
result_list = []

for series_id in IDs:
    print('ID', series_id)
    df_id = df[df['ID'] == series_id]

    # Split at train_test_split. The test part is copied because a column is
    # added to it below; assigning onto a slice of `df` triggers
    # SettingWithCopyWarning.
    df_train = df_id[df_id['ds'] < train_test_split]
    df_test = df_id[df_id['ds'] >= train_test_split].copy()

    X_train, y_train = form_feature_vectors(df_train)
    X_test, _ = form_feature_vectors(df_test)

    # Use the configured number of neighbours instead of a hard-coded 3.
    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)

    # Find the rows each prediction belongs to. A fixed 24-row NaN padding at
    # both ends assumes which days form_feature_vectors skipped; a wrong guess
    # shifts every prediction by whole days or fails on a length mismatch.
    # Re-running the extraction on a frame whose 'y' holds row numbers returns
    # the row positions of every target day. Days are skipped on slice lengths
    # only, so the same days are selected as for X_test. (This prints the
    # skipped-day list a second time.)
    row_tags = df_test.assign(y=np.arange(len(df_test)))
    _, target_rows = form_feature_vectors(row_tags)

    y_pred_col = np.full(len(df_test), np.nan)
    y_pred_col[np.concatenate(target_rows)] = np.asarray(y_pred).ravel()
    df_test['y_pred'] = y_pred_col

    # Collect the test frame with predictions; concatenated once after the loop.
    result_list.append(df_test)

# Concatenate the list of dataframes into a single dataframe
df_result = pd.concat(result_list)


In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# NOTE(review): this cell repeats the computation of the preceding cell and
# overwrites df_result with the same content; consider keeping only one.
#
# Fit one kNN regressor per ID on the training period and write its day-ahead
# predictions into a 'y_pred' column of that ID's test frame.
test_frames = []

for series_id in IDs:
    print('ID', series_id)
    df_id = df[df['ID'] == series_id]

    # Split at train_test_split. The test part is copied because a column is
    # added to it below; assigning onto a slice of `df` triggers
    # SettingWithCopyWarning.
    df_train = df_id[df_id['ds'] < train_test_split]
    df_test = df_id[df_id['ds'] >= train_test_split].copy()

    X_train, y_train = form_feature_vectors(df_train)
    X_test, _ = form_feature_vectors(df_test)

    # Use the configured number of neighbours instead of a hard-coded 3.
    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)

    # Find the rows each prediction belongs to. A fixed 24-row NaN padding at
    # both ends assumes which days form_feature_vectors skipped; a wrong guess
    # shifts every prediction by whole days or fails on a length mismatch.
    # Re-running the extraction on a frame whose 'y' holds row numbers returns
    # the row positions of every target day. Days are skipped on slice lengths
    # only, so the same days are selected as for X_test. (This prints the
    # skipped-day list a second time.)
    row_tags = df_test.assign(y=np.arange(len(df_test)))
    _, target_rows = form_feature_vectors(row_tags)

    y_pred_col = np.full(len(df_test), np.nan)
    y_pred_col[np.concatenate(target_rows)] = np.asarray(y_pred).ravel()
    df_test['y_pred'] = y_pred_col

    test_frames.append(df_test)

# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration; concatenate once instead. An empty ID list still
# yields an empty frame, as before.
df_result = pd.concat(test_frames) if test_frames else pd.DataFrame()

In [None]:
# Persist the test frames with their predictions (index column omitted).
result_path = 'result_knn.csv'
df_result.to_csv(result_path, index=False)

## 3. Evaluate

In [None]:
# Seasonal-naive baseline: within each ID, reuse the 'y' value from 48 rows
# earlier (two days back for hourly rows). The first 48 rows per ID are NaN.
snaive_lag_rows = 48
df_result['snaive'] = df_result.groupby('ID')['y'].shift(snaive_lag_rows)

In [None]:
# Compute the error metrics of the kNN forecast for every ID.
# Rows are collected as plain dicts and turned into a DataFrame once. Calling
# pd.concat inside the loop re-copies the accumulated frame on every iteration,
# and concatenating onto an empty frame is deprecated in recent pandas.
# NOTE(review): 'y_pred' and 'snaive' contain NaN for rows without a forecast /
# baseline — confirm the _calc_* helpers ignore those rows.
metric_columns = ['ID', 'RMSE', 'MAE', 'MAPE', 'MASE', 'MSSE']
metric_rows = []

for series_id in IDs:
    df_id = df_result[df_result['ID'] == series_id]
    predictions, truth, baseline = df_id['y_pred'], df_id['y'], df_id['snaive']

    metric_rows.append({
        'ID': series_id,
        'RMSE': _calc_rmse(predictions=predictions, truth=truth),
        'MAE': _calc_mae(predictions=predictions, truth=truth),
        'MAPE': _calc_mape(predictions=predictions, truth=truth),
        # Scaled errors are computed relative to the seasonal-naive baseline.
        'MASE': _calc_mase(predictions=predictions, truth=truth, snaive_predictions=baseline),
        'MSSE': _calc_msse(predictions=predictions, truth=truth, snaive_predictions=baseline),
    })

metrics = pd.DataFrame(metric_rows, columns=metric_columns)

In [None]:
# Persist the per-ID metrics table (index column omitted).
metrics_path = 'metrics_knn.csv'
metrics.to_csv(metrics_path, index=False)