### **Import Packages**

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.optimize import curve_fit
from sklearn.metrics import mean_squared_error, auc

from datetime import datetime
import calendar

import numpy as np
import pandas as pd

# for set directory 
import os 

# !pip install similaritymeasures 
# or conda install similaritymesures
import similaritymeasures

### **Set Path**

In [2]:
# Change This Path !!
os.chdir('/Users/seunguk/wisely/wisely')

print('Now you are in: ', os.getcwd())

Now you are in:  /Users/seunguk/wisely/wisely


### **Step 01: `import` and `organize` dataset**

In [28]:
# import .csv file 

# df_RR = pd.read_csv('OneOne_720_RR_AGG.csv', index_col=0)
df_RR = pd.read_csv('SubSub_720_RR_AGG.csv', index_col = 0)
df_RR.shape

(44, 724)

In [4]:
# preprocess DataFrame for use
df_RR = df_RR.loc[:, 'd0':'d720'].transpose()
df_RR.replace('[%]', r'', regex=True, inplace=True)
df_RR = df_RR.apply(pd.to_numeric)
df_RR.drop(columns='2018-09-01', axis=1, inplace=True)
df_RR.reset_index(drop=True, inplace=True)
df_RR.index.name = 'days cnt'

### **Step 02: `fit` rational function**

In [6]:
# Fitting function - rational
def func_rational(x, a, b, c, d):
    return np.polyval([a,b], x) / np.polyval([c, d, 1], x)

# Initial guess for the parameter
initialGuess_rational = [1.0, 1.0, 1.0, 1.0]

In [7]:
# Curve fitting for all base cohorts with over 720 days 
cohort_base = list(df_RR.loc[:, '2018-02-01':'2019-08-01'].columns)
list_days = list(range(0, 721))

df_RR_basefit = pd.DataFrame({'cohort_base':cohort_base})
df_RR_basefit.set_index('cohort_base', inplace=True)


for cohort in cohort_base:
    # Setup X,Y Data
    xBase = list_days
    yBase = df_RR.loc[:, cohort]
    
    # Perform Curve-Fitting 
    popt_base, pcov_base = curve_fit(func_rational, xBase, yBase, initialGuess_rational)
    # popt: parameter , pcov: variance
    
    # Update DataFrame with fitted parameters
    df_RR_basefit.loc[cohort, 'MSE'] = mean_squared_error(yBase, func_rational(xBase, *popt_base), squared = False) 
    df_RR_basefit.loc[cohort, 'fitted_a'] = popt_base[0]
    df_RR_basefit.loc[cohort, 'fitted_b'] = popt_base[1]
    df_RR_basefit.loc[cohort, 'fitted_c'] = popt_base[2]
    df_RR_basefit.loc[cohort, 'fitted_d'] = popt_base[3]

### **Step 03: `average parameter` apporoach**

In [9]:
# Setup cohort & days range for calculation
cohort_fit = list(df_RR.loc[:, '2018-02-01':'2019-08-01'].columns)
days_fit = 60
dFit = np.linspace(0, days_fit, days_fit+1)

# Create DataFrame with trimmed RR data 
df_RR_shortTerm = pd.DataFrame()
for cohort in cohort_fit:
    df_RR_shortTerm[cohort] = df_RR.loc[:days_fit, cohort]

# Create DataFrame for Short-term RR Comparison
df_RR_comparison = pd.DataFrame(
    {'fit_cohort': cohort_fit, 'cohort_UB': '', 'MSE_UB': 10, 'cohort_LB': '', 'MSE_LB': 10,
    'cohort_Frechet': '', 'FrechetDist': ''})

df_RR_comparison.set_index('fit_cohort', inplace=True)

# Create DataFrame to store calculated RR & fitted values  
df_RR_pred = pd.DataFrame(columns={'cohort', 'predicted_RR', 'frechet_predicted_RR', 'fitParam_a', 'fitParam_b', 'fitParam_c', 'fitParam_d'})
df_RR_pred = df_RR_pred[ ['cohort', 'predicted_RR', 'frechet_predicted_RR', 'fitParam_a', 'fitParam_b', 'fitParam_c', 'fitParam_d']]

In [11]:
# Loop & compare by MSE and AUC
# plus, Frechet score

for cohort in cohort_fit:
    # Set up list of cohort for comparison (exclude self!)
    compareList = []
    compareList = cohort_fit[:]
    compareList.remove(cohort)
    
    # Calculate my AUC 
    myAUC = auc(dFit, df_RR_shortTerm[cohort])

    # Prepare Frechet Distance list
    list_frechet = []
    
    for chk in compareList:
        compareAUC = auc(dFit, df_RR_shortTerm[chk])
        MSE = mean_squared_error(df_RR_shortTerm[cohort], df_RR_shortTerm[chk], squared = False)

        # Update comparison info based on AUC and lowest MSE
        if (compareAUC < myAUC) & (MSE < df_RR_comparison.loc[cohort, 'MSE_LB']):
            df_RR_comparison.loc[cohort, 'MSE_LB'] = MSE
            df_RR_comparison.loc[cohort, 'cohort_LB'] = chk

        elif (compareAUC > myAUC) & (MSE < df_RR_comparison.loc[cohort, 'MSE_UB']):
            df_RR_comparison.loc[cohort, 'MSE_UB'] = MSE
            df_RR_comparison.loc[cohort, 'cohort_UB'] = chk

        # Calculate Frechet Distance 
        frechet_dist = similaritymeasures.frechet_dist(df_RR_shortTerm[cohort], df_RR_shortTerm[chk])
        list_frechet.append([cohort, chk, frechet_dist])
        list_frechet = sorted(list_frechet, key = lambda list_frechet: list_frechet[2])
        
    print(list_frechet)
    # import two value frechet dist. to dataframe
    df_RR_comparison.loc[cohort, 'cohort_Frechet'] = list_frechet[0][1]
    df_RR_comparison.loc[cohort, 'FrechetDist'] = list_frechet[0][2]
    df_RR_comparison.loc[cohort, 'mse_Frechet'] = mean_squared_error(df_RR_shortTerm[cohort], df_RR_shortTerm[list_frechet[0][1]], squared = False)

    df_RR_comparison.loc[cohort, 'cohort_Frechet_2'] = list_frechet[1][1]
    df_RR_comparison.loc[cohort, 'FrechetDist_2'] = list_frechet[1][2]
    df_RR_comparison.loc[cohort, 'mse_Frechet_2'] = mean_squared_error(df_RR_shortTerm[cohort], df_RR_shortTerm[list_frechet[1][1]], squared = False)
    print('--------------------------------------')

[['2018-02-01', '2018-06-01', 0.32999999999999996], ['2018-02-01', '2018-05-01', 0.3800000000000008], ['2018-02-01', '2019-08-01', 0.7300000000000001], ['2018-02-01', '2019-07-01', 0.86], ['2018-02-01', '2019-03-01', 0.92], ['2018-02-01', '2018-03-01', 1.0500000000000007], ['2018-02-01', '2019-04-01', 1.12], ['2018-02-01', '2019-06-01', 1.15], ['2018-02-01', '2018-11-01', 1.1899999999999995], ['2018-02-01', '2018-10-01', 1.2199999999999998], ['2018-02-01', '2018-04-01', 1.3800000000000008], ['2018-02-01', '2019-05-01', 1.48], ['2018-02-01', '2019-02-01', 1.5], ['2018-02-01', '2018-07-01', 1.5899999999999999], ['2018-02-01', '2018-12-01', 1.67], ['2018-02-01', '2019-01-01', 2.290000000000001], ['2018-02-01', '2018-08-01', 3.79]]
--------------------------------------
[['2018-03-01', '2018-04-01', 0.3500000000000001], ['2018-03-01', '2018-05-01', 0.6699999999999999], ['2018-03-01', '2018-06-01', 0.8499999999999996], ['2018-03-01', '2019-02-01', 0.94], ['2018-03-01', '2019-08-01', 0.96000

In [12]:
df_RR_comparison 

Unnamed: 0_level_0,cohort_UB,MSE_UB,cohort_LB,MSE_LB,cohort_Frechet,FrechetDist,mse_Frechet,cohort_Frechet_2,FrechetDist_2,mse_Frechet_2
fit_cohort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-02-01,2019-08-01,0.359209,2019-03-01,0.507235,2018-06-01,0.33,0.533066,2018-05-01,0.38,0.631903
2018-03-01,2018-04-01,0.492761,2019-08-01,0.549411,2018-04-01,0.35,0.492761,2018-05-01,0.67,0.955106
2018-04-01,2019-05-01,0.560717,2018-03-01,0.492761,2018-03-01,0.35,0.492761,2019-02-01,0.91,0.750244
2018-05-01,2018-06-01,0.147276,2018-07-01,0.673384,2018-06-01,0.18,0.147276,2019-08-01,0.37,0.617285
2018-06-01,2018-12-01,0.396617,2018-05-01,0.147276,2018-05-01,0.18,0.147276,2018-02-01,0.33,0.533066
2018-07-01,2019-03-01,0.552602,2018-10-01,0.217308,2018-10-01,0.37,0.217308,2018-11-01,0.4,0.734032
2018-08-01,2018-11-01,1.312188,,10.0,2018-07-01,2.2,1.757191,2018-10-01,2.57,1.614688
2018-10-01,2018-07-01,0.217308,2018-11-01,0.564812,2018-11-01,0.14,0.564812,2018-07-01,0.37,0.217308
2018-11-01,2018-10-01,0.564812,2018-08-01,1.312188,2018-10-01,0.14,0.564812,2018-07-01,0.4,0.734032
2018-12-01,2019-08-01,0.500495,2018-06-01,0.396617,2019-02-01,0.34,1.795714,2019-01-01,0.62,1.275425


In [13]:
# Define custom MSE calculation function, based on given weight
def get_weighted_MSE(cohort, weight_1, weight_2):
    '''Sum of UB, LB cohorts'''
    weighted_y = (weight_1 * df_RR_shortTerm[df_RR_comparison.loc[cohort, 'cohort_UB']].values) + (weight_2 * df_RR_shortTerm[df_RR_comparison.loc[cohort, 'cohort_LB']].values)
    #print(weight_1, ' ', weight_2, ' ', mean_squared_error(df_RR_preCompare[cohort], weighted_y))
    return mean_squared_error(df_RR_shortTerm.loc[:, cohort], weighted_y, squared=False)

# Define custom weighted param function based on given weights
def get_weighted_param(cohort, weight_1, weight_2):
    if weight_1 == 0.0:
        return (weight_2 * df_RR_basefit.loc[df_RR_comparison.loc[cohort, 'cohort_LB'],'fitted_a':'fitted_d'].values)
    elif weight_2 == 0.0:
        return (weight_1 * df_RR_basefit.loc[df_RR_comparison.loc[cohort, 'cohort_UB'],'fitted_a':'fitted_d'].values)
    
    return (weight_1 * df_RR_basefit.loc[df_RR_comparison.loc[cohort, 'cohort_UB'],'fitted_a':'fitted_d'].values) + (weight_2 * df_RR_basefit.loc[df_RR_comparison.loc[cohort, 'cohort_LB'],'fitted_a':'fitted_d'].values)

In [14]:
# Loop fit cohort for RR prediction
tuning_range = 0.01
optimal_MSE, optimal_weight = 10, [0.0, 0.0]

for cohort in cohort_fit:
    # Tuning for optimal parameter weights 
    for i in range(int(1/tuning_range+1)):

        # Exception for distributions with highest/lowest RR 
        if df_RR_comparison.loc[cohort, 'cohort_UB'] == '':
            optimal_weight = [0.0, 1.0]
            optimal_MSE = df_RR_comparison.loc[cohort, 'MSE_LB']
            break
        elif df_RR_comparison.loc[cohort, 'cohort_LB'] == '':
            optimal_weight = [1.0, 0.0] 
            optimal_MSE = df_RR_comparison.loc[cohort, 'MSE_UB']
            break

        # Test the tuned parameter values 
        test_weight = [(i*tuning_range), (1-(i*tuning_range))]    
        test_MSE = get_weighted_MSE(cohort, *test_weight)

        # Update the optimal values
        if test_MSE < optimal_MSE:
            optimal_MSE = test_MSE
            optimal_weight = test_weight[:]

    fitParam = get_weighted_param(cohort, *optimal_weight)
    # param from frechet 
    param_frechet = df_RR_basefit.loc[df_RR_comparison.loc[cohort, 'cohort_Frechet'],
                                    'fitted_a': 'fitted_d'].values
    
    df_RR_pred = df_RR_pred.append(
        {'cohort':cohort, 'predicted_RR':func_rational(list_days, *fitParam)[-1], 
        'frechet_predicted_RR': func_rational(list_days, *param_frechet)[-1],
        'fitParam_a':fitParam[0], 'fitParam_b':fitParam[1], 'fitParam_c':fitParam[2], 'fitParam_d':fitParam[3]
        }, ignore_index=True)

### Frechet fitting

In [24]:
# print out Dataframe for prediction evaluation
test = df_RR_pred.loc[:, 'cohort':'frechet_predicted_RR']

test['actual_RR'] = df_RR.loc[:, cohort_fit].tail(1).transpose().values

test['diff_RR'] = abs(test['predicted_RR'] - test['actual_RR'])
test['diffmod_RR'] = (abs(test['predicted_RR'] - test['actual_RR']) / test['actual_RR']) * 100

test['diffFrechet_RR'] = abs(test['frechet_predicted_RR'] - test['actual_RR'])
test['diffmod_Frechet_RR'] = (abs(test['frechet_predicted_RR'] - test['actual_RR']) / test['actual_RR']) * 100

In [26]:
test.rename(columns = {'predicted_RR' : 'AUC_RR', 'frechet_predicted_RR' : 'Frechet_RR', 'actual_RR':'Actual_RR',
                        'diff_RR': 'AUC와 Actual 차이 %p', 'diffmod_RR': 'AUC 실측오차율',
                        'diffFrechet_RR': 'Frechet와 Actual 차이 %p', 'diffmod_Frechet_RR': 'Frechet 실측오차율'}, inplace = True)
print(os.getcwd())

/Users/seunguk/wisely/wisely


In [20]:
#test.to_csv('/Users/seunguk/wisely/wisely/seunguk_frechet.csv', sep=',', na_rep='NaN')

In [27]:
compare_aucfrechet = test[['cohort', 'AUC 실측오차율', 'Frechet 실측오차율']]
print('AUC&MSE 가중치 평균 실측 오차율', round(compare_aucfrechet['AUC 실측오차율'].mean(), 4), '%')
print('Frechet 평균 실측 오차율', round(compare_aucfrechet['Frechet 실측오차율'].mean(),4) , '%')
compare_aucfrechet

AUC&MSE 가중치 평균 실측 오차율 4.711 %
Frechet 평균 실측 오차율 3.2327 %


Unnamed: 0,cohort,AUC 실측오차율,Frechet 실측오차율
0,2018-02-01,14.088924,4.008853
1,2018-03-01,9.414903,2.42714
2,2018-04-01,0.719378,5.064967
3,2018-05-01,2.433119,2.433119
4,2018-06-01,1.845934,1.863171
5,2018-07-01,3.61313,4.171018
6,2018-08-01,3.881392,0.667672
7,2018-10-01,5.363817,2.694242
8,2018-11-01,0.167463,1.311284
9,2018-12-01,11.355655,2.699433
