<a href="https://colab.research.google.com/github/OD1992/Python-programs/blob/main/Confidence_intervals.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
# Code written by Ousmane Diao 
#*************************************************************************************************
# ***********************************************************************************
# Imports
# *******
%matplotlib inline
from ipywidgets import interact, interactive
from IPython.display import clear_output, display, HTML
import warnings
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.genmod.families import Poisson, NegativeBinomial, Gaussian
from statsmodels.genmod.families.links import identity, log, sqrt
import matplotlib.pyplot as plt
import math 
import statistics

In [24]:
# Load the data set; the first CSV column is parsed as dates.
# `infer_datetime_format` was dropped from the call: False was its historical
# default and the keyword is deprecated in pandas 2.x.
df = pd.read_csv('Dakar.csv', header=0, parse_dates=[0])
#***************************************************************************************************
# Average temperature per region: midpoint of the minimum and maximum columns.
AVT_Dakar = (df.Tempmin_Dakar + df.Tempmax_Dakar) / 2
AVT_Fatick = (df.Tempmin_Fatick + df.Tempmax_Fatick) / 2
AVT_Kedougou = (df.Tempmin_Kedougou + df.Tempmax_Kedougou) / 2

# Silence every warning raised from here on (this includes statsmodels fit
# warnings).
warnings.filterwarnings("ignore")
#********* Initial conditions
# Rows [.. : t_c] are used for fitting and rows [t_c : t_e] for forecasting
# (see main_uncertainty).
t_c = 84
t_e = 108
# Constant column of the design matrix; np.c_ requires it to have as many
# entries as the data frame has rows.
intercept = np.ones(t_e)
# Link function class handed to the Poisson family (identity link).
g = sm.genmod.families.links.identity

In [33]:
#********************* Uncertainty program function
def _region_data(sw_regions):
    """Return ``(y_o, X, lag)`` for the selected region.

    ``y_o`` is the region's malaria-case column as an array, ``X`` stacks the
    rainfall, average-temperature, humidity, malaria-case and intercept
    columns (in that order), and ``lag`` holds one lag per non-constant
    column of ``X``.

    Raises:
        ValueError: if ``sw_regions`` is not one of the three known regions.
    """
    if sw_regions == 'Dakar':
        y_o = df.MC_Dakar.values
        lag = [2, 2, 5, 1]
        X = np.c_[df.Rainfall, AVT_Dakar, df.Humidity_D, df.MC_Dakar, intercept]
    elif sw_regions == 'Fatick':
        y_o = df.MC_Fatick.values
        lag = [3, 4, 3, 1]
        X = np.c_[df.Rainfall_F, AVT_Fatick, df.Humidity_F, df.MC_Fatick, intercept]
    elif sw_regions == 'Kedougou':
        y_o = df.MC_Kedougou.values
        lag = [2, 5, 2, 1]
        X = np.c_[df.Rainfall_K, AVT_Kedougou, df.Humidity_K, df.MC_Kedougou, intercept]
    else:
        raise ValueError("Unknown region: " + repr(sw_regions))
    return y_o, X, lag


def _design_matrices(X, lag, t_i, sw_algo):
    """Build the lagged training and forecasting design matrices.

    'algo1' shifts each of the four explanatory columns back by its own lag;
    'algo2' only shifts the malaria-case column (index 3). The intercept
    column is never shifted. Training rows are ``t_i .. t_c - 1`` and
    forecasting rows are ``t_c .. t_e - 1``.

    Raises:
        ValueError: if ``sw_algo`` is unknown, or if a lag is larger than
            ``t_i`` (the slice start would become negative and silently wrap
            around to the end of the series).
    """
    if sw_algo == 'algo1':
        col_lags = [lag[0], lag[1], lag[2], lag[3], 0]
    elif sw_algo == 'algo2':
        col_lags = [0, 0, 0, lag[3], 0]
    else:
        raise ValueError("Unknown algorithm: " + repr(sw_algo))
    if t_i < max(col_lags):
        raise ValueError("t_i must be at least as large as the largest lag")
    X_train = np.column_stack([X[t_i - k:t_c - k, c] for c, k in enumerate(col_lags)])
    X_test = np.column_stack([X[t_c - k:t_e - k, c] for c, k in enumerate(col_lags)])
    return X_train, X_test


def _fit_poisson(y, X_design):
    """Fit the Poisson GLM and return ``(model, lower, upper)`` coefficient bounds."""
    model = sm.GLM(y, X_design, family=Poisson(g())).fit()
    bounds = model.conf_int()  # evaluated once; column 0 = lower, column 1 = upper
    return model, bounds[:, 0], bounds[:, 1]


def _mean_abs_step(series, t_i):
    """Mean absolute difference between consecutive values of ``series[t_i:t_c]``."""
    window = np.asarray(series[t_i:t_c], dtype=float)
    return np.mean(np.abs(np.diff(window)))


def _summarise_coefficients(coeff, coeff_low, coeff_upper):
    """Collapse per-fit coefficients into ``(P_50, P_2_5, P_97_5)``.

    ``P_50`` is the column-wise median of the fitted coefficients; the two
    bounds are the column-wise means of the per-fit confidence limits.
    """
    n_params = coeff.shape[1]
    P_50 = np.array([np.percentile(coeff[:, p], 50) for p in range(n_params)])
    P_2_5 = np.array([np.mean(coeff_low[:, p]) for p in range(n_params)])
    P_97_5 = np.array([np.mean(coeff_upper[:, p]) for p in range(n_params)])
    return P_50, P_2_5, P_97_5


def _report_coverage(y_test, lower, upper):
    """Print the percentage of held-out observations lying inside ``[lower, upper]``."""
    inside = (y_test <= upper) & (y_test >= lower)
    print("Percentage of real data located into the confidence interval: ",
          100 * int(np.count_nonzero(inside)) / len(y_test))


def _plot_uncertainty(y_o, t_i, fitted, forecast, lower, upper,
                      sw_regions, sw_method, sw_algo,
                      fitted_label, forecast_label, width_label=None):
    """Draw the two-panel figure: data with fit/forecast band, then band width."""
    plt.figure(figsize=[8, 8])
    plt.subplot(211)
    plt.plot(df.Date, y_o, color="black", marker='o', markersize=8, label="Data")
    plt.plot(df.Date[t_i:t_c], fitted, '--', color='blue', label=fitted_label)
    plt.plot(df.Date[t_c:t_e], forecast, '-.', color='red', label=forecast_label)
    plt.fill_between(df.Date[t_c:t_e], lower, upper, alpha=0.3, color='tab:orange')
    plt.title(f"Plot in {sw_regions!s}, method: {sw_method!s}[{sw_algo!s}]", fontsize=15)
    plt.legend(fontsize=15)
    plt.ylabel("Malaria incidence", fontsize=15)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)

    plt.subplot(212)
    width_kwargs = {} if width_label is None else {"label": width_label}
    plt.plot(upper - lower, **width_kwargs)
    plt.title(f"Upper-lower with method: {sw_method!s}[{sw_algo!s}]", fontsize=15)
    plt.xlabel("Months", fontsize=15)
    plt.ylabel("Values", fontsize=15)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    plt.show()


def main_uncertainty(sw_regions, sw_method, sw_algo):
    """Fit a Poisson GLM for one region and plot forecasts with an uncertainty band.

    Args:
        sw_regions: 'Dakar', 'Fatick' or 'Kedougou'.
        sw_method: how the band is obtained.
            'GLM_95%_conf_int' - a single fit on rows 5..t_c-1; the band uses
                the coefficient confidence limits of that fit.
            'Parameter' - six fits with training start 72, 60, 48, 36, 24, 5;
                median coefficients and averaged confidence limits.
            'Stochastic_1' - 2000 fits with uniform noise added to the
                response only.
            'Stochastic_2' - 2000 fits with uniform noise added to the
                response and to the four explanatory columns.
            Any other value leaves the function without output, as before.
        sw_algo: 'algo1' (all explanatory columns lagged) or 'algo2' (only the
            malaria-case column lagged).

    Returns:
        None. The function prints the coverage percentage (and the model
        summary for the GLM method) and shows a matplotlib figure.

    Raises:
        ValueError: for an unknown region, or an unknown algorithm combined
            with a known method.

    NOTE(review): the band is ``X_test @ coefficient_limit``, i.e. it
    propagates coefficient confidence limits through the design matrix; it is
    not a prediction interval in the usual sense - confirm this is intended.
    """
    #************ Response variable and matrix of explanatory variables
    y_o, X, lag = _region_data(sw_regions)
    y_test = y_o[t_c:t_e]

    #*********** 'GLM default uncertainty'
    if sw_method == 'GLM_95%_conf_int':
        t_i = 5
        X_train, X_test = _design_matrices(X, lag, t_i, sw_algo)
        model, beta_low, beta_up = _fit_poisson(y_o[t_i:t_c], X_train)
        print(model.summary2())
        fitted = model.mu
        forecast = X_test @ model.params
        lower = X_test @ beta_low
        upper = X_test @ beta_up
        _report_coverage(y_test, lower, upper)
        _plot_uncertainty(y_o, t_i, fitted, forecast, lower, upper,
                          sw_regions, sw_method, sw_algo,
                          "Predictions", "Forecasts", width_label=str(sw_method))
        return

    #************* Parameter uncertainty: refit on shrinking-to-growing windows
    if sw_method == 'Parameter':
        params, lows, ups = [], [], []
        for t_i in [72, 60, 48, 36, 24, 5]:
            X_train, X_test = _design_matrices(X, lag, t_i, sw_algo)
            model, beta_low, beta_up = _fit_poisson(y_o[t_i:t_c], X_train)
            params.append(model.params)
            lows.append(beta_low)
            ups.append(beta_up)
        # t_i and X_train deliberately keep the values of the last (longest)
        # window, which is the one drawn as the in-sample fit.
        coeff, coeff_low, coeff_upper = np.array(params), np.array(lows), np.array(ups)

    #************* Stochastic uncertainty: refit on randomly perturbed data
    elif sw_method in ('Stochastic_1', 'Stochastic_2'):
        num_iteration = 2000
        noise_on_X = sw_method == 'Stochastic_2'
        t_i = 5
        y_train = y_o[t_i:t_c]
        X_train, X_test = _design_matrices(X, lag, t_i, sw_algo)
        # Noise half-width: mean absolute step of the (unlagged) series.
        epsilon_y = _mean_abs_step(y_o, t_i)
        if noise_on_X:
            epsilon_X = [_mean_abs_step(X[:, c], t_i) for c in range(4)]
        n_params = X_train.shape[1]
        coeff = np.full((num_iteration, n_params), np.nan)
        coeff_low = np.full((num_iteration, n_params), np.nan)
        coeff_upper = np.full((num_iteration, n_params), np.nan)
        for m in range(num_iteration):
            # Draw order (response first, then columns 0..3) is kept so that
            # seeded runs reproduce earlier results.
            noised_y = y_train + np.random.uniform(-epsilon_y, epsilon_y, y_train.shape)
            if noise_on_X:
                noised_cols = [
                    X_train[:, c] + np.random.uniform(-eps, eps, X_train[:, c].shape)
                    for c, eps in enumerate(epsilon_X)
                ]
                design = np.column_stack(noised_cols + [X_train[:, 4]])
            else:
                design = X_train
            model, coeff_low[m], coeff_upper[m] = _fit_poisson(noised_y, design)
            coeff[m] = model.params

    else:
        # Unknown method: nothing to compute (unchanged legacy behaviour).
        return

    P_50, P_2_5, P_97_5 = _summarise_coefficients(coeff, coeff_low, coeff_upper)
    # Median coefficients -> central fit and forecast
    predict_percentile_50 = X_train @ P_50
    forecast_percentile_50 = X_test @ P_50
    # 97.5 upper bound
    forecast_97_5 = X_test @ P_97_5
    # 2.5 lower bound
    forecast_2_5 = X_test @ P_2_5
    _report_coverage(y_test, forecast_2_5, forecast_97_5)
    _plot_uncertainty(y_o, t_i, predict_percentile_50, forecast_percentile_50,
                      forecast_2_5, forecast_97_5,
                      sw_regions, sw_method, sw_algo,
                      "Predict", "Forecast")

#******************** Build the dropdown UI and render it
# Each keyword maps one argument of main_uncertainty to its list of choices;
# the function is re-run with the currently selected values.
dropdown_choices = {
    'sw_regions': ['Dakar', 'Fatick', 'Kedougou'],
    'sw_method': ['GLM_95%_conf_int', 'Parameter', 'Stochastic_1', 'Stochastic_2'],
    'sw_algo': ['algo1', 'algo2'],
}
w = interactive(main_uncertainty, **dropdown_choices)
display(w)
plt.show()

interactive(children=(Dropdown(description='sw_regions', options=('Dakar', 'Fatick', 'Kedougou'), value='Dakar…