In [1]:
" Import the libraries " 

import os
import sys 
import math
import copy

import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
" Import the scripts of SD for Explaining "

# The SplitSD4X helper scripts live in a `SplitSD4X` folder two directory
# levels above the notebook's working directory; put that folder on the
# import path so the project modules imported in this cell can be found.
current_dir = os.getcwd()
absFilePath = os.path.dirname(os.path.dirname(current_dir))
newPath = os.path.join(absFilePath, 'SplitSD4X')
sys.path.append(newPath)

from fill_missing_values import *
from missing_values_table import *
from neighbors_generation import *
from patterns_extraction import *
from performances import *
from subgroups_discovery import *
from sp_lime import *

## Data Preparation 

In [3]:
" Loading the dataset "
# Locate the `Datasets` folder two directory levels above the working
# directory. The trailing '' makes os.path.join append the platform's own
# separator, so `datasets_path` still ends with a separator (as before) but
# no longer hard-codes the Windows backslash, which produced a non-existent
# path on Linux/macOS.
datasets_path = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), 'Datasets', '')
url = os.path.join(datasets_path, 'data_temp_forecast.csv')
df = pd.read_csv(url)

" Handling some data "
# Remove the identifier columns, then discard every row that still has a
# missing value; the preview of the cleaned frame is the cell output.
df = (
    df
    .drop(columns=['station','Date'])
    .dropna()
)
df.head()

Unnamed: 0,Present_Tmax,Present_Tmin,LDAPS_RHmin,LDAPS_RHmax,LDAPS_Tmax_lapse,LDAPS_Tmin_lapse,LDAPS_WS,LDAPS_LH,LDAPS_CC1,LDAPS_CC2,...,LDAPS_PPT2,LDAPS_PPT3,LDAPS_PPT4,lat,lon,DEM,Slope,Solar radiation,Next_Tmax,Next_Tmin
0,28.7,21.4,58.255688,91.116364,28.074101,23.006936,6.818887,69.451805,0.233947,0.203896,...,0.0,0.0,0.0,37.6046,126.991,212.335,2.785,5992.895996,29.1,21.2
1,31.9,21.6,52.263397,90.604721,29.850689,24.035009,5.69189,51.937448,0.225508,0.251771,...,0.0,0.0,0.0,37.6046,127.032,44.7624,0.5141,5869.3125,30.5,22.5
2,31.6,23.3,48.690479,83.973587,30.091292,24.565633,6.138224,20.57305,0.209344,0.257469,...,0.0,0.0,0.0,37.5776,127.058,33.3068,0.2661,5863.555664,31.1,23.9
3,32.0,23.4,58.239788,96.483688,29.704629,23.326177,5.65005,65.727144,0.216372,0.226002,...,0.0,0.0,0.0,37.645,127.022,45.716,2.5348,5856.964844,31.7,24.3
4,31.4,21.9,56.174095,90.155128,29.113934,23.48648,5.735004,107.965535,0.151407,0.249995,...,0.0,0.0,0.0,37.5507,127.135,35.038,0.5055,5859.552246,31.2,22.5


In [4]:
# Show the dtype of every remaining column of `df`.
df.dtypes

Present_Tmax        float64
Present_Tmin        float64
LDAPS_RHmin         float64
LDAPS_RHmax         float64
LDAPS_Tmax_lapse    float64
LDAPS_Tmin_lapse    float64
LDAPS_WS            float64
LDAPS_LH            float64
LDAPS_CC1           float64
LDAPS_CC2           float64
LDAPS_CC3           float64
LDAPS_CC4           float64
LDAPS_PPT1          float64
LDAPS_PPT2          float64
LDAPS_PPT3          float64
LDAPS_PPT4          float64
lat                 float64
lon                 float64
DEM                 float64
Slope               float64
Solar radiation     float64
Next_Tmax           float64
Next_Tmin           float64
dtype: object

In [5]:
# Turn '?' placeholders into NaN, then summarise the missing values per column.
# NOTE(review): this runs after the earlier dropna(), so rows that only become
# NaN here would not be removed — the table below reports none for this dataset.
df = df.replace('?', np.nan)
missing_values_table(df)

Your slelected dataframe has 23 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


In [6]:
# 'Next_Tmax' is the value to predict; both 'Next_*' columns are excluded
# from the feature frame.
target_df = df['Next_Tmax']
data_df = df.drop(columns=['Next_Tmax','Next_Tmin'])

In [7]:
# Boolean mask over the feature columns: True where the column has dtype
# `object`, which this notebook treats as categorical.
feature_dtypes = data_df.dtypes
categorical_feature_mask = feature_dtypes == object
categorical_feature_mask

Present_Tmax        False
Present_Tmin        False
LDAPS_RHmin         False
LDAPS_RHmax         False
LDAPS_Tmax_lapse    False
LDAPS_Tmin_lapse    False
LDAPS_WS            False
LDAPS_LH            False
LDAPS_CC1           False
LDAPS_CC2           False
LDAPS_CC3           False
LDAPS_CC4           False
LDAPS_PPT1          False
LDAPS_PPT2          False
LDAPS_PPT3          False
LDAPS_PPT4          False
lat                 False
lon                 False
DEM                 False
Slope               False
Solar radiation     False
dtype: bool

In [8]:
# Names of the columns flagged by the categorical mask.
categorical_columns = data_df.columns[categorical_feature_mask]
categorical_cols_names = categorical_columns.tolist()
categorical_cols_names

[]

In [9]:
# Names of the columns NOT flagged by the categorical mask.
numerical_columns = data_df.columns[~categorical_feature_mask]
numerical_cols_names = numerical_columns.tolist()
numerical_cols_names

['Present_Tmax',
 'Present_Tmin',
 'LDAPS_RHmin',
 'LDAPS_RHmax',
 'LDAPS_Tmax_lapse',
 'LDAPS_Tmin_lapse',
 'LDAPS_WS',
 'LDAPS_LH',
 'LDAPS_CC1',
 'LDAPS_CC2',
 'LDAPS_CC3',
 'LDAPS_CC4',
 'LDAPS_PPT1',
 'LDAPS_PPT2',
 'LDAPS_PPT3',
 'LDAPS_PPT4',
 'lat',
 'lon',
 'DEM',
 'Slope',
 'Solar radiation']

In [10]:
# Rebuild the feature frame with the numerical columns first (cast to float)
# and the categorical columns last, then append the target as the final column.
numerical_part = data_df[numerical_cols_names].astype(float)
categorical_part = data_df[categorical_cols_names]
data_df = pd.concat([numerical_part, categorical_part], axis=1)
data_target_df = pd.concat([data_df, target_df], axis=1)

In [11]:
" generate the Test SET "
# Hold out a fixed-size random sample as the test set. The seed makes the
# split (and everything derived from it) repeatable across kernel restarts.
nb_test_instances = 1000
test_df = data_target_df.sample(n=nb_test_instances, random_state=0)
data_test_df = test_df.drop(columns=['Next_Tmax'])
target_test_df = test_df['Next_Tmax']

# Training set = every row that was not sampled for the test set.
# Rows are removed by index label rather than with
# concat(...).drop_duplicates(keep=False): that idiom also discarded any rows
# of the dataset that merely share identical values, silently shrinking the
# training set. Relies on the index labels being unique (default index from
# read_csv, with rows removed by dropna).
train_df = data_target_df.drop(index=test_df.index)
data_train_df = train_df.drop(columns=['Next_Tmax'])
target_train_df = train_df['Next_Tmax']

In [12]:
" Extract values of the test set to generate the neighbors"

# NumPy views of the test targets and test features, as consumed by the
# neighbor generation step.
target_test = target_test_df.values
data_test = data_test_df.values

In [13]:
# Positional indices of the columns: numerical ones occupy the first
# positions, categorical ones fill the remaining positions up to the last column.
nb_numerical = len(numerical_cols_names)
numerical_cols = np.arange(nb_numerical)
categorical_cols = np.arange(nb_numerical, data_df.shape[1])

## Neighbors Generation

In [14]:
# Number of neighbors requested for each test instance.
nb_neighbors = 50
# NOTE(review): generate_all_neighbors comes from the star-imported project
# scripts; the following cells index its result once per test instance and
# stack the pieces row-wise — confirm the exact return type against its source.
list_neigh = generate_all_neighbors(data_test,numerical_cols,categorical_cols,nb_neighbors)

In [15]:
" store all the neighbors together "
# Stack the neighbor arrays of the first `n` test instances row-wise.
# A single np.concatenate call copies each array once; concatenating inside a
# loop re-copied the whole growing result on every iteration.
n = np.size(data_test,0)
all_neighbors = np.concatenate(list_neigh[:n], axis=0)

###  One hot encoding 

In [16]:
# Wrap the stacked neighbors in a DataFrame that uses the same column order
# as the features (numerical first, categorical last), and cast the
# categorical columns to int where possible.
neigh_columns = numerical_cols_names + categorical_cols_names
df_neigh = pd.DataFrame(all_neighbors, columns=neigh_columns)
df_neigh[categorical_cols_names] = df_neigh[categorical_cols_names].astype(int, errors='ignore')
df_neigh.head()

Unnamed: 0,Present_Tmax,Present_Tmin,LDAPS_RHmin,LDAPS_RHmax,LDAPS_Tmax_lapse,LDAPS_Tmin_lapse,LDAPS_WS,LDAPS_LH,LDAPS_CC1,LDAPS_CC2,...,LDAPS_CC4,LDAPS_PPT1,LDAPS_PPT2,LDAPS_PPT3,LDAPS_PPT4,lat,lon,DEM,Slope,Solar radiation
0,23.920973,17.21949,39.498749,75.987474,25.716597,15.482582,11.649685,138.461254,0.06125,0.041374,...,0.412827,0.253449,0.130638,0.046574,-0.086291,37.607566,126.987934,210.397208,2.77939,4633.977883
1,23.712622,17.485822,39.631054,76.159475,26.297136,15.765924,11.835588,139.496087,0.00291,0.019792,...,0.40684,0.086534,-0.012401,0.058849,0.256254,37.614574,127.010764,219.402128,2.92281,4694.121591
2,23.92375,17.376452,37.944531,76.328909,26.28522,15.601202,11.667421,138.019221,-0.011973,0.001624,...,0.408866,0.10068,-0.10526,0.048015,-0.078001,37.6105,126.989913,212.736347,2.883223,4678.961408
3,23.877884,17.487214,39.723517,76.38498,26.538964,15.880048,11.826471,142.770519,0.03747,0.01434,...,0.400041,0.064649,-0.036073,0.018204,-0.27814,37.599801,126.993946,211.848031,2.765221,4705.963567
4,24.114361,17.379955,39.276373,75.534605,26.350988,15.786941,11.804712,139.59062,0.022244,0.003388,...,0.44147,0.14015,-0.037919,0.007945,0.024942,37.594192,126.973929,199.419125,2.597796,4723.425476


In [17]:
" Store the neighbors in a list "
# Cut the stacked neighbor matrix back into one block of `nb_neighbors`
# consecutive rows per test instance.
data_neigh = df_neigh.values
n = np.size(data_test,0)
list_neigh = [
    data_neigh[k * nb_neighbors:(k + 1) * nb_neighbors, :]
    for k in range(n)
]

####  One hot encoding for the training and the test sets

In [18]:
# NumPy arrays of the training targets and training features.
target_train = target_train_df.values
data_train = data_train_df.values

In [19]:
# NumPy arrays of the test targets and test features.
target_test = target_test_df.values
data_test = data_test_df.values

In [20]:
" Define the functions to save and load data "
import pickle
def save_obj(obj, name):
    """Pickle `obj` to the file `name + '.pkl'`.

    Parameters
    ----------
    obj : any picklable object
        The object to serialise.
    name : str
        Target path without the '.pkl' extension. It may include a directory
        part; missing directories are created so that the first save into a
        fresh folder (e.g. './saved_data/') does not fail.
    """
    target = name + '.pkl'
    parent_dir = os.path.dirname(target)
    if parent_dir:
        # open() does not create intermediate folders by itself.
        os.makedirs(parent_dir, exist_ok=True)
    with open(target, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object stored in the pickle file `name + '.pkl'`.

    `name` is the path without the '.pkl' extension.

    Unpickling can execute arbitrary code, so only load files that were
    produced by a trusted source (such as this notebook's own saves).
    """
    pickle_path = name + '.pkl'
    with open(pickle_path, 'rb') as stream:
        restored = pickle.load(stream)
    return restored

In [21]:
'SAVE THE DATA'

# Persist the train/test arrays and the neighbor list, one pickle file per
# object, inside the `path` folder (file name = `path` + stem + '.pkl').
path = './saved_data/'
objects_to_save = (
    ('data_train', data_train),
    ('target_train', target_train),
    ('data_test', data_test),
    ('target_test', target_test),
    ('list_neighbors', list_neigh),
)
for file_stem, payload in objects_to_save:
    save_obj(payload, path + file_stem)

## Training the models

In [22]:
" Linear Regression : "
# Fit an ordinary linear regression on the training set and predict the test set.
regression = LinearRegression()
regression.fit(data_train, target_train)
model_reg = regression  # scikit-learn's fit() returns the estimator itself
target_pred_reg = model_reg.predict(data_test)

In [35]:
" Random Forest Regressor "
# Fit a seeded random forest (500 trees, depth capped at 10) and predict the test set.
regr = RandomForestRegressor(
    n_estimators=500,
    max_depth=10,
    random_state=0,
)
regr.fit(data_train, target_train)
model_rf = regr  # scikit-learn's fit() returns the estimator itself
target_pred_rf = model_rf.predict(data_test)

In [24]:
" Ada Boost regressor "
# Fit a seeded AdaBoost regressor (200 estimators) and predict the test set.
adaBoostRegr = AdaBoostRegressor(
    random_state=0,
    n_estimators=200,
)
adaBoostRegr.fit(data_train, target_train)
model_abr = adaBoostRegr  # scikit-learn's fit() returns the estimator itself
target_pred_abr = model_abr.predict(data_test)

In [25]:
" Sklearn MLP regressor "

# Pipeline: standardise the features, then a seeded MLP with two hidden
# layers of 500 units each.
mlp_regressor = MLPRegressor(
    hidden_layer_sizes=(500,500),
    tol=1e-2,
    max_iter=1000,
    random_state=0,
)
mlp = make_pipeline(StandardScaler(), mlp_regressor)
mlp.fit(data_train, target_train)
model_nt = mlp  # scikit-learn's fit() returns the pipeline itself
target_pred_nt = model_nt.predict(data_test)

# Report the pipeline's score on the test set, rounded to 4 decimals.
mlp_score = round(model_nt.score(data_test, target_test),4)
print(f"{'The score of the Multi-Layer-Perceptron Regressor model is ':<60}{': {}'.format(mlp_score)}")

The score of the Multi-Layer-Perceptron Regressor model is  : 0.9003


## Scores of the black box models 


In [36]:
# Print the test-set score of every fitted black-box model, one aligned line
# per model (label padded to 60 characters, score rounded to 4 decimals).
black_box_models = (
    ('The score of the linear regression model is ', model_reg),
    ('The score of the Random Forest Regressor model is ', model_rf),
    ('The score of the AdaBoost Regressor model is ', model_abr),
    ('The score of the Multi-Layer-Perceptron Regressor model is ', model_nt),
)
for label, fitted_model in black_box_models:
    print(f"{label:<60}{': {}'.format(round(fitted_model.score(data_test, target_test),4))}")

The score of the linear regression model is                 : 0.7884
The score of the Random Forest Regressor model is           : 0.8891
The score of the AdaBoost Regressor model is                : 0.7738
The score of the Multi-Layer-Perceptron Regressor model is  : 0.9003


## Execution of Split Based Selection Form Algorithm : 

In [27]:
# Number of numerical columns; since numerical columns come first, this is
# also the position where the categorical columns would start.
split_point = len(numerical_cols)
# NOTE(review): presumably the number of subgroups / local models to build —
# confirm against SplitBasedSelectionForm.
nb_models = 100
# Run the subgroup search on the test set, with the MLP pipeline (model_nt)
# as the model under study and the per-instance neighbor list.
# NOTE(review): SplitBasedSelectionForm comes from the star-imported project
# scripts; the meaning of the returned pair (L_Subgroups, P) is not visible here.
(L_Subgroups,P) = SplitBasedSelectionForm (data_test, target_test, nb_models, model_nt, list_neigh,split_point)

In [28]:
'SAVE THE LIST OF THE SUBGROUPS'
# Pickle the subgroup list to `path + 'list_subgroups.pkl'` (save_obj appends '.pkl').
save_obj(L_Subgroups, path + 'list_subgroups')

## Subgroups Descriptions

In [29]:
# Column labels of the test features, passed on as the attribute names.
att_names = data_test_df.columns
# NOTE(review): `patterns` comes from the star-imported project scripts; the
# cell output suggests it prints one interval description per subgroup in
# addition to returning them — confirm against its source.
patt_descriptions = patterns(P,split_point,data_test,att_names)

subrgoup 0
0.0 < LDAPS_CC2 <= 0.58
5649.38 < Solar radiation <= 5968.81
0.18 < LDAPS_CC3 <= 0.46
83.56 < LDAPS_RHmax <= 100.0
37.46 < lat <= 37.57
20.28 < LDAPS_Tmax_lapse <= 29.84
-------------------------------------------------------------------
subrgoup 1
0.0 < LDAPS_CC2 <= 0.58
4371.68 < Solar radiation <= 5649.38
14.27 < LDAPS_Tmin_lapse <= 25.02
37.46 < lat <= 37.59
0.53 < LDAPS_CC4 <= 0.96
20.3 < Present_Tmax <= 25.5
-------------------------------------------------------------------
subrgoup 2
0.0 < LDAPS_CC2 <= 0.58
5229.8 < Solar radiation <= 5649.38
25.02 < LDAPS_Tmin_lapse <= 28.97
24.24 < LDAPS_RHmin <= 51.37
0.3 < LDAPS_CC4 <= 0.96
-------------------------------------------------------------------
subrgoup 3
0.0 < LDAPS_CC2 <= 0.58
5649.38 < Solar radiation <= 5968.81
0.0 < LDAPS_CC3 <= 0.46
80.91 < LDAPS_RHmax <= 83.56
14.27 < LDAPS_Tmin_lapse <= 23.47
-------------------------------------------------------------------
subrgoup 4
0.0 < LDAPS_CC2 <= 0.58
4371.68 < Solar

24.24 < LDAPS_RHmin <= 67.51
23.75 < Present_Tmin <= 28.9
20.3 < Present_Tmax <= 30.0
0.0 < LDAPS_CC1 <= 0.68
-------------------------------------------------------------------
subrgoup 58
0.58 < LDAPS_CC2 <= 0.96
0.0 < LDAPS_PPT2 <= 2.16
24.24 < LDAPS_RHmin <= 67.51
23.75 < Present_Tmin <= 28.9
30.0 < Present_Tmax <= 36.5
0.54 < LDAPS_CC3 <= 0.98
-------------------------------------------------------------------
subrgoup 59
0.58 < LDAPS_CC2 <= 0.96
0.0 < LDAPS_PPT2 <= 2.16
24.24 < LDAPS_RHmin <= 67.51
23.75 < Present_Tmin <= 28.9
30.0 < Present_Tmax <= 36.5
0.0 < LDAPS_CC3 <= 0.54
-------------------------------------------------------------------
subrgoup 60
0.0 < LDAPS_CC2 <= 0.58
4371.68 < Solar radiation <= 5649.38
14.27 < LDAPS_Tmin_lapse <= 22.17
37.59 < lat <= 37.65
0.16 < LDAPS_CC4 <= 0.96
85.04 < LDAPS_RHmax <= 100.0
-------------------------------------------------------------------
subrgoup 61
0.0 < LDAPS_CC2 <= 0.58
4371.68 < Solar radiation <= 5649.38
14.27 < LDAPS_Tmin

20.28 < LDAPS_Tmax_lapse <= 30.86
24.24 < LDAPS_RHmin <= 52.99
20.3 < Present_Tmax <= 30.09
88.59 < LDAPS_RHmax <= 100.0
126.98 < lon <= 127.14
79.03 < LDAPS_LH <= 200.41
-------------------------------------------------------------------
subrgoup 95
0.0 < LDAPS_CC2 <= 0.58
4371.68 < Solar radiation <= 5649.38
14.27 < LDAPS_Tmin_lapse <= 25.02
37.46 < lat <= 37.59
0.0 < LDAPS_CC4 <= 0.53
20.28 < LDAPS_Tmax_lapse <= 30.86
24.24 < LDAPS_RHmin <= 52.99
20.3 < Present_Tmax <= 30.09
88.59 < LDAPS_RHmax <= 100.0
126.98 < lon <= 127.14
6.77 < LDAPS_LH <= 79.03
-------------------------------------------------------------------
subrgoup 96
0.0 < LDAPS_CC2 <= 0.58
4371.68 < Solar radiation <= 5649.38
25.02 < LDAPS_Tmin_lapse <= 28.97
51.37 < LDAPS_RHmin <= 95.11
0.27 < LDAPS_CC3 <= 0.35
126.83 < lon <= 126.95
87.69 < LDAPS_LH <= 200.41
-------------------------------------------------------------------
subrgoup 97
0.0 < LDAPS_CC2 <= 0.58
4371.68 < Solar radiation <= 5649.38
25.02 < LDAPS_Tmin_l

In [30]:
'SAVE THE SUBGROUPS PATTERNS'
# Pickle the subgroup descriptions and the attribute names next to the other
# saved objects (save_obj appends '.pkl' to each target name).
save_obj(patt_descriptions, path + 'patterns')
save_obj(att_names, path + 'att_names')