In [60]:
import numpy as np
import pandas as pd
from scipy.spatial import distance
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import seaborn as sn

In [61]:
# Load the case library (previously solved cases) and the new cases to solve.
# Forward slashes are used because they resolve on every OS; the previous
# backslash paths only worked on Windows and relied on "\l" and "\c" being
# unrecognised escape sequences (which newer Python versions warn about).
library = pd.read_csv("input/library.csv")
cases = pd.read_csv("input/cases.csv")

In [62]:
# Inspect the library's column labels. The output shows a leading space on
# every label after the first (e.g. ' Temperature'), so later cells must use
# those exact names (e.g. " Play").
library.columns

Index(['Outlook', ' Temperature', ' Humidity', ' Windy', ' Play'], dtype='object')

In [63]:
# Inspect the column labels of the new cases: per the output, the same
# attribute labels as the library but without the ' Play' column.
cases.columns

Index(['Outlook', ' Temperature', ' Humidity', ' Windy'], dtype='object')

In [64]:
# Display the case library as loaded (the last column holds the solution).
library

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
0,Sunny,Hot,High,False,No
1,Sunny,Hot,High,True,No
2,Overcast,Hot,High,False,Yes
3,Rainy,Mild,High,False,Yes
4,Rainy,Cool,Normal,False,Yes
5,Rainy,Cool,Normal,True,No
6,Overcast,Cool,Normal,True,Yes
7,Sunny,Mild,High,False,No
8,Sunny,Cool,Normal,False,Yes
9,Rainy,Mild,Normal,False,Yes


In [65]:
# Display the new cases that still need a solution.
cases

Unnamed: 0,Outlook,Temperature,Humidity,Windy
0,Sunny,Mild,Normal,False
1,Rainy,Cool,Normal,False
2,Overcast,Cool,High,False
3,Sunny,Cool,High,True
4,Rainy,Hot,High,True
5,Rainy,Cool,High,True


# Retrieve
Tahap retrieve merupakan tahapan mencari solusi dari kasus lama yang relevan dengan kasus baru. Nilai kesamaan antara kasus lama dengan kasus baru akan dicari pada tahap ini. Terdapat beberapa metode retrieve yang bisa digunakan, yaitu nearest neighbor similarity, hamming distance similarity, euclidean distance similarity, dan minkowski distance similarity. Penelitian ini menggunakan metode Mahalanobis distance untuk menemukan solusi kasus lama yang relevan dengan kasus baru.

### Indexing (One hot encoder)

In [66]:
# Select the library columns used as base cases: every column except the
# last one, which holds the solution.
base = library.iloc[:, :-1]

# One-hot encode the base cases.
base = pd.get_dummies(base).astype(np.int8)

# One-hot encode the problem cases and align them to the base columns.
# Encoding the two frames independently yields different columns whenever
# `cases` lacks (or adds) a category, which would make the Mahalanobis
# vectors incomparable. After alignment, categories missing from `cases`
# become all-zero columns; categories unseen in the library are dropped.
problems = (
    pd.get_dummies(cases)
    .reindex(columns=base.columns, fill_value=0)
    .astype(np.int8)
)

In [67]:
# Show the one-hot encoded base cases (one 0/1 column per attribute value).
base

Unnamed: 0,Outlook_Overcast,Outlook_Rainy,Outlook_Sunny,Temperature_ Cool,Temperature_ Hot,Temperature_ Mild,Humidity_ High,Humidity_ Normal,Windy_ False,Windy_ True
0,0,0,1,0,1,0,1,0,1,0
1,0,0,1,0,1,0,1,0,0,1
2,1,0,0,0,1,0,1,0,1,0
3,0,1,0,0,0,1,1,0,1,0
4,0,1,0,1,0,0,0,1,1,0
5,0,1,0,1,0,0,0,1,0,1
6,1,0,0,1,0,0,0,1,0,1
7,0,0,1,0,0,1,1,0,1,0
8,0,0,1,1,0,0,0,1,1,0
9,0,1,0,0,0,1,0,1,1,0


In [68]:
# Show the one-hot encoded problem cases.
problems

Unnamed: 0,Outlook_Overcast,Outlook_Rainy,Outlook_Sunny,Temperature_ Cool,Temperature_ Hot,Temperature_ Mild,Humidity_ High,Humidity_ Normal,Windy_ False,Windy_ True
0,0,0,1,0,0,1,0,1,1,0
1,0,1,0,1,0,0,0,1,1,0
2,1,0,0,1,0,0,1,0,1,0
3,0,0,1,1,0,0,1,0,0,1
4,0,1,0,0,1,0,1,0,0,1
5,0,1,0,1,0,0,1,0,0,1


## Similarity Base-Problem (Retrieve)

In [69]:
# The inverse covariance matrix depends only on the base cases, so it is
# computed once here instead of once per problem case.
# np.linalg.pinv (pseudo-inverse) is used rather than a plain inverse --
# presumably because the one-hot columns of each attribute are linearly
# dependent, which makes the covariance matrix singular.
covariance_matrix = base.cov()                                      # Covariance
inverse_covariance_matrix = np.linalg.pinv(covariance_matrix)       # Inverse

# distance_matrix[i, j] is the Mahalanobis distance between problem case i
# and base case j; it keeps the result for every problem case.
distance_matrix = np.zeros((problems.shape[0], base.shape[0]))

# Iterate over every problem case
for i in range(problems.shape[0]):
    # Row of the problem case being evaluated
    case_row = problems.iloc[i, :].values
    # Empty array to store the Mahalanobis distances obtained by comparing
    # this problem case with each library case
    distances = np.zeros(base.shape[0])
    # Iterate over every base case row
    for j in range(base.shape[0]):
        # Row of the base case
        base_row = base.iloc[j, :].values

        # Mahalanobis distance between the problem case and the base case
        distances[j] = distance.mahalanobis(case_row, base_row, inverse_covariance_matrix)
    distance_matrix[i] = distances

# NOTE: after the loop, `distances`, `i` and `j` refer to the LAST problem
# case / base case. The cells below read these names, so they effectively
# operate on the last problem case only.
retrieve = pd.Series(distances)
retrieve

0     3.853846
1     3.645345
2     3.959642
3     3.436675
4     3.076899
5     2.520284
6     3.628191
7     4.360344
8     4.138753
9     5.042114
10    5.402015
11    3.994935
12    5.476568
13    2.866977
dtype: float64

In [70]:
# Position of the smallest value in `distances`, i.e. the base case closest to
# the problem case. `distances` is overwritten on every iteration of the
# retrieve loop, so this refers to the last problem case evaluated.
np.argmin(distances)

5

# Reuse

In [71]:
# Reuse: keep only the base cases whose distance to the problem case is
# below 80% of the mean distance.
threshold = 0.8 * distances.mean()
value = pd.Series(distances < threshold)    # Boolean mask, one entry per base case
index = value[value].index                  # Positions where the mask is True
reuse = base.iloc[index, :]
reuse

Unnamed: 0,Outlook_Overcast,Outlook_Rainy,Outlook_Sunny,Temperature_ Cool,Temperature_ Hot,Temperature_ Mild,Humidity_ High,Humidity_ Normal,Windy_ False,Windy_ True
4,0,1,0,1,0,0,0,1,1,0
5,0,1,0,1,0,0,0,1,0,1
13,0,1,0,0,0,1,1,0,0,1


# Revise
- Empty = True => KNN
- Empty = False => CBR

In [72]:
# Revise: decide how the new case(s) obtain a solution.
#   - `reuse` is empty     -> no base case is close enough; classify with KNN.
#   - `reuse` is not empty -> CBR: copy the solution of the closest base case.
if reuse.empty:
    print("KNN")
    X_train = base                      # One-hot encoded library attributes
    y_train = library.iloc[:, -1]       # Last library column holds the solution
    X_test = problems
    covariance_matrix = X_train.cov()                                   # Covariance
    inverse_covariance_matrix = np.linalg.pinv(covariance_matrix)       # Inverse

    # scikit-learn's Mahalanobis metric expects the INVERSE covariance under
    # the key "VI"; "V" denotes the (non-inverted) covariance matrix, so
    # passing the inverse as "V" would make the metric use the wrong matrix.
    model = KNeighborsClassifier(metric='mahalanobis', metric_params={"VI": inverse_covariance_matrix})
    model.fit(X_train, y_train)
    y_test = pd.Series(model.predict(X_test))
    cases[" Play"] = y_test
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    library = pd.concat([library, cases], ignore_index=True)     # Append to library
else:
    print("CBR")
    # Index (row) of the minimum distance. The argmin must be taken over
    # `distances`: taking it over `reuse` (a one-hot encoded DataFrame) returns
    # the flat position of a 0 in that table, not the closest base case.
    # Whenever `reuse` is non-empty the overall closest case is part of it.
    min_distance_row = int(np.argmin(distances))

    # Build the solved case: attributes of the problem case plus the solution
    # of the closest library case. `i` comes from the retrieve loop and is the
    # index of the last problem case.
    case = np.append(cases.iloc[i, :], library.iloc[min_distance_row, -1])
    # Wrap as a pandas Series labelled like the library columns
    case = pd.Series(case, index = library.columns)         # Case with Solution
    # Append as a single row using the public API (instead of private _append)
    library = pd.concat([library, case.to_frame().T], ignore_index = True)

    # Save 'covariance heat map (biased)' output as file
    # NOTE(review): np.cov defaults to rowvar=True, so this is the covariance
    # between cases (rows), unlike base.cov() which is between attributes --
    # confirm which one is intended.
    sn.heatmap(np.cov(base, bias = True), annot = True, fmt = 'g')
    plt.gcf().set_size_inches(12, 6)
    # `i` and `j` are the final loop indices left over from the retrieve cell
    plt.title(f'Covariance Heat map #{i} \n Library cases stored {j} - Base to solve problem {i}')
    plt.savefig(f'output/covariance_heat_map_{i}.png', bbox_inches='tight')
    plt.close()

CBR


# Retain

In [73]:
# Show the last `problems.shape[0]` rows of the library. Note that the KNN
# branch of the revise step appends every case, while the CBR branch appends
# a single case, so not all of these rows are necessarily new.
library.tail(problems.shape[0])

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
9,Rainy,Mild,Normal,False,Yes
10,Sunny,Mild,Normal,True,Yes
11,Overcast,Mild,High,True,Yes
12,Overcast,Hot,Normal,False,Yes
13,Rainy,Mild,High,True,No
14,Rainy,Cool,High,True,No


In [74]:
# Display the whole library after the revise step.
library

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
0,Sunny,Hot,High,False,No
1,Sunny,Hot,High,True,No
2,Overcast,Hot,High,False,Yes
3,Rainy,Mild,High,False,Yes
4,Rainy,Cool,Normal,False,Yes
5,Rainy,Cool,Normal,True,No
6,Overcast,Cool,Normal,True,Yes
7,Sunny,Mild,High,False,No
8,Sunny,Cool,Normal,False,Yes
9,Rainy,Mild,Normal,False,Yes


In [75]:
# Retain: persist the updated library.
# index=False keeps the row index out of the file; otherwise it is written as
# an unnamed first column, re-read as 'Unnamed: 0' on the next run and fed
# into the one-hot encoding as if it were an attribute.
# Forward slashes keep the path portable across operating systems.
# NOTE: this overwrites the input file, so re-running the notebook appends the
# solved cases to the library again.
library.to_csv("input/library.csv", index=False)

<bound method NDFrame.to_csv of      Outlook  Temperature  Humidity   Windy  Play
0      Sunny          Hot      High   False    No
1      Sunny          Hot      High    True    No
2   Overcast          Hot      High   False   Yes
3      Rainy         Mild      High   False   Yes
4      Rainy         Cool    Normal   False   Yes
5      Rainy         Cool    Normal    True    No
6   Overcast         Cool    Normal    True   Yes
7      Sunny         Mild      High   False    No
8      Sunny         Cool    Normal   False   Yes
9      Rainy         Mild    Normal   False   Yes
10     Sunny         Mild    Normal    True   Yes
11  Overcast         Mild      High    True   Yes
12  Overcast          Hot    Normal   False   Yes
13     Rainy         Mild      High    True    No
14     Rainy         Cool      High    True    No>