<a href="https://colab.research.google.com/github/Noor-Z1/Machine-Learning/blob/main/Grid_Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np

class DataLoader:
    """Loaders for a space-separated credit-application data file.

    Every data line is expected to hold 20 attribute tokens followed by a
    class token (21 space-separated fields).  Lines shorter than five
    characters are treated as empty filler and skipped.  The category codes
    (``A11``, ``A12``, ...) look like those of the UCI Statlog German Credit
    dataset -- presumably that is the intended input; confirm with the data
    source.
    """

    # One entry per attribute column, in file order.  ``None`` marks a
    # numerical column (parsed with ``float``); a tuple lists the category
    # codes of a qualitative column, and a code's position in the tuple is
    # its ordinal encoding / the hot position of its one-hot vector.
    _CREDIT_SCHEMA = (
        # Attribute 1: status of existing checking account
        ("A11", "A12", "A13", "A14"),
        # Attribute 2: duration in month
        None,
        # Attribute 3: credit history
        ("A30", "A31", "A32", "A33", "A34"),
        # Attribute 4: purpose
        ("A40", "A41", "A42", "A43", "A44", "A45",
         "A46", "A47", "A48", "A49", "A410"),
        # Attribute 5: credit amount
        None,
        # Attribute 6: savings account / bonds
        ("A61", "A62", "A63", "A64", "A65"),
        # Attribute 7: present employment since
        ("A71", "A72", "A73", "A74", "A75"),
        # Attribute 8: installment rate in percentage of disposable income
        None,
        # Attribute 9: personal status and sex
        ("A91", "A92", "A93", "A94", "A95"),
        # Attribute 10: other debtors / guarantors
        ("A101", "A102", "A103"),
        # Attribute 11: present residence since
        None,
        # Attribute 12: property
        ("A121", "A122", "A123", "A124"),
        # Attribute 13: age in years
        None,
        # Attribute 14: other installment plans
        ("A141", "A142", "A143"),
        # Attribute 15: housing
        ("A151", "A152", "A153"),
        # Attribute 16: number of existing credits at this bank
        None,
        # Attribute 17: job
        ("A171", "A172", "A173", "A174"),
        # Attribute 18: number of people being liable to provide maintenance for
        None,
        # Attribute 19: telephone
        ("A191", "A192"),
        # Attribute 20: foreign worker
        ("A201", "A202"),
    )

    @staticmethod
    def _load_credit(file_path, one_hot):
        """Parse ``file_path`` and return ``(features, labels)``.

        Args:
            file_path: Path of the space-separated data file.
            one_hot: When true, each qualitative attribute is expanded into a
                0/1 indicator vector; otherwise it is replaced by the
                position of its code in ``_CREDIT_SCHEMA``.

        Returns:
            A ``float32`` array with one row per data line and an ``int32``
            array of labels (1 when the class token is ``"1"``, else 0).

        Raises:
            KeyError: A qualitative token is not a known category code.
            ValueError: A numerical token cannot be parsed by ``float``.
            IndexError: A data line has fewer than 21 fields.
        """
        lookups = [
            None if codes is None
            else {code: position for position, code in enumerate(codes)}
            for codes in DataLoader._CREDIT_SCHEMA
        ]
        label_column = len(lookups)

        dataset = []
        labels = []
        # The context manager closes the file even when a malformed line
        # raises part-way through parsing.
        with open(file_path, "r") as file:
            for line in file:
                line = line.strip("\n\r")
                # Skip filler lines that do not contain any information.
                if len(line) < 5:
                    continue
                parts = line.split(" ")
                data = []
                for column, lookup in enumerate(lookups):
                    token = parts[column]
                    if lookup is None:
                        data.append(float(token))
                    elif one_hot:
                        hot = lookup[token]
                        data.extend(
                            1 if position == hot else 0
                            for position in range(len(lookup))
                        )
                    else:
                        data.append(lookup[token])
                dataset.append(data)
                # Class token: 1 = good, 2 = bad for the credit application.
                labels.append(1 if parts[label_column] == "1" else 0)
        return (np.array(dataset, dtype=np.float32),
                np.array(labels, dtype=np.int32))

    @staticmethod
    def load_credit(file_path):
        """Load the file with qualitative attributes as ordinal integers.

        Returns a ``float32`` feature array with 20 columns per data line and
        an ``int32`` label array (1 = good, 0 = bad).
        """
        return DataLoader._load_credit(file_path, one_hot=False)

    @staticmethod
    def load_credit_with_onehot(file_path):
        """Load the file with qualitative attributes one-hot encoded.

        Returns a ``float32`` feature array with 63 columns per data line
        (7 numerical values plus 56 indicator columns) and an ``int32`` label
        array (1 = good, 0 = bad).
        """
        return DataLoader._load_credit(file_path, one_hot=True)

In [None]:

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import pickle
import numpy as np
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
import sklearn.ensemble as skle




def confidence_interval(x, N):
    """Return the ``(lower, upper)`` bounds ``mean(x) -/+ 1.96 * std(x) / sqrt(N)``.

    ``x`` is the sequence of scores and ``N`` the sample size used in the
    denominator.  ``np.std`` is called with its defaults.
    """
    centre = np.mean(x)
    half_width = (1.96) * (np.std(x) / np.sqrt(N))
    return centre - half_width, centre + half_width


# Hyperparameter grids.  Keys prefixed with "<step>__" address the estimator
# step of a pipeline built with make_pipeline; the random-forest grid uses
# plain keys because that search is carried out by hand.
knn_parameter_grid = dict(
    kneighborsclassifier__metric=["cosine", "euclidean", "manhattan"],
    kneighborsclassifier__n_neighbors=[2, 3, 4],
)

svm_parameter_grid = dict(
    svc__C=[0.1, 0.5],
    svc__kernel=["poly", "rbf"],
)

decision_tree_parameter_grid = dict(
    decisiontreeclassifier__criterion=["gini", "entropy"],
    decisiontreeclassifier__splitter=["best", "random"],
)

random_forest_parameter_grid = dict(
    criterion=["gini", "entropy"],
    n_estimators=[10, 100],
)


# Path of the credit data file; change it to wherever credit.data is stored
# on your machine.
data_path = "credit.data"
dataset, labels = DataLoader.load_credit_with_onehot(data_path)

# Outer loop: 3 folds x 5 repeats; inner loop: 5 folds x 5 repeats.  Each
# splitter gets a freshly drawn seed, so the partitions differ between runs.
outer_cross_validation = RepeatedStratifiedKFold(
    n_splits=3,
    n_repeats=5,
    random_state=np.random.randint(1, 1000),
)
inner_cross_validation = RepeatedStratifiedKFold(
    n_splits=5,
    n_repeats=5,
    random_state=np.random.randint(1, 1000),
)

# Per-model score accumulators.
knn_performance, svm_performance, decision_tree_performance = [], [], []

# Scores on the outer test folds, one entry per outer split.
knn_overall_performance, svm_overall_performance = [], []
decision_tree_overall_performance, rf_overall_performance = [], []

# Best inner-CV hyperparameters, keyed by outer iteration index.
best_inner, best_inner_svm, best_inner_decision_tree = {}, {}, {}

# Candidate-parameter lists.
all_combs, all_comb_svm, all_combs_decision_tree = [], [], []

# Outer iteration counter.
i = 0

def _best_candidate_scores(grid_search):
    """Return ``(best_index, scores)`` for the winner of a fitted GridSearchCV.

    ``scores`` holds the winner's test score on every inner split and is the
    sample used for its confidence interval.  The number of splits is read
    from the fitted search instead of being hard-coded.
    """
    best_index = grid_search.best_index_
    results = grid_search.cv_results_
    scores = [results['split{i}_test_score'.format(i=k)][best_index]
              for k in range(grid_search.n_splits_)]
    return best_index, scores


# Random forests are stochastic, so every (fold, configuration) pair is
# trained this many times and the scores are averaged.
rf_repetitions = 10

# Nested cross-validation: for every outer split, tune each model on the
# training part with the inner CV, then score the tuned model on the outer
# test part.
for train_indices, test_indices in outer_cross_validation.split(dataset, labels):

    current_training_part = dataset[train_indices]
    current_training_part_label = labels[train_indices]

    current_test_part = dataset[test_indices]
    current_test_part_label = labels[test_indices]

    print("---------------------------------------------------")
    print("Inner iteration number: %d \n" %(i+1))

    # ---- KNN -------------------------------------------------------------
    knn_pipeline = make_pipeline(MinMaxScaler(), KNeighborsClassifier())
    knn_grid_search = GridSearchCV(knn_pipeline, param_grid=knn_parameter_grid, refit=True, cv=inner_cross_validation, scoring="f1_micro")
    knn_grid_search.fit(current_training_part, current_training_part_label)

    best_inner[i] = knn_grid_search.best_params_
    all_comb = knn_grid_search.cv_results_['params']
    index, array_of_all_scores_of_current_best = _best_candidate_scores(knn_grid_search)

    print("The current best hyperparameter for KNN in the inner CV: ")
    print(all_comb[index])
    print("Mean test score of this hyperparamter      : %f" %(knn_grid_search.cv_results_['mean_test_score'][index]))
    print( " With the confidence interval: [  %f   %f ] \n"  %(confidence_interval(array_of_all_scores_of_current_best, len(array_of_all_scores_of_current_best))))

    # ---- SVM -------------------------------------------------------------
    svm_pipeline = make_pipeline(MinMaxScaler(), SVC())
    svm_grid_search = GridSearchCV(svm_pipeline, param_grid=svm_parameter_grid, refit=True, cv=inner_cross_validation, scoring="f1_micro")
    svm_grid_search.fit(current_training_part, current_training_part_label)

    best_inner_svm[i] = svm_grid_search.best_params_
    all_comb_svm = svm_grid_search.cv_results_['params']
    index_svm, array_of_all_scores_of_current_best_svm = _best_candidate_scores(svm_grid_search)

    print("The current best hyperparameter for SVM in the inner CV: ")
    print(all_comb_svm[index_svm])
    print("Mean test score of this hyperparamter : %f" %(svm_grid_search.cv_results_['mean_test_score'][index_svm]))
    print( " With the confidence interval: [  %f   %f ] \n"  %(confidence_interval(array_of_all_scores_of_current_best_svm, len(array_of_all_scores_of_current_best_svm))))

    # ---- Decision tree ---------------------------------------------------
    decision_tree_pipeline  = make_pipeline(MinMaxScaler(), DecisionTreeClassifier())
    decision_tree_grid_search = GridSearchCV(decision_tree_pipeline, param_grid=decision_tree_parameter_grid, refit=True, cv=inner_cross_validation, scoring="f1_micro")
    decision_tree_grid_search.fit(current_training_part, current_training_part_label)

    best_inner_decision_tree[i] = decision_tree_grid_search.best_params_
    all_comb_decision_tree = decision_tree_grid_search.cv_results_['params']
    index_decision_tree, array_of_all_scores_of_current_best_decision_tree = _best_candidate_scores(decision_tree_grid_search)

    print("The current best hyperparameter for decision tree in the inner CV: ")
    print(all_comb_decision_tree[index_decision_tree])
    print("Mean test score of this hyperparamter      : %f" %(decision_tree_grid_search.cv_results_['mean_test_score'][index_decision_tree]))
    print( " With the confidence interval: [  %f   %f ] \n"  %(confidence_interval(array_of_all_scores_of_current_best_decision_tree, len(array_of_all_scores_of_current_best_decision_tree))))

    # ---- Random forest (grid search done manually) -------------------------
    # Scores are keyed by the hyperparameters only.  The repetition index is
    # not a hyperparameter: keying on it would make the "winner" the luckiest
    # single repetition and inflate its reported mean.
    rf_performance = dict()

    for inner_train_indices, inner_test_indices in inner_cross_validation.split(current_training_part, current_training_part_label):

        inner_training_dataset = current_training_part[inner_train_indices]
        inner_training_label = current_training_part_label[inner_train_indices]

        inner_test_dataset = current_training_part[inner_test_indices]
        inner_test_label = current_training_part_label[inner_test_indices]

        # The scaler depends only on the fold, so fit it once per fold and
        # only on the fold's training portion.
        inner_scaler = MinMaxScaler()
        inner_scaler.fit(inner_training_dataset)
        scaled_inner_training_dataset = inner_scaler.transform(inner_training_dataset)
        scaled_inner_test_dataset = inner_scaler.transform(inner_test_dataset)

        for criterion in random_forest_parameter_grid["criterion"]:
            for n_estimators in random_forest_parameter_grid["n_estimators"]:

                repetition_scores = []
                for l in range(rf_repetitions):
                    classifier = skle.RandomForestClassifier(criterion=criterion, n_estimators=n_estimators)
                    classifier.fit(scaled_inner_training_dataset, inner_training_label)
                    predicted = classifier.predict(scaled_inner_test_dataset)
                    repetition_scores.append(f1_score(inner_test_label, predicted, average="micro"))

                # One entry per inner fold: the mean over the repetitions.
                rf_performance.setdefault((criterion, n_estimators), []).append(np.mean(repetition_scores))

    # Configuration with the highest mean fold score (first one wins ties).
    best_parameter_RF = max(rf_performance, key=lambda params: np.mean(rf_performance[params]))
    best_score_RF = np.mean(rf_performance[best_parameter_RF])

    # inner_winner is (criterion, n_estimators).
    inner_winner = best_parameter_RF

    print("The current best Random Forest hyperparameter inside inner CV:")
    print(inner_winner)
    print("With mean score of %f" %(best_score_RF))
    print("And confidence interval of [%f   %f]"   %(confidence_interval(rf_performance[inner_winner], len(rf_performance[inner_winner]))))

    # Refit the winning forest on the whole outer training part, scaled with
    # statistics from that training part only.
    outer_scaler = MinMaxScaler()
    outer_scaler.fit(current_training_part)

    rf_with_best_param = skle.RandomForestClassifier(criterion=inner_winner[0], n_estimators=inner_winner[1])
    rf_with_best_param.fit(outer_scaler.transform(current_training_part), current_training_part_label)

    i=i+1

    # ---- Outer test-set evaluation ------------------------------------------
    # refit=True means each grid search predicts with its best pipeline,
    # retrained on the full outer training part.
    knn_predicted = knn_grid_search.predict(current_test_part)
    knn_overall_performance.append(f1_score(current_test_part_label, knn_predicted, average="micro"))

    svm_predicted = svm_grid_search.predict(current_test_part)
    svm_overall_performance.append(f1_score(current_test_part_label, svm_predicted, average="micro"))

    decision_tree_predicted = decision_tree_grid_search.predict(current_test_part)
    decision_tree_overall_performance.append(f1_score(current_test_part_label, decision_tree_predicted, average="micro"))

    rf_predicted = rf_with_best_param.predict(outer_scaler.transform(current_test_part))
    rf_overall_performance.append(f1_score(current_test_part_label, rf_predicted, average="micro"))





# Final report: mean outer-test F1 score and its confidence interval for
# every model, printed in a fixed order.
print("----------------------------------------------------------------------------------------------")
print("Different algorithms performance on the outer test set  (WRT F1 score): ")

for heading, outer_scores, interval_format in (
    ("KNN OVERALL PERFORMANCE: ", knn_overall_performance,
     "With the confidence interval: [  %f   %f ] \n"),
    ("SVM OVERALL PERFORMANCE: ", svm_overall_performance,
     "With the confidence interval: [  %f   %f ] \n"),
    ("DECISION TREE OVERALL PERFORMANCE: ", decision_tree_overall_performance,
     "With the confidence interval: [  %f   %f ] \n"),
    ("RF OVERALL PERFORMANCE: ", rf_overall_performance,
     "With the confidence interval: [ %f  %f] \n"),
):
    print(heading)
    print(np.mean(outer_scores))
    print(interval_format % (confidence_interval(outer_scores, len(outer_scores))))












---------------------------------------------------
Inner iteration number: 1 

The current best hyperparameter for KNN in the inner CV: 
{'kneighborsclassifier__metric': 'manhattan', 'kneighborsclassifier__n_neighbors': 3}
Mean test score of this hyperparamter      : 0.723748
 With the confidence interval: [  0.711189   0.736307 ] 

The current best hyperparameter for SVM in the inner CV: 
{'svc__C': 0.5, 'svc__kernel': 'poly'}
Mean test score of this hyperparamter : 0.763670
 With the confidence interval: [  0.751914   0.775425 ] 

The current best hyperparameter for decision tree in the inner CV: 
{'decisiontreeclassifier__criterion': 'entropy', 'decisiontreeclassifier__splitter': 'random'}
Mean test score of this hyperparamter      : 0.679241
 With the confidence interval: [  0.665878   0.692604 ] 

The current best Random Forest hyperparameter inside inner CV:
(6, 'entropy', 100)
With mean score of 0.772984
And confidence interval of [0.762105   0.783863]
-------------------------