# Results visualization

In [67]:
# dataframe management
import os.path
import time

import numpy as np
import pandas as pd
from pyomo.environ import *
from sklearn import datasets
from sklearn import preprocessing
from sklearn.cluster import KMeans, AgglomerativeClustering, Birch
from sklearn.metrics import completeness_score, homogeneity_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sorct import SORCT
from src.cluster import HierarchicalLogisticRegression, best_leaf_assignment
from src.utils import get_number_of_iterations
from sklearn.model_selection import KFold
from src.cluster import find_best_estimator
from pyomo.opt import SolverStatus, TerminationCondition
import pickle
import run_tests

In [73]:
# Location of the experiment outputs and the datasets to summarise.
res_path = "results"
#names = ["car", "iris", "seeds_data", "new_thyroid", "splice"]
names = ["iris", "car"]

# Preview the raw per-fold tables of the first dataset in `names`.
# Both files are space-separated and keep the row labels in their first column.
path = os.path.join(res_path, "{}_results.csv".format(names[0]))
df = pd.read_csv(path, sep=" ", index_col=0)
# Built from res_path / names[0] instead of a hard-coded "results/iris_sorct.csv",
# so this preview always shows the same dataset as `df`.
rob = pd.read_csv(os.path.join(res_path, "{}_sorct.csv".format(names[0])), sep=" ", index_col=0)
rob

Unnamed: 0,Time_0,Iterations_0,Score_0,Score_Train_0,Time_1,Iterations_1,Score_1,Score_Train_1,Time_2,Iterations_2,Score_2,Score_Train_2,Time_3,Iterations_3,Score_3,Score_Train_3,Time_4,Iterations_4,Score_4,Score_Train_4
SORCT,0.186198,7.0,0.266667,0.35,3.768789,233.0,0.733333,0.708333,5.793448,253.0,0.6,0.683333,2.271399,129.0,0.633333,0.733333,3.261475,129.0,0.766667,0.7


In [74]:
# Display the per-fold results table currently bound to `df`.
df

Unnamed: 0,Time_0,HLR_Time_0,Iterations_0,HLR_Score_0,SORCT_Score_0,Homogeneity_0,Completeness_0,Time_1,HLR_Time_1,Iterations_1,...,HLR_Score_Train_0,SORCT_Score_Train_0,HLR_Score_Train_1,SORCT_Score_Train_1,HLR_Score_Train_2,SORCT_Score_Train_2,HLR_Score_Train_3,SORCT_Score_Train_3,HLR_Score_Train_4,SORCT_Score_Train_4
kmeans,6.788942,0.020671,278,0.7,0.6,0.621987,0.795115,3.710655,0.018154,143,...,0.6,0.741667,0.666667,0.708333,0.641667,0.683333,0.608333,0.733333,0.625,0.7
Agglomerative_sigle,8.302376,0.017759,385,0.7,0.6,0.621987,0.795115,0.209205,0.01634,6,...,0.6,0.741667,0.666667,0.333333,0.641667,0.683333,0.608333,0.733333,0.625,0.7
birch,0.19404,0.026767,8,0.7,0.266667,0.0,1.0,7.389553,0.01346,289,...,0.6,0.35,0.666667,0.708333,0.641667,0.683333,0.608333,0.733333,0.625,0.358333
True_labels,3.387764,0.018522,131,0.7,0.6,1.0,1.0,6.71528,0.01664,301,...,0.6,0.741667,0.666667,0.708333,0.641667,0.683333,0.608333,0.733333,0.625,0.7


In [75]:
IGNORE_HLR_TIME = True
N_FOLDS = 5
# Values searched for in the results table to detect folds that were not computed
# (presumably the error codes written by the experiment script - TODO confirm).
INVALID_FOLD_MARKERS = [-1, -2, -3]

# (summary column, per-fold column template) for the rows of "<name>_results.csv",
# listed in the order in which the summary columns are created.
CLUSTERING_METRICS = [
    ("Time", "Time_{}"),
    ("HLR_Time", "HLR_Time_{}"),
    ("Iterations", "Iterations_{}"),
    ("Train_HLR_Score", "HLR_Score_Train_{}"),
    ("Train_SORCT_Score", "SORCT_Score_Train_{}"),
    ("HLR_Score", "HLR_Score_{}"),
    ("SORCT_Score", "SORCT_Score_{}"),
    ("Homogeneity", "Homogeneity_{}"),
    ("Completeness", "Completeness_{}"),
]
# Same kind of mapping for the "SORCT" row of "<name>_sorct.csv".
SORCT_METRICS = [
    ("Time", "Time_{}"),
    ("Iterations", "Iterations_{}"),
    ("SORCT_Score", "Score_{}"),
    ("Train_SORCT_Score", "Score_Train_{}"),
]


def _average_over_valid_folds(row, metrics, n_folds):
    """Average the per-fold metrics of one results row, skipping invalid folds.

    A fold is invalid when its ``Time_<fold>`` entry is negative.

    Parameters
    ----------
    row : pandas.Series
        One row of a results table, indexed by the per-fold column names.
    metrics : list of (str, str)
        ``(summary column, per-fold column template)`` pairs; every template
        is formatted with the fold index.
    n_folds : int
        Number of folds stored in ``row``.

    Returns
    -------
    averages : dict
        Summary column -> mean over the valid folds (NaN when no fold is valid).
    invalid_folds : list of int
        Indices of the folds that were skipped.
    """
    invalid_folds = [k for k in range(n_folds) if row["Time_{}".format(k)] < 0]
    valid_folds = [k for k in range(n_folds) if k not in invalid_folds]
    averages = {}
    for summary_column, template in metrics:
        if valid_folds:
            total = sum(row[template.format(k)] for k in valid_folds)
            averages[summary_column] = total / len(valid_folds)
        else:
            # every fold failed: report NaN instead of dividing by zero
            averages[summary_column] = np.nan
    return averages, invalid_folds


# The HLR_Time_<fold> columns are not read at all when IGNORE_HLR_TIME is set.
clustering_metrics = [
    (summary_column, template)
    for summary_column, template in CLUSTERING_METRICS
    if not (IGNORE_HLR_TIME and summary_column == "HLR_Time")
]

# One summary frame per dataset, in the order of `names`.
dfs = []
for name in names:
    path = os.path.join(res_path, "{}_results.csv".format(name))
    df = pd.read_csv(path, sep=" ", index_col=0)
    sorct_df = pd.read_csv(os.path.join(res_path, "{}_sorct.csv".format(name)), sep=" ", index_col=0)
    # `-1 in df` only inspects the column labels, so test the cell values instead.
    if df.isin(INVALID_FOLD_MARKERS).any().any():
        print("Some folds were not computed")
    res_index = df.index.append(sorct_df.index)
    result_df = pd.DataFrame(index=res_index)

    for cl_name in res_index:
        if cl_name != "SORCT":
            # rows coming from "<name>_results.csv"
            row = df.loc[cl_name]
            averages, invalid_folds = _average_over_valid_folds(row, clustering_metrics, N_FOLDS)
            for fold_index in invalid_folds:
                print("{} / {}: fold {} skipped (Time = {})".format(
                    name, cl_name, fold_index, row["Time_{}".format(fold_index)]))
            for summary_column, value in averages.items():
                result_df.loc[cl_name, summary_column] = value
            result_df.loc[cl_name, "Invalid Folds"] = len(invalid_folds)
        else:
            # the "SORCT" row comes from "<name>_sorct.csv"
            averages, invalid_folds = _average_over_valid_folds(sorct_df.loc["SORCT"], SORCT_METRICS, N_FOLDS)
            result_df.loc[cl_name, "Invalid Folds"] = len(invalid_folds)
            for summary_column, value in averages.items():
                result_df.loc[cl_name, summary_column] = value

    #result_df["Invalid Folds"] = result_df["Invalid Folds"].astype("int32")
    dfs.append(result_df)
    result_df.to_csv(os.path.join(res_path, "{}_final.csv".format(name)), float_format='%.2f')
    
        
    

-1.0
-1.0


In [76]:
# Averaged summary of the second dataset in `names` (dfs is filled in the order of `names`).
dfs[1]

Unnamed: 0,Time,Iterations,Train_HLR_Score,Train_SORCT_Score,HLR_Score,SORCT_Score,Homogeneity,Completeness,Invalid Folds
kmeans,248.518142,1243.4,0.753853,0.54879,0.747187,0.516185,0.136548,0.082333,0.0
Agglomerative_sigle,103.241131,542.4,0.753853,0.660848,0.747187,0.624833,0.155252,0.093587,0.0
birch,128.442929,697.25,0.74293,0.624195,0.746546,0.560545,0.138434,0.090244,1.0
True_labels,141.129395,755.0,0.753853,0.655119,0.747187,0.616114,1.0,1.0,0.0
SORCT,127.01386,703.8,,0.565029,,0.540438,,,0.0


In [62]:
# NOTE(review): `plt` is not imported in the import cell of this notebook and the
# recorded output of this cell is "NameError: name 'plt' is not defined"; add
# `import matplotlib.pyplot as plt` to the import cell before running it.
#
# One bar chart per dataset: the SORCT_Score of every summary row divided by the
# SORCT_Score of the "SORCT" row, so that row is always drawn at 1.
for plotted_name, summary_df in zip(names, dfs):
    fig, ax = plt.subplots()
    relative_score = summary_df["SORCT_Score"] / summary_df.loc["SORCT", "SORCT_Score"]
    ax.bar(list(summary_df.index), relative_score)
    ax.set_ylim([0.8, 1.1])
    # label the figure so it can be told apart from the other datasets
    ax.set_title("{}: SORCT_Score relative to SORCT".format(plotted_name))
    ax.set_ylabel("SORCT_Score / SORCT_Score of SORCT")

NameError: name 'plt' is not defined

In [None]:
# NOTE(review): `plt` is not imported in the import cell of this notebook; add
# `import matplotlib.pyplot as plt` to the import cell before running this cell.
#
# One bar chart per dataset: the average Time of every summary row divided by the
# Time of the "SORCT" row, so that row is always drawn at 1.
for plotted_name, summary_df in zip(names, dfs):
    fig, ax = plt.subplots()
    relative_time = summary_df["Time"] / summary_df.loc["SORCT", "Time"]
    ax.bar(list(summary_df.index), relative_time)
    # the argument-less ax.set_ylim() call was dropped: it does not change the limits
    ax.set_title("{}: Time relative to SORCT".format(plotted_name))
    ax.set_ylabel("Time / Time of SORCT")

In [None]:
# NOTE(review): `plt` is not imported in the import cell of this notebook; add
# `import matplotlib.pyplot as plt` to the import cell before running this cell.
#
# One bar chart per dataset: the average Iterations of every summary row divided by
# the Iterations of the "SORCT" row, so that row is always drawn at 1.
for plotted_name, summary_df in zip(names, dfs):
    fig, ax = plt.subplots()
    relative_iterations = summary_df["Iterations"] / summary_df.loc["SORCT", "Iterations"]
    ax.bar(list(summary_df.index), relative_iterations)
    ax.set_title("{}: Iterations relative to SORCT".format(plotted_name))
    ax.set_ylabel("Iterations / Iterations of SORCT")

In [None]:
# IPython shell-style line (not plain Python): list the contents of the results directory.
ls results

## Test train set 

In [None]:
dataset_name_list = ["car", "iris","new_thyroid", "seeds_data", "splice"]
N_SPLITS = 5
OPT_TYPE = "simple"
SEED = 1234
for dataset_name in dataset_name_list:
    # ---- load the dataset; the label is expected in the "Classes" column ----
    if dataset_name == "iris":
        X, y = datasets.load_iris(as_frame=True, return_X_y=True)
        df = pd.DataFrame(X)
        df["Classes"] = y
    elif dataset_name == "car":
        dataset_path = os.path.join("datasets", "{}.csv".format(dataset_name))
        df = pd.read_csv(dataset_path, delimiter=";", header=0)
        df = df.convert_dtypes()
        # dictionaries converting ordinal categories to values
        cost_dict = {"low": 0, "med": 1, "high": 2, "vhigh": 3}
        doors_dict = {"2": 2, "3": 3, "4": 4, "5more": 5}
        persons_dict = {"2": 2, "4": 4, "more": 5}
        dimension_dict = {"small": 0, "med": 1, "big": 2}
        df["buying"] = df["buying"].apply(lambda x: cost_dict[x])
        df["maint"] = df["maint"].apply(lambda x: cost_dict[x])
        df["doors"] = df["doors"].apply(lambda x: doors_dict[x])
        df["persons"] = df["persons"].apply(lambda x: persons_dict[x])
        df["lug_boot"] = df["lug_boot"].apply(lambda x: dimension_dict[x])
        df["safety"] = df["safety"].apply(lambda x: cost_dict[x])
        classes_encoder = preprocessing.LabelEncoder().fit(df["Classes"])
        df["Classes"] = classes_encoder.transform(df["Classes"])
    else:
        dataset_path = os.path.join("datasets", "{}.csv".format(dataset_name))
        df = pd.read_csv(dataset_path, delimiter=";", header=0)
    if "Id" in df:
        df = df.drop('Id', axis=1)

    # ---- encode the labels and scale the features ----
    df_std = df.copy()
    scaler = MinMaxScaler()  # also MaxAbsScaler()
    # every column except the last one is treated as a feature to be scaled
    columns_names = list(df)
    feature_columns = columns_names[0:-1]
    index_features = list(range(0, len(df_std.columns) - 1))
    # The name of the classes K (raw labels, taken before the encoding below)
    classes = df_std['Classes'].unique().tolist()
    # Labels after encoding: LabelEncoder maps the K classes to 0..K-1
    classes_en = [i for i in range(len(classes))]
    # Encoder processing
    le = preprocessing.LabelEncoder()
    le.fit(df_std['Classes'])
    df_std['Classes'] = le.transform(df_std['Classes'])
    # Scaling phase
    df_std[feature_columns] = scaler.fit_transform(df_std[feature_columns])
    # Clip the *scaled* values to [0, 1]. The masks used to be computed on the unscaled
    # `df`, which overwrote the scaled value of every raw entry greater than 1 with 1.
    # NOTE(review): if the pickled models were fitted on data produced by the old
    # masking, confirm that this preprocessing still matches them.
    df_std[feature_columns] = df_std[feature_columns].clip(lower=0, upper=1)

    X = df_std[columns_names[:-1]]
    y = df_std[columns_names[-1]]
    kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
    n_leaves = 4
    for fold_index, (train_index, test_index) in enumerate(kf.split(X, y)):
        print("Fold", fold_index)
        # KFold.split yields positional indices, hence .iloc rather than .loc
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        df_train = pd.concat([X_train, y_train], axis=1)
        df_test = pd.concat([X_test, y_test], axis=1)
        # sample weighting: every sample gets the relative frequency of its class.
        # Counted on the encoded labels (classes_en) because y_train is encoded;
        # the raw labels in `classes` need not match the encoded values.
        occurences = [len(y_train[y_train == class_label]) for class_label in classes_en]
        total_samples = sum(occurences)
        sample_weight = np.zeros_like(y_train)
        for class_label, n_occurr in zip(classes_en, occurences):
            sample_weight[y_train == class_label] = n_occurr
        sample_weight = sample_weight / total_samples
        # example of a stored file name: HLR_car_Agglomerative_sigle_0.pkl
        # HLR
        # true labels
        filename = "HLR_{}_{}_{}.pkl".format("tl",dataset_name, fold_index)
        # pickle.load can execute arbitrary code: only open files produced by trusted runs
        with open(os.path.join("results", filename), "rb") as f:
            hlr = pickle.load(f)
        # NOTE(review): called without arguments and the result is discarded, while
        # train_test.create_model is called with (init, OPT_TYPE, df_train) in another
        # cell - this looks unfinished, confirm the intended call.
        run_tests.create_model()

In [None]:
# IPython shell-style line (not plain Python): list the contents of the results directory.
ls results

In [None]:
# Unpickle one stored result file into `roba` for manual inspection.
# pickle.load can execute arbitrary code: only open files produced by trusted runs.
sorct_pickle_path = "results/SORCT_new_thyroid_birch_4.pkl"
with open(sorct_pickle_path, "rb") as pickle_file:
    roba = pickle.load(pickle_file)

In [None]:
# Inspect the keys of the object unpickled into `roba`.
roba.keys()

In [None]:
# Display the whole unpickled object.
roba

In [None]:
# Peek at the first row label of the current training frame.
df_train.index[0]

In [None]:
# Scratch DataFrame without columns and with the row labels a, b, c.
aa = pd.DataFrame(index=["a", "b", "c"])

In [None]:
# Scratch check: assigning through .loc to a column that does not exist yet creates it.
aa.loc["a","culo"] = 1

In [None]:
# Display the scratch frame.
aa

In [None]:
# Scratch check: assigning through .loc to a row label that is not in the index ("d") adds that row.
aa.loc["d", "culo"] = 1

In [None]:
# Index.append only accepts Index objects (or a list/tuple of them); a plain list of
# labels raises "TypeError: all inputs must be Index", so the label is wrapped first.
# The call returns a new Index and leaves `aa` itself unchanged.
aa.index.append(pd.Index(["e"]))

In [9]:
import train_test

In [10]:
# Load the car dataset and turn its ordinal string categories into integers.
dataset_path = os.path.join("datasets", "car.csv")
df = pd.read_csv(dataset_path, delimiter=";", header=0).convert_dtypes()

# lookup tables: ordinal category -> numeric value
cost_dict = {"low": 0, "med": 1, "high": 2, "vhigh": 3}
doors_dict = {"2": 2, "3": 3, "4": 4, "5more": 5}
persons_dict = {"2": 2, "4": 4, "more": 5}
dimension_dict = {"small": 0, "med": 1, "big": 2}

# which lookup table applies to which column
ordinal_maps = {
    "buying": cost_dict,
    "maint": cost_dict,
    "doors": doors_dict,
    "persons": persons_dict,
    "lug_boot": dimension_dict,
    "safety": cost_dict,
}
for column, mapping in ordinal_maps.items():
    # plain dict lookup: a category missing from the table raises KeyError
    df[column] = df[column].apply(lambda value: mapping[value])

# replace the class labels by the integer codes of a fitted LabelEncoder
classes_encoder = preprocessing.LabelEncoder().fit(df["Classes"])
df["Classes"] = classes_encoder.transform(df["Classes"])

In [11]:
# Rebuild the train/test frames of the first cross-validation fold only.
# NOTE(review): N_SPLITS is 5 in the cell that loops over all datasets; confirm that
# 4 matches the split used to produce the pickled models loaded afterwards.
N_SPLITS = 4
SEED = 1234
n_feature = len(df.columns)-1
# the first n_feature columns are the features, "Classes" is the label
X = df[df.columns[:n_feature]]
y = df["Classes"]
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
# take the first (train, test) pair produced by the splitter and ignore the rest
train_index, test_index = next(kf.split(X, y))
X_train, y_train = X.loc[train_index], y[train_index]
X_test, y_test = X.loc[test_index], y[test_index]
df_train = pd.concat([X_train, y_train], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)
# from here on train_index holds the row labels of the training frame
train_index = df_train.index
train_index

Int64Index([   1,    2,    3,    5,    6,    7,    8,    9,   10,   12,
            ...
            1715, 1716, 1717, 1718, 1719, 1720, 1722, 1724, 1725, 1726],
           dtype='int64', length=1296)

In [12]:
# Unpickle the stored file HLR_tl_car_0.pkl from the results directory into `hlr_dict`.
# pickle.load can execute arbitrary code: only open files produced by trusted runs.
filename = "HLR_tl_car_0.pkl"
hlr_path = os.path.join("results", filename)
with open(hlr_path, "rb") as pickle_file:
    hlr_dict = pickle.load(pickle_file)
    


In [13]:
# Presumably builds a model initialised from the unpickled `hlr_dict` and scores it on
# the training split; the helpers live in train_test, so confirm their contracts there.
# NOTE(review): the recorded output of this cell is
# "NameError: name 'classes_en' is not defined". classes_en is not referenced in this
# cell, so the failing lookup presumably happens inside train_test - TODO confirm.
# NOTE(review): `dataset_name` is not assigned by the cells directly above; it relies on
# state left behind by an earlier cell.
OPT_TYPE = "simple"
# range(0,6) and [0,1,2] are passed positionally - presumably the feature indices and
# the class labels; verify against train_test.get_model_init.
init  = train_test.get_model_init(hlr_dict, range(0,6), train_index, [0,1,2])
model = train_test.create_model(init, OPT_TYPE, df_train)
hlr_score = train_test.predict(model, X_train, y_train, dataset_name)

NameError: name 'classes_en' is not defined