In [38]:
# Loading the libraries
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler


In [39]:
all_data = pd.read_csv(r'C:\Users\Gebruiker\Documents\thesiscode\experimental\preprocessed_wine.csv', index_col=0)
errors = all_data['errors']
clustering_features_preprocessed = all_data.drop(['predicted_class', 'true_class', 'errors'], axis=1)


In [40]:
to_scale = ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']
clustering_features_preprocessed[to_scale] = StandardScaler().fit_transform(clustering_features_preprocessed[to_scale])

clustering_features[:2]

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,clusters,new_clusters,true_class,predicted_class,errors
12,0.687846,-0.160373,0.141557,-0.694417,-0.834587,0.004312,0.267992,-0.318793,0.017405,0.697247,0.405843,-0.146241,1.256507,1,0.0,0.0,0,0.0
23,-0.356751,-0.343481,0.502356,-0.101678,-0.381526,-0.222556,-0.278564,-0.635733,-0.650337,-0.367414,-0.033392,1.394286,0.408775,1,0.0,0.0,1,1.0


In [83]:
# is used to calculate the F-score of the instances in the clusters.
# the lower the f score is, the worse the performance is 

# requires the all_data dataframe (with predicted and true class, but the errors col is not needed)
def F_score(results, class_number):
    true_pos = results.loc[results["true_class"] == class_number][results["predicted_class"] == class_number]
    true_neg = results.loc[results["true_class"] != class_number][results["predicted_class"] != class_number]
    false_pos = results.loc[results["true_class"] != class_number][results["predicted_class"] == class_number]
    false_neg = results.loc[results["true_class"] == class_number][results["predicted_class"] != class_number]
    
    try:
        precision =  len(true_pos)/(len(true_pos) + len(false_pos))
    except ZeroDivisionError:
        return 0
    try:
        recall = len(true_pos)/(len(true_pos) + len(false_neg))
    except ZeroDivisionError:
        return 0

    # TODO solve the "float division by zero" error
    try:
        f_score = 2 * ((precision * recall)/(precision + recall))
    except ZeroDivisionError:
        return 0

    return f_score

# Calculating the macro average F-score --> will eventually be replaced with weighted F-score
def mean_f_score(results):
    classes = results['true_class'].unique()
    class_list = []
    for i in classes:
        class_i = F_score(results, i)
        class_list.append(class_i)
    mean_f_score = (sum(class_list))/len(classes)
    print('this is the mean F-score of all classes within this cluster in the list: ', mean_f_score)

    return(mean_f_score)

# Calculating the bias for each cluster
def calculate_bias(clustered_data, cluster_number):
    cluster_x = clustered_data.loc[clustered_data["clusters"] == cluster_number]
    remaining_clusters = clustered_data.loc[clustered_data["clusters"] != cluster_number]
    
    # Bias definition: 
    return mean_f_score(remaining_clusters) - mean_f_score(cluster_x)


In [42]:
clus_model_kwargs = {
    "init": "k-means++",
    "n_init": 10,
    "max_iter": 300,
    "random_state": 2,
}


In [43]:
# calculates the variances of the errors column
def calculate_variance(data):
    # Receives the data within one cluster to calculate the variance

    # Obtain errors column
    errors_col = data['errors']
    # Number of observations
    n = len(data)
    # Mean of the data
    mean = sum(errors_col)/n
    # Squared deviation 
    deviations = [(x - mean) ** 2 for x in errors_col]
    # Variance
    variance = sum(deviations) / n
    return variance

def get_highest_var_cluster(data):
    clusters = data['clusters'].unique()
    highest_variance = 0
    best_cluster = None
    cluster_number = None
    for i in clusters:
        print('this is i:', i)
        cluster_i = data[data['clusters'] == i]
        variance_cluster = calculate_variance(cluster_i)
        print('variance cluster:', variance_cluster)
        print('highest variance:', highest_variance)

        if variance_cluster > highest_variance:
            highest_variance = variance_cluster
            best_cluster = cluster_i
            cluster_number = i
            print('this is the cluster number:', cluster_number)

    return cluster_number


In [138]:
# Initialisation of clustering_features
clustering_features_preprocessed['clusters'] = 1
clustering_features = clustering_features_preprocessed
all_data.head(5) #true class doesnt seem to have the right values? I see no 2 or 3

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,predicted_class,true_class,errors
12,13.75,1.73,2.41,16.0,89.0,2.6,2.76,0.29,1.81,5.6,1.15,2.9,1320.0,0,0.0,0.0
23,12.85,1.6,2.52,17.8,95.0,2.48,2.37,0.26,1.46,3.93,1.09,3.63,1015.0,1,0.0,1.0
25,13.05,2.05,3.22,25.0,124.0,2.63,2.68,0.47,1.92,3.58,1.13,3.2,830.0,0,0.0,0.0
35,13.48,1.81,2.41,20.5,100.0,2.7,2.98,0.26,1.86,5.1,1.04,3.47,920.0,1,0.0,1.0
13,14.75,1.73,2.39,11.4,91.0,3.1,3.69,0.43,2.81,5.4,1.25,2.73,1150.0,2,0.0,1.0
65,12.37,1.21,2.56,18.1,98.0,2.42,2.65,0.37,2.08,4.6,1.19,2.3,678.0,0,1.0,1.0
48,14.1,2.02,2.4,18.8,103.0,2.75,2.92,0.32,2.38,6.2,1.07,2.75,1060.0,2,0.0,1.0
78,12.33,0.99,1.95,14.8,136.0,1.9,1.85,0.35,2.76,3.4,1.06,2.31,750.0,1,1.0,0.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0,0.0,0.0
6,14.39,1.87,2.45,14.6,96.0,2.5,2.52,0.3,1.98,5.25,1.02,3.58,1290.0,1,0.0,1.0


In [135]:
MAX_ITER = 10 # --> in each iteration we split on the cluster with the highest variance
x = 1
bias_prev_iteration_cluster = 0

for i in range(1, MAX_ITER):
    print('Value for x:', x)
    # select the instances with cluster X as dataset
    high_bias_cluster = clustering_features.loc[clustering_features['clusters'] == x]
    high_bias_cluster.drop('clusters', axis=1)

    # Apply Kmeans on this dataset
    kmeans_algo = KMeans(n_clusters=2, **clus_model_kwargs).fit(high_bias_cluster) 
    high_bias_cluster['new_clusters'] = pd.DataFrame(kmeans_algo.predict(high_bias_cluster),index=high_bias_cluster.index) 

    # print("where is the NaN")
    # print(high_bias_cluster)
    # print('amount of rows with Nan:', high_bias_cluster.isna().sum())
    
    new_clustering_features = clustering_features

    new_clustering_features['clusters'] = high_bias_cluster['new_clusters'].combine_first(new_clustering_features['clusters'])
    print('new clustering features before merge:', new_clustering_features.head(5))
    # new_clustering_features = new_clustering_features.join(all_data[['true_class', 'predicted_class']], how='left')
    # new_clustering_features = new_clustering_features.join(all_data['true_class'], how='left')
    #todo: solve merge/join error here
    new_clustering_features = new_clustering_features.merge(all_data[['true_class', 'predicted_class']], on=all_data[['true_class', 'predicted_class']], how='left')
    print('new clustering features after merge:', new_clustering_features.head(5))
    # calculate bias for each of the two clusters
    negative_bias_0 = calculate_bias(new_clustering_features, 0)
    negative_bias_1 = calculate_bias(new_clustering_features, 1)

    new_clustering_features.drop(['true_class', 'predicted_class'], axis=1)

    if max(negative_bias_0, negative_bias_1) >= bias_prev_iteration_cluster:
        # select cluster with highest negative bias and merge new cluster assignments with the df
        bias_prev_iteration_cluster = max(negative_bias_0, negative_bias_1)
        clustering_features = new_clustering_features
    else:
        break
        print('no clusters with a higher bias are left')
    
    clustering_features = clustering_features.join(errors, how='left')
    x = get_highest_var_cluster(clustering_features)

    clustering_features.drop('errors', axis=1)


Value for x: 1
new clustering features before merge:      alcohol  malic_acid       ash  alcalinity_of_ash  magnesium  \
12  0.687846   -0.160373  0.141557          -0.694417  -0.834587   
23 -0.356751   -0.343481  0.502356          -0.101678  -0.381526   
25 -0.124619    0.290352  2.798346           2.269277   1.808272   
35  0.374467   -0.047692  0.141557           0.787430  -0.003974   
13  1.848509   -0.160373  0.075958          -2.209194  -0.683567   

    total_phenols  flavanoids  nonflavanoid_phenols  proanthocyanins  \
12       0.004312    0.267992             -0.318793         0.017405   
23      -0.222556   -0.278564             -0.635733        -0.650337   
25       0.061029    0.155878              1.582845         0.227267   
35       0.193369    0.576306             -0.635733         0.112797   
13       0.949596    1.571320              1.160258         1.925240   

    color_intensity       hue  od280/od315_of_diluted_wines   proline  \
12         0.697247  0.405843   

KeyError:      true_class  predicted_class
12          0.0                0
23          0.0                1
25          0.0                0
35          0.0                1
13          0.0                2
65          1.0                0
48          0.0                2
78          1.0                1
3           0.0                0
6           0.0                1
42          0.0                1
2           0.0                2
29          0.0                0
45          0.0                1
106         1.0                1
94          1.0                2
5           0.0                0
53          0.0                1
93          1.0                1
41          0.0                2
54          0.0                1
24          0.0                0
64          1.0                2
28          0.0                0
89          1.0                0
92          1.0                1
79          1.0                0
14          0.0                2
44          0.0                2
66          1.0                2
85          1.0                2
99          1.0                1
57          0.0                0
71          1.0                2
11          0.0                0
36          0.0                2
62          1.0                2
0           0.0                0
27          0.0                0
98          1.0                2
20          0.0                1
77          1.0                1
30          0.0                2
17          0.0                0
59          1.0                2
21          0.0                0
55          0.0                2
16          0.0                0
91          1.0                0
100         1.0                1
74          1.0                0
87          1.0                2
90          1.0                0
84          1.0                2
18          0.0                0
97          1.0                1
61          1.0                2

In [None]:
        # new_clustering_features = clustering_features.join(high_bias_cluster['new_clusters'], how='left')
    
    # new_clustering_features['clusters'] = new_clustering_features['new_clusters'].combine_first(new_clustering_features['clusters'])
    # new_clustering_features.drop('new_clusters', axis=1)