In [130]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans

#Tutorial can be found here: https://www.youtube.com/watch?v=CIfZi0niNE0

In [131]:
# Load the cleaned coffee-bean data; the relative path is resolved against the
# notebook's current working directory.
dataset = pd.read_csv('CoffeeBeansCleaned.csv')

In [132]:
# X is a second name for the same DataFrame object as `dataset` (no copy is made).
X = dataset

In [133]:
# One-hot encode the columns at positions -14, -13 and -12; all remaining
# columns are passed through unchanged.
# NOTE(review): the table displayed further down suggests these positions are
# Variety, Shape and Sieve in a 14-column frame - confirm against the CSV.
ct = ColumnTransformer(
    transformers=[
        ("encoder", OneHotEncoder(), [-14, -13, -12]),
    ],
    remainder="passthrough",
)

In [134]:
# Fit the column transformer on X and store the transformed features as a NumPy array.
X_train = np.array(ct.fit_transform(X))

In [135]:
#Clustering
# Fit k-means with 3 clusters and a fixed seed.
# n_init is pinned to 10 (the historical default): newer scikit-learn releases
# default to n_init="auto", which runs a single k-means++ initialisation and can
# produce different clusters from the ones recorded in this notebook.
km = KMeans(n_clusters=3, init="k-means++", n_init=10, random_state=42)
# fit() returns the fitted estimator itself, so y_means is the same object as km
# (it is NOT an array of cluster labels; those live in km.labels_).
y_means = km.fit(X_train)

In [136]:
# Displays the estimator repr: y_means holds the fitted KMeans object returned by fit().
y_means

KMeans(n_clusters=3, random_state=42)

In [137]:
#Centers
# One row per cluster; the columns follow the feature order of X_train.
y_means.cluster_centers_

array([[3.80952381e-01, 3.57142857e-01, 2.61904762e-01, 4.04761905e-01,
        5.71428571e-01, 2.38095238e-02, 3.33333333e-01, 2.38095238e-01,
        1.42857143e-01, 1.19047619e-01, 9.52380952e-02, 7.14285714e-02,
        7.16071429e+00, 7.05357143e+00, 6.97023810e+00, 7.07142857e+00,
        6.97023810e+00, 1.00000000e+01, 7.00595238e+00, 1.00000000e+01,
        1.00000000e+01, 7.10714286e+00, 7.93392857e+01],
       [4.48275862e-01, 3.79310345e-01, 1.72413793e-01, 6.89655172e-01,
        2.93103448e-01, 1.72413793e-02, 1.89655172e-01, 1.37931034e-01,
        3.62068966e-01, 1.72413793e-01, 8.62068966e-02, 5.17241379e-02,
        6.81896552e+00, 6.56034483e+00, 6.42672414e+00, 6.38362069e+00,
        6.35344828e+00, 1.00000000e+01, 6.41810345e+00, 1.00000000e+01,
        1.00000000e+01, 6.43534483e+00, 7.53706897e+01],
       [2.68292683e-01, 3.04878049e-01, 4.26829268e-01, 7.07317073e-01,
        2.07317073e-01, 8.53658537e-02, 1.82926829e-01, 7.31707317e-02,
        1.21951220e-01

In [138]:
#Show which cluster each record was assigned to
# (one label per row of X_train, in the same order)
km.labels_

array([2, 2, 2, 2, 1, 1, 2, 1, 2, 2, 2, 1, 2, 0, 2, 1, 1, 0, 0, 0, 2, 0,
       2, 2, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 0, 1, 1,
       0, 1, 1, 1, 2, 0, 0, 2, 1, 2, 2, 1, 2, 0, 0, 2, 0, 0, 2, 1, 0, 0,
       2, 1, 1, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1,
       1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 0, 2,
       0, 1, 2, 0, 0, 1, 0, 2, 1, 2, 2, 2, 1, 2, 0, 0, 2, 1, 2, 2, 2, 2,
       1, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 0, 1, 2, 2, 1, 2, 2, 2, 2, 0, 1,
       2, 2, 2, 2, 2, 2])

In [139]:
#Count how many records are in each of the clusters
labels = km.labels_

# np.bincount counts the occurrences of each non-negative integer, so position i
# of the result is the number of records assigned to cluster i.
clusterCount = np.bincount(labels)
clusterCount

array([42, 58, 82], dtype=int64)

In [145]:
#List the records in dataframe form

# Attach the labels to the frame that was actually clustered instead of reading
# the CSV a second time: this avoids redundant I/O and guarantees the labels line
# up with the rows k-means saw. assign() returns a new DataFrame, so `dataset`
# (and its alias X) are left without a 'cluster' column.
cluster = km.labels_
df_with_cluster_column = dataset.assign(cluster=cluster)

df_with_cluster_column

Unnamed: 0,Variety,Shape,Sieve,Aroma,Flavor,Aftertaste,Acidity,Body,Uniformity,Balance,clean cup,Sweetness,Taster score,Final score,cluster
0,CO,PE,13,6.50,6.00,6.00,6.00,6.00,10,6.00,10,10,6.00,72.50,2
1,CO,PE,13,6.00,6.00,6.00,6.00,6.00,10,6.00,10,10,6.00,72.00,2
2,CO,PE,13,6.75,6.00,6.00,6.00,6.00,10,6.00,10,10,6.00,72.75,2
3,CO,PE,13,6.75,6.50,6.50,6.00,6.00,10,6.00,10,10,6.00,73.75,2
4,CO,MI,13,6.75,6.25,6.25,6.25,6.50,10,6.50,10,10,6.50,75.00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,OA,FL,18,6.50,6.00,6.00,6.00,6.25,10,6.00,10,10,6.00,72.75,2
178,OA,FL,18,6.00,6.00,6.00,6.00,6.00,10,6.00,10,10,6.25,72.25,2
179,OA,FL,18,6.00,6.00,6.00,6.00,6.00,10,6.00,10,10,6.50,72.50,2
180,OA,FL,18,6.25,6.25,6.25,6.00,6.25,10,6.00,10,10,6.25,73.25,2


In [177]:
#Return the records that belong to a given cluster
def display(fullList, clusternum, cluster_col=14):
    """Return the rows of ``fullList`` whose cluster label equals ``clusternum``.

    Parameters
    ----------
    fullList : iterable of sequences
        Records to filter, one sequence per record.
    clusternum : int
        Cluster label to select.
    cluster_col : int, default 14
        Position of the cluster label inside each record.

    Returns
    -------
    pandas.DataFrame
        A new frame built from the matching records. Because it is built from
        plain sequences, its column labels are integer positions.

    Raises
    ------
    IndexError
        If a non-empty record has no element at ``cluster_col``.

    Notes
    -----
    Earlier revisions ignored ``fullList`` and iterated a notebook-level
    variable instead; the argument is now the only data source.
    NOTE(review): the name shadows IPython's built-in ``display`` inside the
    notebook; it is kept so existing calls continue to work.
    """
    # Empty records are skipped; each matching record is included exactly once.
    matching_rows = [
        row for row in fullList
        if len(row) > 0 and row[cluster_col] == clusternum
    ]
    return pd.DataFrame(matching_rows)

In [178]:
# Convert the labelled frame into a list of row lists, the format display() iterates over.
list_with_clusters = df_with_cluster_column.values.tolist()

#List elements of cluster 0
# The returned frame is rebuilt from plain lists, so its columns are labelled 0..14
# rather than with the original column names.
df_n = display(list_with_clusters, 0)
df_n

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,CO,MI,14,7.0,7.0,7.25,7.25,7.0,10,7.0,10,10,7.25,79.75,0
1,CO,MI,14,7.75,7.25,7.25,7.25,7.0,10,7.0,10,10,7.25,80.75,0
2,CO,MI,14,7.5,7.0,7.0,7.0,6.5,10,7.25,10,10,7.25,79.5,0
3,CO,MI,15,7.0,7.0,7.0,6.75,7.0,10,6.75,10,10,6.75,78.25,0
4,CO,MI,15,7.25,7.25,7.0,7.0,6.75,10,7.25,10,10,7.0,79.5,0
5,CO,FL,13,7.0,7.0,7.0,7.25,7.0,10,7.0,10,10,7.0,79.25,0
6,CO,FL,13,7.5,7.25,7.5,7.25,7.25,10,7.25,10,10,7.25,81.25,0
7,CO,FL,13,7.0,7.0,7.0,7.0,6.5,10,7.0,10,10,7.0,78.5,0
8,CO,FL,16,7.0,7.25,7.25,7.25,7.25,10,7.25,10,10,7.5,80.75,0
9,CO,FL,16,7.25,7.25,6.75,7.0,7.0,10,6.5,10,10,6.75,78.5,0


In [179]:
#List elements of cluster 1
# Same filter as for cluster 0, selecting the records labelled 1.
df_nplus1 = display(list_with_clusters, 1)
df_nplus1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,CO,MI,13,6.75,6.25,6.25,6.25,6.5,10,6.5,10,10,6.5,75.0,1
1,CO,MI,13,6.5,6.5,6.25,6.25,6.5,10,6.75,10,10,6.75,75.5,1
2,CO,MI,13,7.5,6.5,6.75,6.5,6.5,10,6.5,10,10,6.5,76.75,1
3,CO,MI,14,6.5,6.25,6.0,6.0,6.5,10,6.5,10,10,6.5,74.25,1
4,CO,MI,14,7.0,6.25,6.0,6.5,6.25,10,6.25,10,10,6.25,74.5,1
5,CO,MI,14,7.0,6.5,6.5,6.5,6.25,10,6.5,10,10,6.25,75.5,1
6,CO,FL,13,6.75,6.75,6.5,6.5,6.5,10,6.75,10,10,6.75,76.5,1
7,CO,FL,13,7.25,6.5,6.5,7.0,6.75,10,6.5,10,10,6.5,77.0,1
8,CO,FL,13,7.25,6.5,6.5,7.0,6.75,10,6.5,10,10,6.5,77.0,1
9,CO,FL,14,6.75,6.75,6.75,6.5,6.5,10,6.5,10,10,6.5,76.25,1


In [180]:
#List elements of cluster 2
# Same filter as for cluster 0, selecting the records labelled 2.
df_nplus2 = display(list_with_clusters, 2)
df_nplus2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,CO,PE,13,6.50,6.00,6.00,6.00,6.00,10,6.00,10,10,6.00,72.50,2
1,CO,PE,13,6.00,6.00,6.00,6.00,6.00,10,6.00,10,10,6.00,72.00,2
2,CO,PE,13,6.75,6.00,6.00,6.00,6.00,10,6.00,10,10,6.00,72.75,2
3,CO,PE,13,6.75,6.50,6.50,6.00,6.00,10,6.00,10,10,6.00,73.75,2
4,CO,MI,13,6.00,7.00,6.25,6.00,6.00,10,6.25,10,10,6.25,73.75,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,OA,FL,18,6.50,6.00,6.00,6.00,6.25,10,6.00,10,10,6.00,72.75,2
78,OA,FL,18,6.00,6.00,6.00,6.00,6.00,10,6.00,10,10,6.25,72.25,2
79,OA,FL,18,6.00,6.00,6.00,6.00,6.00,10,6.00,10,10,6.50,72.50,2
80,OA,FL,18,6.25,6.25,6.25,6.00,6.25,10,6.00,10,10,6.25,73.25,2
