In [None]:
import numpy as np  
import matplotlib.pyplot as plt  
import pandas as pd  
from sklearn.preprocessing import Binarizer, LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.cluster import KMeans  
from sklearn.metrics import silhouette_score
import seaborn as sns

In [None]:
# Read the wine dataset; the relative path is resolved against the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='Wine2.csv')

In [None]:
## Scale every feature column with MinMaxScaler's default range; the 'class' label is left untouched

mms = MinMaxScaler()
feature_cols = data.columns[data.columns != 'class']
# Assign whole columns rather than writing through `.loc[:, mask]`: the `.loc`
# form sets values in place and keeps the existing dtypes, so integer-typed
# columns trigger pandas' incompatible-dtype deprecation when handed the
# scaled floats. Column assignment simply replaces them with float columns.
data[feature_cols] = mms.fit_transform(data[feature_cols])

In [None]:
# Partition the scaled features into 3 clusters.
# random_state pins the centroid initialisation so that Restart & Run All
# reproduces the same clusters (and therefore the same labels, plots and
# outliers below). n_init is spelled out because scikit-learn's default for it
# differs between releases.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(data.drop('class', axis=1))


In [None]:
# Cluster index the fitted model assigns to each sample (features only, 'class' excluded).
y_kmeans = kmeans.predict(data.drop(columns=['class']))

In [None]:
# Plot the samples in the Alcohol / Malic_acid plane, coloured by cluster,
# with the cluster centers overlaid.
x_col, y_col = 'Alcohol', 'Malic_acid'

# The centroid columns follow the order of the features KMeans was fitted on
# (every column except 'class'), so look the two plotted features up by name
# instead of assuming they are centroid columns 0 and 1.
feature_cols = data.drop('class', axis=1).columns
x_idx, y_idx = feature_cols.get_loc(x_col), feature_cols.get_loc(y_col)

# seaborn >= 0.12 accepts x / y only as keyword arguments.
ax = sns.scatterplot(x=data[x_col], y=data[y_col], hue=y_kmeans,
                     palette=sns.color_palette("Set1", n_colors=3))
# Centers are drawn last so the samples do not cover them.
sns.scatterplot(x=kmeans.cluster_centers_[:, x_idx],
                y=kmeans.cluster_centers_[:, y_idx],
                marker='+',
                color='black',
                s=200,
                ax=ax);

## Calculate the distance from each sample to its cluster center in order to find outliers

In [None]:
# Squared distance from every sample to every cluster center:
# X_dist[i, j] is the squared distance between sample i and center j.
X_dist = kmeans.transform(data.drop('class', axis=1)) ** 2

# Keep only the distance to each sample's own (assigned) center. Summing over
# all centers would mix in the distances to the other clusters and no longer
# measure how far a sample sits from the cluster it belongs to.
own_sqdist = X_dist[np.arange(len(X_dist)), y_kmeans]

df = pd.DataFrame({'sqdist': own_sqdist.round(2), 'label': y_kmeans})

In [None]:
max_indices

In [None]:
data.iloc[max_indices, 2]

In [None]:
data.iloc[max_indices, 3]

In [None]:
# for each cluster, find the furthest point
max_indices = []
for label in np.unique(kmeans.labels_):
    X_label_indices = np.where(y_kmeans==label)[0]
    max_label_idx = X_label_indices[np.argmax(X_dist[y_kmeans==label].sum(axis=1))]
    max_indices.append(max_label_idx)

# replot, but highlight the furthest point
sns.scatterplot(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:,1], 
                marker='+', 
                color='black', 
                s=200)

sns.scatterplot(data.iloc[:, 2], data.iloc[:, 3], hue=y_kmeans, 
                palette=sns.color_palette("Set1", n_colors=3))
# highlight the furthest point in black
sns.scatterplot(data.iloc[max_indices, 2], data.iloc[max_indices, 3], color='black');

## Drawbacks of k-means clustering
* It is difficult to choose the number of clusters in advance
* It doesn't work well when the data form one global cluster, as in our dataset
* Different initial partitions can result in different final clusters
* It does not work well with clusters (in the original data) of different sizes and densities, because it assumes that all clusters have the same variance