In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, pairwise_distances

In [None]:
# Load the wine dataset from vino.txt; its columns are separated by whitespace.
# sep=r'\s+' is the documented replacement for the deprecated delim_whitespace=True.
vino = pd.read_csv('vino.txt', sep=r'\s+')
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would only append a stray "None" line to the output.
vino.info()
print(vino)


# **1599 wines and 6 variables**
1. **Fixed Acidity**: This is the portion consisting of the acidic substances present in a wine that are not volatilized but remain in the wine throughout its life. Being one of the fundamental parameters of wine, it determines the harvest period.
2. **Total Sulfur Dioxide**: This is the amount of free sulfur dioxide (SO2, sulfur dioxide) plus that bound to other chemical agents such as aldehydes, pigments, or sugars. It is the main additive used in oenology to prevent wine spoilage.
3. **pH**: This represents an index of real acidity. It is a parameter that undergoes variations during the various stages of winemaking and storage.
4. **Sulfites**: These are added to promote preservation. They possess antioxidant and antimicrobial properties that protect the wine from bacterial damage.
5. **Alcohol Content**: Indicates the percentage of alcohol by volume.
6. **Satisfaction Rating**: 1 = dissatisfied, 2 = satisfied

In [None]:
# Store the Grad column with pandas' categorical dtype, then check the resulting dtypes
grad_categorical = vino['Grad'].astype('category')
vino['Grad'] = grad_categorical
print(vino.dtypes)

In [None]:
# Exploratory data analysis: descriptive statistics of the dataset
summary_stats = vino.describe()
print('Summary statistics:\n')
print(summary_stats)

In [None]:
# Standardize the quantitative variables, i.e. the first 5 columns (every column except Grad)
vino2 = vino.iloc[:, :5].values
scaler = StandardScaler()
# Learn each column's mean and standard deviation, then rescale the same data with them
scaler.fit(vino2)
vino_std = scaler.transform(vino2)

print(vino_std)

In [None]:
# Build the matrix of pairwise Euclidean distances between the standardized observations
matrix = pairwise_distances(vino_std, metric='euclidean')
print(matrix)


# The matrix of distances d_ij has one row and one column per observation (1599 x 1599),
# so we only look at its top-left 5 x 5 corner
matrix5 = matrix[0:5, 0:5]
print('Lets consider first 5 raws and columns:\n')
print(matrix5)

In [None]:
# Start the iterative k-means process with k = 2 clusters
kmeans2 = KMeans(
    n_clusters=2,
    # n_init=20: run the whole clustering 20 times, each from a different random
    # initialization, and keep the best run (the one with the lowest inertia,
    # i.e. the lowest within-cluster sum of squares).
    n_init=20,
    # random_state=42: fixing the seed gives the same result on every run,
    # which matters for consistent analysis and debugging.
    random_state=42,
)

kmeans2.fit(vino_std)

In [None]:
# Inspect the output of the k = 2 clustering: cluster sizes and cluster centers
cluster_sizes = pd.Series(kmeans2.labels_).value_counts()
print('Cluster Size:\n')
print(cluster_sizes)
print('\nCluster Centers:\n')
print(kmeans2.cluster_centers_)

# Store each observation's cluster label (0 or 1) in a new column of the dataset
vino['Cluster'] = kmeans2.labels_




In [None]:
# Descriptive statistics of each cluster, printed one after the other for comparison
print('Summary of statistics by Cluster:\n')
for cluster_label in sorted(vino['Cluster'].unique()):
    # Boolean mask selecting the rows that belong to the current cluster
    in_cluster = vino['Cluster'] == cluster_label
    print(f'Cluster{cluster_label}:')
    print(vino.loc[in_cluster].describe())


      


In [None]:
# Validate the clustering quality of the k = 2 solution
# with the mean silhouette score (Euclidean distance)
labels_k2 = kmeans2.labels_
silhouette_avg = silhouette_score(vino_std, labels_k2, metric='euclidean')
print('silhouette score for k=2:', silhouette_avg)

# The score is not good, so next we look at the WSS (within-cluster sum of squares)


In [50]:
#Let's define the WSS function
def Wss(k, data=None, n_init=20, random_state=42):
    """Return the within-cluster sum of squares (WSS) of a k-means fit.

    Parameters
    ----------
    k : int
        Number of clusters passed to ``KMeans`` as ``n_clusters``.
    data : array-like, optional
        Observations to cluster. When omitted, the notebook-level
        ``vino_std`` array is used, so ``Wss(k)`` behaves as before.
    n_init : int, default 20
        Number of k-means runs with different initializations.
    random_state : int, default 42
        Seed passed to ``KMeans`` so repeated calls give the same result.

    Returns
    -------
    The ``inertia_`` attribute of the fitted model.
    """
    # Look up the global only at call time, so the function can be defined
    # before the standardization cell has been executed.
    points = vino_std if data is None else data
    kmeans_model = KMeans(n_clusters=k, n_init=n_init, random_state=random_state)
    kmeans_model.fit(points)
    return kmeans_model.inertia_

In [None]:
# Elbow method: plot the WSS for k = 1..10 to help choose the best k
ks = range(1, 11)
wss_values = list(map(Wss, ks))

fig, ax = plt.subplots()
ax.plot(ks, wss_values, marker='o')
ax.set_xlabel('Group number (k)')
ax.set_ylabel('within-cluster sum of squares (Wss)')
ax.set_title('Elbow Methods')
plt.show()

# Reading the curve:
# - the WSS starts very high (around 8000) at k=1
# - it drops sharply from k=1 to k=2 (down to about 6100)
# - it keeps decreasing afterwards, but at a decreasing rate
# - the elbow appears to be around k=4 or k=5, where the curve begins to flatten

In [None]:
# Compare the candidate values k = 4, 5 and 6 by their mean silhouette score
k_values = [4, 5, 6]

for k in k_values:
    # Same settings as the earlier runs: 20 initializations and a fixed seed
    kmeans = KMeans(n_clusters=k, n_init=20, random_state=42).fit(vino_std)
    labels_k = kmeans.labels_
    silhouette_avg = silhouette_score(vino_std, labels_k, metric='euclidean')
    print(f'Silohouette score for k={k}: {silhouette_avg}')


# The best silhouette score is obtained for k=5 (about 0.242)

