In [None]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

### Loading the data and showing the data

In [None]:
from ucimlrepo import fetch_ucirepo

# Fetch dataset id 186 through ucimlrepo and keep the returned object
wine_quality = fetch_ucirepo(id=186)

# Split the fetched data into the feature table and the target table
# (per the original note, both are pandas dataframes)
Df = wine_quality.data.features
y = wine_quality.data.targets

# Dataset metadata is also available; uncomment to inspect it
# print(wine_quality.metadata)

# Print the per-variable information that ships with the dataset
print(wine_quality.variables)

### Showing the description

In [None]:
# Display the feature table
Df

In [None]:
# Summary statistics for the feature table
Df.describe()

### Dropping rows that contain any NaN value

In [None]:

# Drop every row that contains at least one missing value.
# dropna() returns a new DataFrame; called with inplace=True it returns None,
# so assigning that result back would replace Df with None.
# NOTE: y is not filtered here, so it no longer lines up row-for-row with Df
# if any rows are actually dropped.
if Df.isnull().values.any():
    Df = Df.dropna()

Df

## Transforming the data

In [None]:
# Create an instance of StandardScaler
scaler = StandardScaler()

# Fit the scaler on the feature table and transform it in the same call
scaled_data = scaler.fit_transform(Df)

# Wrap the scaled values in a new dataframe, reusing the original column names
transformed_df = pd.DataFrame(scaled_data, columns=Df.columns)

# Display the transformed dataframe
transformed_df

### Function to get the best number of clusters using the Elbow Method

In [None]:

def get_k_means_score(data, max_k, random_state=42):
    """Plot the elbow curve (KMeans inertia versus k) for k = 1..max_k.

    One KMeans model is fitted per candidate k and its ``inertia_`` is
    recorded; the resulting curve is drawn and shown. Nothing is returned.

    Parameters
    ----------
    data
        Observations to cluster; passed unchanged to ``KMeans.fit``.
    max_k : int
        Largest number of clusters to try; must be at least 1.
    random_state : int or None, default=42
        Seed handed to every KMeans fit so the curve is identical from one
        run to the next. Pass None to leave the fits unseeded.

    Raises
    ------
    ValueError
        If ``max_k`` is smaller than 1 (there would be nothing to plot).
    """
    if max_k < 1:
        raise ValueError(f"max_k must be at least 1, got {max_k}")

    cluster_counts = list(range(1, max_k + 1))
    inertias = []

    for k in cluster_counts:
        # Fit a fresh model for this k and keep its inertia
        model = KMeans(n_clusters=k, random_state=random_state)
        model.fit(data)
        inertias.append(model.inertia_)

    # plot the results
    plt.plot(cluster_counts, inertias, marker='o', color='r')
    plt.xlabel('K')
    plt.ylabel('Score')
    plt.title('Elbow Method')
    plt.show()

In [None]:
# Elbow plot for k = 1..10 using only the scaled fixed_acidity and citric_acid columns
get_k_means_score(transformed_df[['fixed_acidity', 'citric_acid']], 10)

As we can see, the best number of clusters is 4, so we will use that number to create the KMeans model. The curve is almost flat after 5 clusters, so we will not get much more information if we increase the number of clusters.

In [None]:
# Cluster the two scaled features into 4 groups, the value chosen from the elbow plot.
# A fixed random_state makes the centroid initialisation, and therefore the
# label numbering and the plot colours, repeatable across kernel restarts.
kmeans = KMeans(n_clusters=4, random_state=42)

# Fit the k-means object to the data
kmeans.fit(transformed_df[['fixed_acidity', 'citric_acid']])

# Store each row's cluster index next to the scaled features
transformed_df['Labels'] = kmeans.labels_

# show the dataframe
transformed_df

## Visualizing the clusters in 2D

In [None]:
# Scatter the two scaled features used for clustering, coloured by cluster label
plt.scatter(transformed_df['fixed_acidity'], transformed_df['citric_acid'], c=transformed_df['Labels'])

# Overlay the cluster centres as red crosses.
# NOTE: assumes `kmeans` is still the model fitted on fixed_acidity/citric_acid,
# so centre columns 0 and 1 correspond to the x and y axes of this plot.
centroids = kmeans.cluster_centers_

plt.scatter(centroids[:, 0], centroids[:, 1], c='red', s=50, marker='x')

plt.title('Clusters')
plt.xlabel('fixed_acidity')
plt.ylabel('citric_acid')
plt.show()



### Reducing the data using PCA and analyzing the number of components to use in the PCA

In [None]:
# Fit a PCA with no component limit on the scaled data
pca = PCA()
pca.fit(scaled_data)

# Running total of the explained variance ratio, computed once and reused below
cumulative_variance = pca.explained_variance_ratio_.cumsum()

# Build the x-axis from the number of fitted components instead of a
# hard-coded range(1, 12), so the plot still works if the feature count changes
component_counts = range(1, len(cumulative_variance) + 1)

plt.plot(component_counts, cumulative_variance, marker='o', linestyle='--')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance vs Number of Components')
plt.show()

print(cumulative_variance)

In [None]:
# Fit a 7-component PCA on the scaled data, then project the same data onto it
pca = PCA(n_components=7).fit(scaled_data)
pca_data = pca.transform(scaled_data)

# Wrap the projection in a dataframe with columns 'PC1' through 'PC7'
pca_df = pd.DataFrame(
    pca_data,
    columns=[f'PC{i}' for i in range(1, 8)],
)

# Display the reduced dataframe
pca_df

### Finding the best number of clusters for the reduced data

In [None]:
# Elbow plot for k = 1..10 on the PC1 and PC3 columns of the PCA-reduced data
get_k_means_score(pca_df[['PC1', 'PC3']], 10)

In [None]:
# Cluster the PC1/PC3 columns of the reduced data into 4 groups.
# A fixed random_state keeps the label numbering and plot colours repeatable across runs.
# NOTE: this rebinds `kmeans`, replacing the model fitted earlier on the scaled features.
kmeans = KMeans(n_clusters=4, random_state=42)

# Fit the k-means object to the data
kmeans.fit(pca_df[['PC1', 'PC3']])

# Store each row's cluster index next to the principal components
pca_df['Labels'] = kmeans.labels_


### Visualizing the clusters in 2D of the reduced data

In [None]:
# Scatter the first and third principal components, coloured by cluster label
plt.scatter(pca_df['PC1'], pca_df['PC3'], c=pca_df['Labels'])
plt.xlabel('PC1')
plt.ylabel('PC3')

# Overlay the cluster centres of the current `kmeans` model as red crosses
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], c='red', s=50, marker='x')
plt.title('Clusters')
plt.show()