In [1]:
# Import the modules
import pandas as pd
import hvplot.pandas
from pathlib import Path
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA


In [2]:
# Load the red-wine quality CSV into a Pandas DataFrame
wine_csv = Path("Resource/winequality-red.csv")
wine_data = pd.read_csv(wine_csv)

# Preview the first rows of the DataFrame
wine_data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


## RESIDUAL SUGAR AND ALCOHOL LEVELS

In [3]:
# Narrow the frame to the two features under study plus the quality score
subset_columns = ['residual sugar', 'alcohol', 'quality']
wine_data_subset = wine_data.loc[:, subset_columns]

In [4]:
# Discard every row that has a missing value in any selected column,
# then display the cleaned subset
wine_data_subset = wine_data_subset.dropna(axis="index", how="any")
wine_data_subset

Unnamed: 0,residual sugar,alcohol,quality
0,1.9,9.4,5
1,2.6,9.8,5
2,2.3,9.8,5
3,1.9,9.8,6
4,1.9,9.4,5
...,...,...,...
1594,2.0,10.5,5
1595,2.2,11.2,6
1596,2.3,11.0,6
1597,2.0,10.2,5


## UNSUPERVISED CLUSTERING ANALYSIS 
Used to analyze whether certain clusters are actually associated with specific quality scores.

In [5]:
# Standardize every column of the subset. The subset was selected with the
# quality column included, so quality is standardized alongside residual
# sugar and alcohol and takes part in the clustering that follows.
scaler = StandardScaler()
scaler.fit(wine_data_subset)
wine_data_subset_scaled = scaler.transform(wine_data_subset)

In [6]:
# Two-cluster K-means on the standardized data; the fixed seed keeps the
# result repeatable between runs
model = KMeans(random_state=42, n_clusters=2)
model.fit(wine_data_subset_scaled)

In [7]:
# Candidate cluster counts (1 through 10) to evaluate
k = [*range(1, 11)]
# Empty list that will collect the inertia measured for each candidate
inertia = list()

In [8]:
# For each candidate k, fit a K-means model on the standardized wine subset
# and record the model's `inertia_` value for the elbow curve.
# Note: the values are appended, so re-running this cell without first
# resetting `inertia` makes the list longer than `k`.
for i in k:
    k_model = KMeans(n_clusters=i, random_state=1).fit(wine_data_subset_scaled)
    inertia.append(k_model.inertia_)

In [9]:
# Pair each k with its inertia in a dictionary
elbow_data = dict(k=k, inertia=inertia)

# Build the DataFrame that feeds the elbow plot
df_elbow = pd.DataFrame(elbow_data)

# Review the DataFrame
df_elbow.head()


Unnamed: 0,k,inertia
0,1,4797.0
1,2,3133.152417
2,3,2624.153024
3,4,1738.704211
4,5,1514.993847


In [10]:
# Draw inertia against k; the bend in the line suggests a cluster count
elbow_plot_options = dict(x="k", y="inertia", title="Elbow Curve", xticks=k)
df_elbow.hvplot.line(**elbow_plot_options)

In [11]:
# Define the two-cluster K-means model; random_state=42 keeps the run reproducible
model = KMeans(n_clusters=2, random_state=42)

# Fit on the standardized data -- the same data the elbow curve was computed
# from. Fitting on the raw columns would let whichever column has the widest
# numeric spread dominate the distance calculation.
model.fit(wine_data_subset_scaled)

# Assign each wine to a cluster
k_2 = model.predict(wine_data_subset_scaled)

# Attach the labels to a copy of the unscaled subset so that the plots keep
# the original units. The scaled array was produced from this same frame, so
# the labels line up row for row.
wine_data_subset_predictions = wine_data_subset.copy()
wine_data_subset_predictions["clusters"] = k_2

In [12]:
# Scatter of alcohol against quality, colored by K-means cluster
alcohol_scatter = wine_data_subset_predictions.hvplot.scatter(
    x="quality", y="alcohol", by="clusters"
)
# Show y-axis tick labels without decimals
alcohol_scatter.opts(yformatter="%.0f")

Alcohol levels 9-12 are associated with quality level 6.

In [13]:
# Scatter of residual sugar against quality, colored by K-means cluster
sugar_scatter = wine_data_subset_predictions.hvplot.scatter(
    x="quality", y="residual sugar", by="clusters"
)
# Show y-axis tick labels without decimals
sugar_scatter.opts(yformatter="%.0f")

Residual sugar levels 1-5 are associated with quality level 6.

## TRAIN TEST SPLIT

In [14]:
# Target: the quality score
y = wine_data_subset["quality"]

# Features: every remaining column (returns a new frame; the subset is untouched)
X = wine_data_subset.drop(columns=["quality"])

In [15]:
# Hold out 20% of the rows for testing, stratified on quality; the fixed
# seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

## PRINCIPAL COMPONENT ANALYSIS
Used to optimize the cluster analysis. 

In [16]:
# Create a PCA model that keeps three principal components.
pca1 = PCA(n_components=3)

In [17]:
# Use the PCA model with `fit_transform` to project the scaled data onto its
# principal components. The scaled data was built from three selected columns,
# so keeping three components re-expresses the data on new axes rather than
# reducing its dimensionality.
wine_data_pca = pca1.fit_transform(wine_data_subset_scaled)

# View the first five rows of the PCA output
wine_data_pca[:5]

array([[-1.26931768, -0.35493273, -0.10487811],
       [-0.9634842 ,  0.12896783,  0.13992583],
       [-0.9809598 , -0.08296736,  0.14889141],
       [-0.1328548 , -0.47435621, -0.712721  ],
       [-1.26931768, -0.35493273, -0.10487811]])

In [18]:
# Fraction of the total variance captured by each principal component
pca1.explained_variance_ratio_

array([0.4931391 , 0.33253133, 0.17432956])

In [19]:
# Wrap the PCA output in a DataFrame with one labeled column per component
pca_column_names = ["PC1", "PC2", "PC3"]
wine_data_pca_df = pd.DataFrame(wine_data_pca, columns=pca_column_names)

# Display the first rows of the PCA DataFrame
wine_data_pca_df.head()

Unnamed: 0,PC1,PC2,PC3
0,-1.269318,-0.354933,-0.104878
1,-0.963484,0.128968,0.139926
2,-0.98096,-0.082967,0.148891
3,-0.132855,-0.474356,-0.712721
4,-1.269318,-0.354933,-0.104878


In [20]:
# Reset the elbow inputs before evaluating K-means on the PCA data:
# candidate cluster counts 1 through 10 and an empty inertia list
k = [*range(1, 11)]
inertia = list()

In [21]:
# For each candidate k, fit a K-means model on the PCA DataFrame and record
# the model's `inertia_` value for the PCA elbow curve
for i in k:
    k_model = KMeans(n_clusters=i, random_state=1).fit(wine_data_pca_df)
    inertia.append(k_model.inertia_)

In [22]:
# Pair each k with the inertia measured on the PCA data
elbow_data = dict(k=k, inertia=inertia)

# Build the DataFrame that feeds the PCA elbow plot and preview it
df_elbow_pca = pd.DataFrame(elbow_data)
df_elbow_pca.head()

Unnamed: 0,k,inertia
0,1,4797.0
1,2,3133.152417
2,3,2624.153024
3,4,1738.704211
4,5,1514.993847


In [23]:
# Line plot of inertia against k for the PCA data, kept in a variable and
# displayed as the cell's output
pca_elbow_options = dict(x="k", y="inertia", title="Elbow CurvePCA", xticks=k)
PCA_elbow_curve = df_elbow_pca.hvplot.line(**pca_elbow_options)
PCA_elbow_curve

In [24]:
# Two-cluster K-means model for the PCA data, seeded for repeatable results
model_pca = KMeans(random_state=1, n_clusters=2)

In [25]:
# Fit the K-means model on the PCA DataFrame
model_pca.fit(wine_data_pca_df)

In [26]:
# Predict the cluster of each wine using the PCA DataFrame
k_2_pca = model_pca.predict(wine_data_pca_df)

# Display the resulting array of cluster labels.
k_2_pca

array([0, 0, 0, ..., 1, 0, 1])

In [30]:
# Build a new DataFrame from the PCA data with the predicted cluster labels
# appended as the `k_2_pca` column; the PCA DataFrame itself is left unchanged
wine_data_pca_predictions_df = wine_data_pca_df.assign(k_2_pca=k_2_pca)

# Display the first rows of the labeled PCA DataFrame
wine_data_pca_predictions_df.head()

Unnamed: 0,PC1,PC2,PC3,k_2_pca
0,-1.269318,-0.354933,-0.104878,0
1,-0.963484,0.128968,0.139926,0
2,-0.98096,-0.082967,0.148891,0
3,-0.132855,-0.474356,-0.712721,0
4,-1.269318,-0.354933,-0.104878,0


In [28]:
# Scatter the first two principal components, colored by predicted cluster.
# The frame holds only PC1, PC2, PC3 and k_2_pca, so no extra hover columns
# are requested (there is no "coin_id" column in this dataset).
wine_plot_pca = wine_data_pca_predictions_df.hvplot.scatter(
    x="PC1",
    y="PC2",
    by="k_2_pca",
).opts(yformatter="%.0f")

wine_plot_pca

## CONCLUSIONS
Do the physicochemical properties residual sugar and alcohol level contribute to specific quality scores of red wine?
Residual sugar levels range from 0.9 (low) to 15.5 (high).
Alcohol levels range from 8.4 (low) to 14.9 (high).
Quality levels range from 3 (low) to 8 (high).
Mid levels of residual sugar (6-7) in combination with mid levels of alcohol content (10-12) are more associated with high quality scores (6-7).