In [1]:
# Import Modules

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
import seaborn as sns
import statistics
import fastcluster
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from mpl_toolkits.mplot3d import Axes3D
from itertools import combinations
from itertools import permutations
from scipy.spatial.distance import pdist, squareform
from mpl_toolkits.mplot3d import Axes3D

In [2]:
# Read the wine table twice: once with pandas' default parsing and once with
# index_col=False, which forces pandas not to treat the first column as the index.
wine = pd.read_csv('wine.csv')
wine_noindex = pd.read_csv('wine.csv', index_col=False)
wine_columns = wine.columns

# 'Wine' is the class label; every other listed column is a numeric measurement.
categorical_features = ['Wine']
numerical_features = [
    'Alcohol', 'Malic.acid', 'Ash', 'Acl', 'Mg', 'Phenols', 'Flavanoids',
    'Nonflavanoid.phenols', 'Proanth', 'Color.int', 'Hue', 'OD', 'Proline',
]
cat_data = wine_noindex[categorical_features]

# Rescale every numeric feature to the interval [-1, 1].
scaler = MinMaxScaler(feature_range=(-1, 1))
x = pd.DataFrame(
    scaler.fit_transform(wine_noindex[numerical_features]),
    columns=numerical_features,
)

# Put the class label back next to the scaled features for display.
x_comb = pd.concat([cat_data, x], axis=1)
print(f'\n----> SCALED DATASET : \n\n{x_comb}')

# NOTE(review): "Dimensions" below counts every column of `wine`, not s_col
# (the number of scaled features) - confirm which figure is meant.
s_row, s_col = x.shape
print(f'\n*\tSamples : {s_row}, Dimensions : {len(wine.columns)}')

# Optional visual check of the scaled data:
#sns.pairplot(data=x_comb, hue='Wine', height=2)
#plt.plot()


----> SCALED DATASET : 

     Wine   Alcohol  Malic.acid       Ash       Acl        Mg   Phenols  \
0       1  0.684211   -0.616601  0.144385 -0.484536  0.239130  0.255172   
1       1  0.142105   -0.588933 -0.165775 -0.938144 -0.347826  0.151724   
2       1  0.121053   -0.359684  0.401070 -0.175258 -0.326087  0.255172   
3       1  0.757895   -0.521739  0.219251 -0.360825 -0.065217  0.979310   
4       1  0.163158   -0.268775  0.614973  0.072165  0.043478  0.255172   
..    ...       ...         ...       ...       ...       ...       ...   
173     3  0.410526    0.940711  0.165775  0.020619 -0.456522 -0.517241   
174     3  0.247368    0.252964  0.197861  0.278351 -0.304348 -0.434483   
175     3  0.178947    0.399209 -0.037433 -0.030928  0.086957 -0.579310   
176     3  0.126316   -0.268775  0.080214 -0.030928  0.086957 -0.537931   
177     3  0.631579    0.328063  0.475936  0.432990 -0.434783 -0.262069   

     Flavanoids  Nonflavanoid.phenols   Proanth  Color.int       Hue  \
0

In [3]:
import itertools

def compute_qn(df):
    """Estimate a Qn-style robust scale for every column of ``df``.

    For each column, the absolute differences between all unordered pairs of
    values are collected; their 25th percentile multiplied by 2.2219 is the
    estimate for that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data, one variable per column.

    Returns
    -------
    pandas.DataFrame
        One row per input column and a single column (labelled ``0``) holding
        the estimate. A column with fewer than two values yields NaN because
        no pair of observations exists.
    """
    def qn(column):
        values = np.asarray(column, dtype=float)
        if values.size < 2:
            # No pairs to compare, so the scale is undefined.
            return np.nan
        # Index pairs (i, j) with i < j visit every unordered pair exactly once.
        first, second = np.triu_indices(values.size, k=1)
        pairwise_diffs = np.abs(values[first] - values[second])
        return 2.2219 * np.percentile(pairwise_diffs, 25)

    qn_values = df.apply(qn)
    return qn_values.to_frame()

# Use Qn instead of MAD
# NOTE(review): compute_qn ends with .to_frame(), so x_Qn has one row per
# feature and a single column labelled 0 (see the printed output). The
# x_Qn.columns and x_Qn.shape[1] used below therefore describe that single
# column, not the feature list - confirm this is what the helpers expect.
x_Qn = compute_qn(x)
print(f'\n ---> Qn scale estimator : \n\n{x_Qn}')

# Generate the correlation matrix elements
# NOTE(review): generate_corr_elems, compute_corr_matrix_values and
# construct_corr_matrix are not defined in any visible cell; the captured
# output of this cell is "NameError: name 'generate_corr_elems' is not defined".
# They must be defined or imported before this point for the cell (and every
# later cell that reads corr_matrix) to run on a fresh kernel.
corr_elems = generate_corr_elems(x_Qn.columns)
print(f'\n ---> The elements of the corr matrix are : \n\n{corr_elems}')

# Compute the correlation matrix values
corr_values = compute_corr_matrix_values(x_Qn, corr_elems)
print(f'\n ---> The values of the corr_elems are : \n')
# Each element of corr_elems is indexed as a pair (i[0], i[1]) and printed with its value.
for i, value in zip(corr_elems, corr_values):
    print(f'*The median of the columns ({i[0]},{i[1]}) -> {value}')

# Construct the correlation matrix
corr_matrix = construct_corr_matrix(corr_values, x_Qn.shape[1])
print(f'\n ---> Correlation matrix : \n\n{corr_matrix}')



 ---> Qn scale estimator : 

                             0
Alcohol               0.456074
Malic.acid            0.307377
Ash                   0.261400
Acl                   0.343593
Mg                    0.289813
Phenols               0.459703
Flavanoids            0.421880
Nonflavanoid.phenols  0.419226
Proanth               0.350457
Color.int             0.360206
Hue                   0.397413
OD                    0.488330
Proline               0.402541


NameError: name 'generate_corr_elems' is not defined

In [None]:
# Eigenvalue and eigenvector decomposition of the correlation matrix

# Column j of `eigenvectors` belongs to entry j of `eigenvalues`.
eigenvalues, eigenvectors = np.linalg.eig(corr_matrix)

# Reorder both so the largest eigenvalue (and its vector) comes first.
idx = eigenvalues.argsort()[::-1]
eigenvalues = eigenvalues[idx]
eigenvectors = eigenvectors[:, idx]

# Print the sorted eigenvalues and eigenvectors
print("Eigenvalues:")
print(eigenvalues)
print("\nEigenvectors:")
print(eigenvectors)

# Share of the eigenvalue total carried by each component, in percent.
eig_vals_total = sum(eigenvalues)
explained_variance_exact = eigenvalues / eig_vals_total * 100

# Accumulate the unrounded shares and round only for reporting; summing the
# already-rounded values lets per-component rounding error pile up in the
# running total.
cum_explained_variance = np.round(np.cumsum(explained_variance_exact), 2)
explained_variance = np.round(explained_variance_exact, 2)
print('\nExplained variance             : {}'.format(explained_variance))

print('Cumulative explained variance  : {}'.format(cum_explained_variance))

In [None]:
# Data transformation

# Keep the first k eigenvector columns as the projection matrix.
k = 6
W = eigenvectors[:, :k]

# Project the scaled features onto those k directions and re-attach the class label.
X_proj = x.dot(W)
X_proj_fin = pd.concat([cat_data, X_proj], axis=1)
X_proj_fin.to_csv('columns_heatmap.csv', index=False)
print('\nNew projection:')
print(X_proj_fin)

# Pairwise scatter plots of the projected components, coloured by wine class.
sns.pairplot(data=X_proj_fin, hue='Wine', height=2, palette="Set1")
# Render the figure explicitly; a bare plt.plot() draws nothing and only
# leaves an empty list as the cell's output.
plt.show()

In [None]:

# End points of the PC axes: the eigenvector matrix multiplied (with
# broadcasting) by the square root of the absolute eigenvalues.
pc_endpoints = np.multiply(eigenvectors, np.sqrt(np.absolute(eigenvalues)))
  
def Original():
    """Scatter the first two projected components, coloured and styled by wine class.

    Reads the notebook-level ``X_proj_fin`` frame (columns ``0``, ``1`` and
    ``'Wine'``), draws the plot on a new 4x4-inch figure and shows it.
    Returns nothing.
    """
    wine_class = X_proj_fin['Wine']

    plt.figure(figsize=(4, 4))
    sns.scatterplot(
        x=X_proj_fin[0],
        y=X_proj_fin[1],
        hue=wine_class,
        style=wine_class,
        palette="Set1",
        data=pd.DataFrame(X_proj_fin),
    )

    # Title and axis labels, then grid and equal axis scaling.
    plt.title('COMAD - Wine dataset')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.grid(True)
    plt.axis('equal')
    plt.show()
    

# Draw the scatter plot of the first two projected components.
Original()
# NOTE(review): Transformed is not defined in any visible cell, so the call stays disabled.
#Transformed()


In [None]:
# Projected samples as a plain NumPy array (one row per sample).
X_proj_1D = X_proj.to_numpy()

# Pairwise Euclidean distances between the rows, in SciPy's condensed 1-D form.
condensed_dist_matrix = pdist(X_proj_1D, metric='euclidean')

# Expand the condensed form to the full square matrix and wrap it in a
# DataFrame for plotting and export.
dist_matrix = pd.DataFrame(data=squareform(condensed_dist_matrix))

In [None]:
# Font sizes shared by the title, axis labels and tick labels.
title_fontsize = 5
label_fontsize = 5

# Greyscale heatmap of the pairwise distance matrix on a 20x20-inch figure.
plt.figure(figsize=(20, 20))
sns.heatmap(dist_matrix, cmap='Greys', annot=False, annot_kws={"size": 10})

# Style the axes that currently hold the heatmap.
ax = plt.gca()
ax.set_title('Heatmap', fontsize=title_fontsize)
ax.set_xlabel('Projection - X axis', fontsize=label_fontsize)
ax.set_ylabel('Projection - Y axis', fontsize=label_fontsize)
ax.tick_params(axis='both', labelsize=label_fontsize)

plt.show()

In [None]:
# Translate each sample's wine class (1, 2, 3) into a display colour.
species_color_mapping = X_proj_fin['Wine'].map({1: 'red', 2: 'blue', 3: 'green'})

# Ward linkage computed once from the condensed distances and reused for both
# the rows and the columns of the clustermap.
linkage_matrix = fastcluster.linkage(condensed_dist_matrix, method='ward')

sns.clustermap(
    dist_matrix,
    cmap='viridis',
    figsize=(5, 5),
    row_linkage=linkage_matrix,
    col_linkage=linkage_matrix,
    row_colors=species_color_mapping,
    col_colors=species_color_mapping,
)

# Show the plot
plt.show()

In [None]:
# Colour for each sample, looked up from its wine class label.
species_color_mapping = X_proj_fin['Wine'].map({1: 'red', 2: 'blue', 3: 'green'})

# Distance-matrix heatmap drawn on an explicit 10x10-inch axes.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(dist_matrix, cmap='viridis', ax=ax, cbar_kws={'label': 'Distance'})
ax.set_title('Distance Matrix Heatmap with Colored Labels')

# One tick per sample, placed at the centre of its heatmap cell and labelled
# with the sample's index.
ax.set_xticks(np.arange(len(dist_matrix)) + 0.5)
ax.set_yticks(np.arange(len(dist_matrix)) + 0.5)
ax.set_xticklabels(species_color_mapping.index, rotation=90, fontsize=6)
ax.set_yticklabels(species_color_mapping.index, rotation=0, fontsize=6)

# Paint every tick label on both axes with the colour of its sample's class.
for get_labels in (ax.get_xticklabels, ax.get_yticklabels):
    for tick, color in zip(get_labels(), species_color_mapping):
        tick.set_color(color)

plt.show()

In [None]:
# Write the square distance matrix to disk without the row index.
# NOTE(review): the file name says "3_dim" while the projection cell sets
# k = 6 - confirm the name still matches the data.
dist_matrix.to_csv(path_or_buf='COMAD_3_dim.csv', index=False)