# In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.datasets import load_wine

# Fetch the scikit-learn wine dataset and wrap it in pandas containers:
# `data` holds the feature columns, `target` the per-sample class labels.
print("Loading dataset...")
wine = load_wine()
data = pd.DataFrame(wine.data, columns=wine.feature_names)
target = pd.Series(data=wine.target, name='target')

# Preview the top of the feature table
print("First few rows of the dataset:")
print(data.head(5))

# Data Preprocessing
# Report how many missing entries each feature column contains
print("Checking for missing values:")
missing_per_column = data.isnull().sum()
print(missing_per_column)

# Standardize the features; the fitted scaler is kept in `scaler`
# and the transformed values in `data_scaled` for the PCA step.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

# Perform PCA
# First pass keeps every component so the full variance spectrum can be inspected.
print("Performing PCA...")
pca = PCA()
data_pca = pca.fit_transform(data_scaled)

# Determine the optimal number of principal components
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance_ratio = np.cumsum(explained_variance_ratio)
# Component counts start at 1. Plotting against matplotlib's default 0-based
# index would shift the curve one step left of what the x-axis label claims.
component_counts = np.arange(1, len(cumulative_variance_ratio) + 1)
plt.figure(figsize=(10, 6))
plt.plot(component_counts, cumulative_variance_ratio, marker='o')
plt.xticks(component_counts)
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('Explained Variance Ratio by Number of Principal Components')
plt.grid(True)
plt.show()

# Choosing the number of components that explain 95% variance
# argmax over the boolean mask yields the 0-based position of the first True,
# hence the +1 to turn it into a component count. Cast to a plain int so the
# value prints and serialises the same regardless of NumPy version.
n_components = int(np.argmax(cumulative_variance_ratio >= 0.95)) + 1
print(f"Number of components that explain 95% variance: {n_components}")

# Apply PCA with the chosen number of components
# `pca` is rebound to the reduced model so later code sees the fitted estimator
# that actually produced `data_pca_reduced`.
pca = PCA(n_components=n_components)
data_pca_reduced = pca.fit_transform(data_scaled)

# Scatter plot of the first two principal components,
# with each sample coloured by its value in `target`.
fig, ax = plt.subplots(figsize=(10, 6))
points = ax.scatter(
    data_pca_reduced[:, 0],
    data_pca_reduced[:, 1],
    c=target,
    cmap='viridis',
    edgecolor='k',
)
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title('PCA - Scatter Plot of First Two Principal Components')
fig.colorbar(points, ax=ax, label='Target')
plt.show()

# Clustering with K-Means
# Partition the PCA-reduced samples into three clusters. n_init is pinned
# explicitly: scikit-learn changed its default (10 -> 'auto') and warns when it
# is left unset on the transitional releases, so fixing it keeps the number of
# restarts - and therefore the result - consistent across library versions.
print("Performing K-Means clustering...")
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(data_pca_reduced)

# Evaluate clustering performance
# Silhouette score is computed on the same reduced representation that was clustered.
silhouette_avg = silhouette_score(data_pca_reduced, clusters)
print(f"Silhouette Score: {silhouette_avg}")

# Assemble one table holding the reduced coordinates (PC1..PCn) together with
# the assigned cluster and the actual label of every sample.
pc_column_names = [f'PC{number}' for number in range(1, n_components + 1)]
data_pca_reduced_df = pd.DataFrame(
    data_pca_reduced,
    columns=pc_column_names,
).assign(Cluster=clusters, Actual=target)

# Performance metrics table
# One row per cluster: its id, its size, and how its members are spread over
# the actual classes. The distribution is stored as a single {class: count}
# mapping per cluster so every column has exactly one entry per row; a flat
# clusters x classes list would not match the length of the other columns and
# makes the DataFrame constructor raise ValueError.
cluster_ids = np.unique(clusters)
class_labels = np.unique(target)

cluster_sizes = []
class_distributions = []
for cluster_id in cluster_ids:
    in_cluster = clusters == cluster_id
    cluster_sizes.append(int(in_cluster.sum()))
    labels_in_cluster = target[in_cluster]
    class_distributions.append(
        {int(label): int((labels_in_cluster == label).sum()) for label in class_labels}
    )

performance_metrics = pd.DataFrame({
    'Cluster': cluster_ids,
    'Cluster Size': cluster_sizes,
    'Actual Class Distribution': class_distributions,
})

print("Performance metrics for the clustering algorithm:")
print(performance_metrics)

# Persist both result tables as CSV files in the current working directory,
# leaving the row index out of the files.
data_pca_reduced_df.to_csv(path_or_buf='pca_reduced_data.csv', index=False)
performance_metrics.to_csv(path_or_buf='clustering_performance_metrics.csv', index=False)

# Interpretation
# Plain-text reading guide for the plots, score and table printed by this script.
interpretation_text = """
Interpretation:
1. The cumulative explained variance ratio plot helps to determine the number of principal components to retain. 
2. The scatter plot of the first two principal components visualizes how well the data is separated into clusters.
3. The silhouette score provides a measure of how similar objects are within the same cluster compared to other clusters.
4. Performance metrics table shows the distribution of clusters and actual class distribution, which helps in assessing clustering performance.
"""
print(interpretation_text)
