In [None]:
# Import Required Packages
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler


# Load the dataset and take a first look at it.
# The inspection results are printed explicitly: a bare expression such as
# `penguins_df.head()` is only echoed when it is the last line of a notebook
# cell, and is silently discarded anywhere else.
penguins_df = pd.read_csv("penguins.csv")
print(penguins_df.head())
print(penguins_df.shape)

# Count the missing values in each column
print(penguins_df.isna().sum())

# Drop, in place, every row that contains a missing value
penguins_df.dropna(inplace=True)
print(penguins_df.shape)

# Box plot of the data frame to spot outliers visually
penguins_df.boxplot()
plt.show()

# Remove outliers with a boolean mask: keep only the rows whose flipper
# length lies strictly between the 5th and 95th percentiles of that column
flipper_length = penguins_df["flipper_length_mm"]
lower_threshold = flipper_length.quantile(0.05)
upper_threshold = flipper_length.quantile(0.95)
inside_band = flipper_length.gt(lower_threshold) & flipper_length.lt(upper_threshold)
penguins_clean = penguins_df.loc[inside_band].copy()
print(penguins_clean)

# Preprocess the cleaned data: one-hot encode the categorical (object-dtype)
# columns, dropping the first level of each encoded column
cat_cols = penguins_clean.select_dtypes(include="object").columns
penguins_clean = pd.get_dummies(
    penguins_clean,
    columns=cat_cols,
    drop_first=True,
)

# Standardize every column; StandardScaler's defaults already copy the
# input, center it to zero mean and scale it to unit variance
scaler = StandardScaler()
penguins_scaled = scaler.fit_transform(penguins_clean)

# Fit a PCA with all components first, to inspect the loadings and how much
# of the variance each component explains
model = PCA().fit(penguins_scaled)
print(model.components_)
print(model.explained_variance_ratio_)

# Keep the components that each explain more than 10% of the variance
# (never fewer than one), then project the scaled data onto them
n_components = max(1, (model.explained_variance_ratio_ > 0.10).sum())
model = PCA(n_components=n_components)
penguins_PCA = model.fit_transform(penguins_scaled)

# Elbow method: fit K-means for k = 1..9 on the PCA-reduced data and record
# the inertia of each fit, to pick the number of clusters from the plot
k_values = range(1, 10)
inertia_vals = []
for k in k_values:
    km = KMeans(n_clusters=k, n_init=10, random_state=42).fit(penguins_PCA)
    inertia_vals.append(km.inertia_)

plt.plot(k_values, inertia_vals, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('list of Inertia')
plt.show()

# Fit K-means with the chosen number of clusters and label every sample
n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
labels = kmeans.fit_predict(penguins_PCA)

# Visualize the clusters using the first two principal components.
# The component selection keeps "at least one" component, so the projection
# may have a single column; indexing column 1 unconditionally would then
# raise an IndexError. In that case the points are drawn along one axis.
has_second_component = penguins_PCA.shape[1] >= 2
if has_second_component:
    second_axis = penguins_PCA[:, 1]
else:
    second_axis = [0] * len(penguins_PCA)

plt.scatter(penguins_PCA[:, 0], second_axis, c=labels)
plt.xlabel("First Principle Component")
if has_second_component:
    plt.ylabel("Second Principle Component")
plt.title(f"k-means Clustering (k= {n_clusters})")
plt.show()

# Finally, build a table of per-cluster means of the numeric columns.
penguins_clean['label'] = labels
# 'label' is numeric too, so it must be removed from the selection; otherwise
# the grouping key would be aggregated against itself and show up as a
# redundant column next to the index.
numeric_columns = penguins_clean.select_dtypes(include=["number"]).drop(columns="label")
stat_penguins = penguins_clean.groupby('label')[numeric_columns.columns].mean()
print(stat_penguins)

