# In [None]:
# %% [markdown]
# Clustering Analysis — EastWestAirlines
# Author: Maddy
# Folder: D:\DATA SCIENCE\ASSIGNMENTS\8 clustering\Clustering\files
# Notebook: EDA → Preprocessing → KMeans / Agglomerative / DBSCAN → Evaluation → Save outputs

# %%
# 1. Imports
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

try:
    import seaborn as sns
    sns.set(style='whitegrid')
    HAS_SEABORN = True
except Exception:
    HAS_SEABORN = False

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score, silhouette_samples

print('imports ok')

# %%
# 2. Paths & load
# Input CSV and every generated artefact share one assignment folder.
# Edit BASE (or DATA_CSV) when running on a different machine.
BASE = Path(r"D:\DATA SCIENCE\ASSIGNMENTS\8 clustering\Clustering\files")
OUT_DIR = BASE
DATA_CSV = BASE.joinpath("EastWestAirlines.csv")  # update if different
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Stop early with an actionable message rather than a pandas traceback.
if not DATA_CSV.exists():
    raise FileNotFoundError(f"Data file not found at {DATA_CSV}. Put the CSV there or edit DATA_CSV.")

print(f'Loading: {DATA_CSV}')
df = pd.read_csv(DATA_CSV)
print(f'shape: {df.shape}')
print(f'columns: {df.columns.tolist()}')

# %%
# 3. Quick EDA
# Prints missing-value counts and the numeric columns, then writes summary
# statistics plus raw histograms/boxplots to OUT_DIR.
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print('\nMissing values per column:\n', df.isnull().sum())
print('\nNumeric dtypes:')
print(num_cols)

df.describe().T.to_csv(OUT_DIR / 'descriptive_stats.csv')
print('Saved descriptive_stats.csv')

# Histograms of every numeric column (no sampling is applied).
# DataFrame.hist creates its own figure, so no plt.figure() is opened first;
# doing so would leave an empty figure behind that is never closed.
df[num_cols].hist(bins=30, figsize=(12, 8))
hist_fig = plt.gcf()
hist_fig.suptitle('Histograms (raw)')
hist_fig.tight_layout()
hist_fig.savefig(OUT_DIR / 'histograms_raw.png', dpi=150)
plt.close(hist_fig)
print('Saved histograms_raw.png')

# Boxplots: both back-ends draw onto the same explicit axes so exactly one
# figure is created, saved and closed regardless of whether seaborn is present.
box_fig, box_ax = plt.subplots(figsize=(12, 6))
if HAS_SEABORN:
    sns.boxplot(data=df[num_cols], orient='h', ax=box_ax)
else:
    df[num_cols].plot(kind='box', vert=False, ax=box_ax)
box_ax.set_title('Boxplots (raw)')
box_fig.tight_layout()
box_fig.savefig(OUT_DIR / 'boxplots_raw.png', dpi=150)
plt.close(box_fig)
print('Saved boxplots_raw.png')

# %%
# 4. Preprocessing: drop ID, remove outliers via IQR, scale
# 'ID#' is dropped when present so it is not used as a clustering feature.
df = df.drop(columns=['ID#'], errors='ignore')

# ensure numeric-only features for clustering (keep Award? aside if present)
# The first matching candidate name wins; None means no label column exists.
label_col = next(
    (name for name in ['Award?', 'Award', 'award', 'Target'] if name in df.columns),
    None,
)

# Every numeric column except the label becomes a clustering feature.
numeric_cols = [
    col
    for col in df.select_dtypes(include=[np.number]).columns.tolist()
    if col != label_col
]

print('Numeric features used for clustering:', numeric_cols)

# Remove outliers by iteratively applying IQR filter across numeric columns
def remove_outliers_iqr(df_in, cols, factor=1.5):
    """Return a copy of *df_in* without rows outside the IQR fences of *cols*.

    Columns are processed in order and each filter is applied to the rows that
    survived the previous columns, so quartiles are recomputed on the
    progressively reduced frame.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Input frame; it is not modified.
    cols : iterable of str
        Columns to filter on.
    factor : float, default 1.5
        Multiplier of the inter-quartile range used to place the lower and
        upper fences (``Q1 - factor * IQR`` and ``Q3 + factor * IQR``).

    Returns
    -------
    pandas.DataFrame
        Filtered copy that keeps the original index labels of surviving rows.

    Notes
    -----
    Rows whose value in a column is missing are kept: a missing value is not
    an outlier, and dropping it here would silently bypass any imputation the
    caller performs afterwards.
    """
    df_out = df_in.copy()
    for col in cols:
        values = df_out[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        spread = q3 - q1
        lower = q1 - factor * spread
        upper = q3 + factor * spread
        # NaN fails both comparisons, so it is explicitly allowed through.
        within_fences = (values >= lower) & (values <= upper)
        df_out = df_out[within_fences | values.isna()]
    return df_out

print(f'Original rows: {len(df)}')
df_no_out = remove_outliers_iqr(df, numeric_cols)
print(f'After IQR outlier removal rows: {len(df_no_out)}')

# scale
# Any remaining missing values are replaced by the column median before the
# features are standardised with StandardScaler.
features_filled = df_no_out[numeric_cols]
features_filled = features_filled.fillna(features_filled.median())
scaler = StandardScaler()
X_scaled = scaler.fit_transform(features_filled)

# scaled_df keeps the surviving rows' index in memory; the CSV is written
# without the index column.
scaled_df = pd.DataFrame(X_scaled, index=df_no_out.index, columns=numeric_cols)
scaled_df.to_csv(OUT_DIR / 'eastwest_scaled_numeric.csv', index=False)
print('Saved eastwest_scaled_numeric.csv')

# %%
# 5. PCA for visualization
# Projects the scaled features onto two principal components and saves a
# scatter plot, coloured by the label column when one was detected.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)
print('PCA explained variance (first 2):', pca.explained_variance_ratio_)

plt.figure(figsize=(8,6))
if label_col:
    labels = df_no_out[label_col].astype(str).values
    uniq = np.unique(labels)
    cmap = plt.cm.tab10
    # Map each label value to an explicit tab10 colour (wrapping around if
    # there are more values than palette entries). The same mapping feeds both
    # the scatter and the legend, so the legend swatches match the points.
    color_map = {v: cmap(i % cmap.N) for i, v in enumerate(uniq)}
    colors = [color_map[v] for v in labels]
    plt.scatter(X_pca[:,0], X_pca[:,1], c=colors, s=10, alpha=0.6)
    # legend
    handles = [
        plt.Line2D([0], [0], marker='o', color='w', label=str(u),
                   markerfacecolor=color_map[u], markersize=6)
        for u in uniq
    ]
    plt.legend(handles=handles, title=label_col)
else:
    plt.scatter(X_pca[:,0], X_pca[:,1], alpha=0.6, s=10)

plt.xlabel('PC1'); plt.ylabel('PC2'); plt.title('PCA (2D)')
plt.tight_layout()
plt.savefig(OUT_DIR / 'pca_2d.png', dpi=150)
plt.close()
print('Saved pca_2d.png')

# %%
# 6. KMeans: elbow + silhouette sweep
# NOTE: only silhouette scores are recorded for k = 2..10; no inertia values
# are collected, so no elbow curve is produced by this cell.
from collections import defaultdict

results_kmeans = []
for n_clusters in range(2, 11):
    candidate = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    score = silhouette_score(X_scaled, candidate.fit_predict(X_scaled))
    results_kmeans.append((n_clusters, score))

k_vals, sils = zip(*results_kmeans)
sweep_table = pd.DataFrame({'k': k_vals, 'silhouette': sils})
sweep_table.to_csv(OUT_DIR / 'kmeans_silhouette.csv', index=False)
print('KMeans silhouette results saved to kmeans_silhouette.csv')

# choose best k by silhouette (the smallest k wins on ties)
best_sil = max(sils)
best_k = k_vals[sils.index(best_sil)]
print('KMeans: best k by silhouette in range 2-10:', best_k, 'silhouette:', best_sil)

# fit best k and save cluster means
# Means are computed on the standardised features, so they are in z-score
# units rather than the original units.
kmeans_best = KMeans(n_clusters=best_k, random_state=42, n_init=10)
kmeans_best.fit(X_scaled)
labels_k = kmeans_best.labels_
scaled_df['kmeans_cluster'] = labels_k
cluster_means = scaled_df.groupby('kmeans_cluster').mean()
means_path = OUT_DIR / f'kmeans_k{best_k}_cluster_feature_means.csv'
cluster_means.reset_index().to_csv(means_path, index=False)
print('Saved cluster means for kmeans_k', best_k)

# PCA scatter colored by KMeans
plt.figure(figsize=(8, 6))
pc1, pc2 = X_pca[:, 0], X_pca[:, 1]
plt.scatter(pc1, pc2, s=10, alpha=0.7, c=labels_k, cmap='tab10')
plt.title(f'KMeans k={best_k} PCA')
plt.savefig(OUT_DIR / f'kmeans_k{best_k}_pca.png', dpi=150)
plt.close()

# silhouette score
print('KMeans silhouette:', best_sil)

# %%
# 7. Agglomerative clustering experiments
# Runs agglomerative clustering with the KMeans-selected cluster count for
# three linkage criteria, stores labels and silhouette per linkage, and writes
# per-cluster feature means (in standardised units) for each.
agg_results = {}
linkages = ['ward','complete','average']
for link in linkages:
    # No linkage is skipped: the distance setting is left at its default for
    # all three runs.
    agg = AgglomerativeClustering(n_clusters=best_k, linkage=link)
    labels_a = agg.fit_predict(X_scaled)
    try:
        sil = silhouette_score(X_scaled, labels_a)
    except ValueError as exc:
        # Raised for an unusable labelling (e.g. a single cluster). Report the
        # reason instead of hiding it, and record NaN so the run continues.
        print(f'Agglomerative ({link}) silhouette could not be computed: {exc}')
        sil = np.nan
    agg_results[link] = (labels_a, sil)
    print(f'Agglomerative ({link}) silhouette: {sil}')
    # save cluster means
    df_temp = pd.DataFrame(X_scaled, columns=numeric_cols)
    df_temp['cluster'] = labels_a
    df_temp.groupby('cluster').mean().reset_index().to_csv(OUT_DIR / f'agg_{link}_cluster_feature_means.csv', index=False)

# pick best linkage by silhouette
# NaN scores rank below every real score; ties resolve to the earliest linkage.
best_link, best_link_val = max(
    agg_results.items(),
    key=lambda kv: -np.inf if np.isnan(kv[1][1]) else kv[1][1],
)
print('Best agglomerative linkage:', best_link, 'silhouette:', best_link_val[1])

# PCA scatter color by best agg
labels_agg_best = agg_results[best_link][0]
plt.figure(figsize=(8,6))
plt.scatter(X_pca[:,0], X_pca[:,1], c=labels_agg_best, cmap='tab10', s=10, alpha=0.7)
plt.title(f'Agglomerative ({best_link}) PCA')
plt.savefig(OUT_DIR / f'agg_{best_link}_pca.png', dpi=150)
plt.close()

# %%
# 8. DBSCAN parameter sweep (eps, min_samples)
# Tries every (eps, min_samples) pair, records cluster count, silhouette and
# noise count, then refits and plots the best-scoring configuration.
dbscan_results = []
eps_vals = [0.3, 0.5, 0.7, 0.9, 1.1]
min_samples_vals = [4,6,8]
for eps in eps_vals:
    for ms in min_samples_vals:
        lab = DBSCAN(eps=eps, min_samples=ms).fit_predict(X_scaled)
        # number of clusters (exclude noise label -1)
        n_clusters = len(set(lab)) - (1 if -1 in lab else 0)
        # Silhouette is only scored with at least 2 real clusters; otherwise
        # NaN is stored so the sweep CSV never contains a fake numeric score.
        sil = silhouette_score(X_scaled, lab) if n_clusters >= 2 else np.nan
        dbscan_results.append({
            'eps': eps,
            'min_samples': ms,
            'n_clusters': n_clusters,
            'silhouette': sil,
            'noise': int(np.sum(lab == -1)),
        })

df_db = pd.DataFrame(dbscan_results)
df_db.to_csv(OUT_DIR / 'dbscan_sweep.csv', index=False)
print('Saved dbscan_sweep.csv')

# Best DBSCAN by silhouette
if df_db['silhouette'].notna().any():
    best_idx = df_db['silhouette'].idxmax()  # NaN entries are skipped
else:
    # Nothing produced 2+ clusters: say so, then fall back to the first
    # configuration so the remaining outputs can still be generated.
    print('Warning: no DBSCAN setting produced at least 2 clusters; '
          'falling back to the first configuration in the sweep.')
    best_idx = df_db.index[0]
best_db = df_db.loc[best_idx]
print('Best DBSCAN:', best_db.to_dict())

# The selected row is a mixed int/float Series and is upcast to float, so the
# parameters are converted once here and reused for the fit, file names and title.
best_eps = float(best_db['eps'])
best_ms = int(best_db['min_samples'])

# fit best DBSCAN and save cluster means
db_best = DBSCAN(eps=best_eps, min_samples=best_ms).fit(X_scaled)
labels_db = db_best.labels_
db_means = (
    pd.DataFrame(X_scaled, columns=numeric_cols)
    .assign(cluster=labels_db)
    .groupby('cluster')
    .mean()
    .reset_index()
)
db_means.to_csv(OUT_DIR / f"dbscan_eps{best_eps}_ms{best_ms}_cluster_feature_means.csv", index=False)

# PCA scatter for DBSCAN
plt.figure(figsize=(8,6))
plt.scatter(X_pca[:,0], X_pca[:,1], c=labels_db, cmap='tab10', s=10, alpha=0.7)
plt.title(f'DBSCAN eps={best_eps} ms={best_ms}')
plt.savefig(OUT_DIR / f'dbscan_eps{best_eps}_ms{best_ms}_pca.png', dpi=150)
plt.close()

# %%
# 9. Summary outputs saved
# Recaps the best configuration found for each algorithm and where the
# generated files were written.
agg_best_sil = agg_results[best_link][1]
summary_lines = [
    '\n--- FINAL SUMMARY ---',
    f'KMeans k={best_k} silhouette={best_sil:.4f}',
    f'Agglomerative best linkage: {best_link} silhouette: {agg_best_sil}',
    f'DBSCAN best: {best_db.to_dict()}',
    f'\nSaved outputs to: {OUT_DIR}',
    'Files created include: eastwest_scaled_numeric.csv, descriptive_stats.csv, histograms_raw.png, boxplots_raw.png, correlation_heatmap.png (if computed), pca_2d.png, kmeans and agg and dbscan CSVs and PCA plots',
]
print('\n'.join(summary_lines))

# End of notebook
