In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the EastWestAirlines dataset from its CSV file.
# The path is defined once and reused, so relocating the data means editing a single line.
file_path = r"D:\DATA SCIENCE\ASSIGNMENTS\8 clustering\Clustering\EastWestAirlines.csv"
df = pd.read_csv(file_path)

In [None]:
# The ID# column is only a row identifier; discard it when the file provides one.
df = df.drop(columns=["ID#"], errors="ignore")

In [None]:
# Report (rows, columns) of the frame as it stands in this cell.
original_shape = df.shape
print("Original shape:", original_shape)

In [None]:
# Step 1: Missing-value check -- count the NaNs in every column and show the tally.
missing_per_column = df.isna().sum()
print("\nMissing values per column:\n", missing_per_column)

In [None]:
# Step 2: Outlier removal using IQR
def remove_outliers_iqr(data, factor=1.5, columns=None):
    """Drop rows that fall outside the IQR fences of the selected columns.

    For each column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR`` (both inclusive). Columns are processed one after
    another on the already-filtered frame, so the quartiles of a later column
    are computed on the rows that survived the earlier ones and the result can
    depend on column order. Rows holding NaN in a processed column are removed
    too, because NaN never lies between the fences.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is copied and never modified.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences.
    columns : iterable of column labels, optional
        Columns to filter on. Defaults to every numeric column of ``data``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, keeping their original index labels.
    """
    df_out = data.copy()
    if columns is None:
        columns = df_out.select_dtypes(include=[np.number]).columns
    for col in columns:
        q1 = df_out[col].quantile(0.25)
        q3 = df_out[col].quantile(0.75)
        reach = factor * (q3 - q1)
        # between() is inclusive on both ends, matching `>= lower` and `<= upper`.
        df_out = df_out[df_out[col].between(q1 - reach, q3 + reach)]
    return df_out

In [None]:
# Apply the IQR filter to the loaded frame and show how many rows/columns remain.
df_no_outliers = remove_outliers_iqr(data=df)
remaining_shape = df_no_outliers.shape
print("\nAfter outlier removal:", remaining_shape)

In [None]:
# Step 3: Scaling numeric features
# Every column except the Award? flag is standardized; the flag is handled separately.
feature_frame = df_no_outliers.drop(columns=["Award?"])
scaler = StandardScaler()
scaled_data = scaler.fit_transform(feature_frame)
df_scaled = pd.DataFrame(scaled_data, columns=feature_frame.columns)

In [None]:
# Re-attach the unscaled Award? flag. The raw array is used so values are placed
# by position rather than aligned on the (filtered) index of df_no_outliers.
award_values = df_no_outliers["Award?"].to_numpy()
df_scaled["Award?"] = award_values

In [None]:
# Write the scaled frame (plus Award?) to an Excel workbook beside the source data,
# leaving the row index out of the sheet.
output_path = r"D:\DATA SCIENCE\ASSIGNMENTS\8 clustering\Clustering\EastWestAirlines_Preprocessed.xlsx"
df_scaled.to_excel(excel_writer=output_path, index=False)

In [None]:
# Confirm where the preprocessed workbook was written.
print("\nPreprocessed dataset saved to: " + str(output_path))

In [None]:
import os
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# seaborn is optional: when it imports and configures cleanly the plots use its
# styling, otherwise the notebook falls back to plain matplotlib/pandas plotting.
try:
    import seaborn as sns
    sns.set(style="whitegrid")
except Exception:
    HAS_SEABORN = False
else:
    HAS_SEABORN = True

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# -------------- CONFIG --------------
# Path to the input data file; change this to your actual location if needed.
# load_data() below accepts .csv as well as .xlsx/.xls files (Excel files are
# read from the sheet named "data").
file_path = r"D:\DATA SCIENCE\ASSIGNMENTS\8 clustering\Clustering\EastWestAirlines.csv"

In [None]:
# Outputs are written next to the data file; create that folder if it is missing.
out_folder = Path(file_path).parent
out_folder.mkdir(parents=True, exist_ok=True)

In [None]:
# -------------- Load data --------------
def load_data(fp):
    """Read a tabular data file into a DataFrame, dispatching on its extension.

    ``.csv`` files go through ``pd.read_csv``. ``.xlsx``/``.xls`` files are read
    from the sheet named "data", first with the openpyxl engine and, if that
    raises, once more with pandas' default engine (a warning is printed).
    The extension comparison is case-insensitive.

    Raises
    ------
    ValueError
        If the extension is none of the supported ones.
    """
    source = Path(fp)
    ext = source.suffix.lower()

    if ext == ".csv":
        return pd.read_csv(source)

    if ext not in (".xlsx", ".xls"):
        raise ValueError("Unsupported file type: " + str(source.suffix))

    try:
        # Naming the engine explicitly avoids pandas warnings.
        return pd.read_excel(source, sheet_name="data", engine="openpyxl")
    except Exception as err:
        # Fallback: let pandas choose its default engine.
        print("Warning: read_excel with engine failed:", err)
        return pd.read_excel(source, sheet_name="data")

In [None]:
# Load the configured file and echo its dimensions and column names.
print("Loading data from:", file_path)
df = load_data(file_path)
raw_shape = df.shape
print("Raw shape:", raw_shape)
print("Columns:", list(df.columns))

In [None]:
# ID# is an identifier, not a feature: remove it when present and report the new shape.
has_id_column = 'ID#' in df.columns
if has_id_column:
    df = df.drop(columns='ID#')
    print("Dropped ID# column. New shape:", df.shape)

In [None]:
# -------------- Quick checks --------------
# Tally the NaNs in each column.
missing_per_column = df.isna().sum()
print("\nMissing values per column:\n", missing_per_column)

In [None]:
# Collect the names of all numeric columns; these drive the EDA visuals and clustering.
numeric_frame = df.select_dtypes(include=[np.number])
numeric_cols = list(numeric_frame.columns)
print("\nNumeric columns detected:", numeric_cols)

In [None]:
# At least two numeric columns are needed for the pairwise plots and PCA below.
n_numeric = len(numeric_cols)
if n_numeric < 2:
    raise SystemExit("Not enough numeric columns found for EDA/Clustering. Please check the data sheet.")

In [None]:
# Independent copy of the numeric columns, so later work cannot touch df.
df_num = df.loc[:, numeric_cols].copy()

In [None]:
# -------------- Descriptive statistics --------------
# One row per feature (describe() transposed), saved as CSV and echoed.
desc_file = out_folder / "eastwest_descriptive_stats.csv"
desc = df_num.describe().transpose()
desc.to_csv(desc_file)
print(f"\nDescriptive statistics saved to: {desc_file}")
print(desc)

In [None]:
# -------------- Histograms (before scaling) --------------
# DataFrame.hist() opens its own figure when no axes are passed, so no figure is
# created beforehand: a pre-created one would stay empty and never be closed.
df_num.hist(bins=30, figsize=(12, 8))
plt.suptitle("Histograms of numeric features (raw)", y=0.95)
plt.tight_layout()
hist_file = out_folder / "histograms_raw.png"
plt.savefig(hist_file, dpi=150)
plt.close()
print("Saved histograms to:", hist_file)

In [None]:
# -------------- Boxplots (detect outliers visually) --------------
# Both back-ends draw onto the same explicit axes. Without ax=..., the pandas
# fallback would open a second figure and leave the first one empty and unclosed.
box_fig, box_ax = plt.subplots(figsize=(12, 6))
if HAS_SEABORN:
    sns.boxplot(data=df_num, orient="h", ax=box_ax)
else:
    df_num.plot(kind="box", vert=False, ax=box_ax)
box_ax.set_title("Boxplots of numeric features (raw)")
box_fig.tight_layout()
box_file = out_folder / "boxplots_raw.png"
box_fig.savefig(box_file, dpi=150)
plt.close(box_fig)
print("Saved boxplots to:", box_file)

In [None]:
# -------------- Correlation heatmap --------------
# Output locations for the matrix (CSV) and its picture (PNG).
corr_file_csv = out_folder / "eastwest_correlation.csv"
corr_img = out_folder / "correlation_heatmap.png"

# Pairwise correlations of the numeric features, persisted as CSV.
corr = df_num.corr()
corr.to_csv(corr_file_csv)

plt.figure(figsize=(10, 8))
if not HAS_SEABORN:
    # Plain-matplotlib rendering: image plus colour bar, ticks labelled by column.
    tick_positions = range(len(corr))
    plt.imshow(corr, cmap="coolwarm", interpolation='nearest')
    plt.colorbar()
    plt.xticks(tick_positions, corr.columns, rotation=45, ha='right')
    plt.yticks(tick_positions, corr.columns)
else:
    sns.heatmap(corr, annot=True, fmt=".2f", cmap="vlag", square=True)
plt.title("Correlation matrix")
plt.tight_layout()
plt.savefig(corr_img, dpi=150)
plt.close()

print("Saved correlation heatmap to:", corr_img)
print("Correlation CSV saved to:", corr_file_csv)

In [None]:
# -------------- Pairwise scatter (sampled if many rows) --------------
# Only the first max_pairs numeric columns are plotted, to keep the grid readable.
max_pairs = 6
cols_for_pairs = numeric_cols[:max_pairs]
# Plot at most 1000 rows (fixed seed) for speed.
n_sampled_rows = min(1000, len(df_num))
sample = df_num[cols_for_pairs].sample(n=n_sampled_rows, random_state=42)

In [None]:
# Draw the pairwise grid with seaborn when available, otherwise with pandas'
# scatter_matrix; the saving and closing steps are shared by both routes.
if HAS_SEABORN:
    pair_file = out_folder / "pairplot_sample.png"
    sns.pairplot(sample, diag_kind="hist", plot_kws=dict(s=20, alpha=0.6))
    plt.gcf().set_size_inches(12, 10)
else:
    pair_file = out_folder / "scatter_matrix_sample.png"
    pd.plotting.scatter_matrix(sample, alpha=0.5, figsize=(12, 12), diagonal='hist')
plt.savefig(pair_file, dpi=150)
plt.close()
print("Saved pairwise/sample scatter to:", pair_file)

In [None]:
# -------------- PCA for 2D visualization (with scaling) --------------
# Fill gaps with each column's median, then standardize.
median_filled = df_num.fillna(df_num.median())
scaler = StandardScaler()
X_scaled = scaler.fit_transform(median_filled)

In [None]:
# Project the standardized features onto their first two principal components.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)
explained = pca.explained_variance_ratio_
print("\nPCA explained variance ratios (first 2):", explained)

In [None]:
# Scatter plot of PCA (color by Award? if present)
plt.figure(figsize=(8,6))
if 'Award?' in df.columns:
    # Treat Award? as a categorical label.
    labels = df['Award?'].astype(str).values
    unique_labels = np.unique(labels)
    label_map = {lab: i for i, lab in enumerate(unique_labels)}
    # One explicit tab10 colour per label, used for BOTH the points and the
    # legend. Passing bare integers as `c` would colour the points through the
    # default colormap, so the legend swatches would not match them.
    palette = {lab: plt.cm.tab10(i % 10) for lab, i in label_map.items()}
    colors = [palette[l] for l in labels]
    plt.scatter(X_pca[:,0], X_pca[:,1], c=colors, alpha=0.6, s=20)
    # Proxy artists: one round marker per label for the legend.
    handles = [
        plt.Line2D([0], [0], marker='o', color='w', label=str(lab),
                   markerfacecolor=palette[lab], markersize=6)
        for lab in unique_labels
    ]
    plt.legend(handles=handles, title="Award?")
else:
    plt.scatter(X_pca[:,0], X_pca[:,1], alpha=0.6, s=20)

In [None]:
# Label, save and close the PCA scatter.
# NOTE(review): these calls act on matplotlib's *current* figure and therefore
# rely on the figure opened in the previous cell still being current. With the
# inline notebook backend figures are presumably closed at the end of each
# cell -- confirm, or merge this cell with the scatter cell.
pca_file = out_folder / "pca_2d.png"
plt.title("PCA (2D) visualization of numeric features")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.tight_layout()
plt.savefig(pca_file, dpi=150)
plt.close()
print("Saved PCA 2D plot to:", pca_file)

In [None]:
# -------------- Save scaled numeric data for clustering stage --------------
# Wrap the scaled array with df_num's labels and write it without the index column.
scaled_out = out_folder / "eastwest_scaled_numeric.csv"
scaled_df = pd.DataFrame(data=X_scaled, index=df_num.index, columns=df_num.columns)
scaled_df.to_csv(scaled_out, index=False)
print("Saved scaled numeric features to:", scaled_out)

In [None]:
# -------------- Summary print for report --------------
created_files = (
    hist_file, box_file, corr_img, pair_file,
    pca_file, scaled_out, desc_file, corr_file_csv,
)
print("\n--- EDA SUMMARY ---")
print("Rows:", df.shape[0], "Numeric cols:", len(numeric_cols))
print("Saved outputs in:", out_folder)
print("Files created:")
for created in created_files:
    print(" -", created)
print("\nYou can include the above PNGs and CSVs in your report. Next, we can run clustering on the scaled features (eastwest_scaled_numeric.csv).")

cluster2.py
Clustering experiments: KMeans, Hierarchical (Agglomerative), DBSCAN
Saves plots and cluster summaries next to the data file.
Run in your venv: python cluster2.py

In [None]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Optional prettier plots: use seaborn when it imports and configures cleanly,
# otherwise fall back to plain matplotlib.
try:
    import seaborn as sns
    sns.set(style="whitegrid")
    HAS_SEABORN = True
except Exception:
    # `except Exception` (not a bare `except:`) so that KeyboardInterrupt and
    # SystemExit still propagate instead of being mistaken for "no seaborn".
    HAS_SEABORN = False

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.neighbors import NearestNeighbors

In [None]:
# -------------- CONFIG --------------
# Raw data file.
DATA_PATH = r"D:\DATA SCIENCE\ASSIGNMENTS\8 clustering\Clustering\EastWestAirlines.csv"
# Location of the scaled-feature CSV, in the same folder as the raw data file.
SCALED_CSV = Path(DATA_PATH).with_name("eastwest_scaled_numeric.csv")

In [None]:
# Plots and CSVs are written beside the data file; create the folder if needed.
OUT_DIR = Path(DATA_PATH).parent
os.makedirs(OUT_DIR, exist_ok=True)

In [None]:
# -------------- LOAD / PREPROCESS --------------
def load_prepared(scaled_csv=None):
    """Load the pre-scaled feature table used by every clustering step.

    Parameters
    ----------
    scaled_csv : path-like, optional
        CSV of scaled numeric features. Defaults to the configured
        ``SCALED_CSV`` rather than a second hard-coded copy of the path.

    Returns
    -------
    (raw_num, scaled_df) : tuple of pandas.DataFrame
        ``scaled_df`` is the table as read. ``raw_num`` is an independent copy
        of the SAME scaled values, not the unscaled data, so any per-cluster
        means computed from it are in scaled units.
    """
    if scaled_csv is None:
        scaled_csv = SCALED_CSV
    scaled_df = pd.read_csv(scaled_csv)

    # No unscaled source is read here; raw_num simply mirrors the scaled table.
    raw_num = scaled_df.copy()
    return raw_num, scaled_df

In [None]:
# Pull the prepared tables, then expose the feature matrix as a NumPy array
# together with the matching column names.
raw_num, X_scaled_df = load_prepared()
cols = list(X_scaled_df.columns)
X = X_scaled_df.to_numpy()

In [None]:
# 2-D PCA coordinates, reused by every cluster scatter plot below.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X)
explained = pca.explained_variance_ratio_
print("PCA explained variance (first 2):", explained)

In [None]:
# -------------- VISUAL: PCA scatter (no clusters) --------------
# Baseline picture of the data before any cluster labels are applied.
pc1, pc2 = X_pca[:, 0], X_pca[:, 1]
plt.figure(figsize=(7,5))
plt.scatter(pc1, pc2, s=10, alpha=0.6)
plt.title("PCA (2D) - raw (no clusters)")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.tight_layout()
plt.savefig(OUT_DIR / "pca_raw.png", dpi=150)
plt.close()

In [None]:
# -------------- K-MEANS: Elbow + Silhouette sweep --------------
# Candidate cluster counts (2 through 10) and the per-k metric accumulators.
Ks = [*range(2, 11)]
inertia, sil_scores = [], []

In [None]:
# Fit KMeans for every candidate k, recording inertia and silhouette score.
# The accumulators are rebuilt here so the cell is idempotent: appending to
# lists created in another cell would double their length on a re-run and
# break the elbow plot (len(Ks) != len(inertia)).
inertia = []
sil_scores = []
for k in Ks:
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = km.fit_predict(X)
    inertia.append(km.inertia_)
    sil_scores.append(silhouette_score(X, labels))

In [None]:
# Plot elbow & silhouette side by side: inertia on the left, silhouette on the right.
plt.figure(figsize=(12,4))

plt.subplot(1,2,1)
plt.plot(Ks, inertia, marker='o')
plt.title("KMeans Elbow (Inertia)")
plt.xlabel("k")
plt.ylabel("Inertia")

plt.subplot(1,2,2)
plt.plot(Ks, sil_scores, marker='o')
plt.title("KMeans Silhouette vs k")
plt.xlabel("k")
plt.ylabel("Silhouette score")

plt.tight_layout()
plt.savefig(OUT_DIR / "kmeans_elbow_silhouette.png", dpi=150)
plt.close()

In [None]:
# Pick the k with the highest silhouette score in the sweep.
best_position = int(np.argmax(sil_scores))
best_k = Ks[best_position]
print("KMeans: best k by silhouette in range 2-10:", best_k, "silhouette:", max(sil_scores))

In [None]:
# Final KMeans fit at the chosen k, with more initialisations (n_init=30) than the sweep.
kmeans = KMeans(n_clusters=best_k, n_init=30, random_state=42)
km_labels = kmeans.fit_predict(X)

In [None]:
# PCA scatter coloured by the KMeans assignment.
kmeans_plot_file = OUT_DIR / f"kmeans_k{best_k}_pca.png"
plt.figure(figsize=(7,5))
plt.scatter(X_pca[:,0], X_pca[:,1], c=km_labels, cmap='tab10', s=12, alpha=0.7)
plt.title(f"KMeans (k={best_k}) on PCA(2)")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.tight_layout()
plt.savefig(kmeans_plot_file, dpi=150)
plt.close()

In [None]:
# Save KMeans results: PCA coordinates with the cluster label of every row.
kmeans_points = pd.DataFrame({
    'PC1': X_pca[:,0],
    'PC2': X_pca[:,1],
    'kmeans_label': km_labels,
})
kmeans_points.to_csv(OUT_DIR / f"kmeans_k{best_k}_pca_labels.csv", index=False)

In [None]:
# Cluster sizes and silhouette score of the final KMeans fit.
km_counts = pd.Series(km_labels).value_counts()
print("KMeans cluster counts:\n", km_counts)
km_silhouette = silhouette_score(X, km_labels)
print("KMeans silhouette:", km_silhouette)

In [None]:
# -------------- Hierarchical (Agglomerative) --------------
# Dendrogram (on subset for readability)
sample_n = min(300, X.shape[0])
# Seeded generator so the sampled rows, and therefore the dendrogram, are
# reproducible across runs, like the other random_state=42 steps in this notebook.
sample_rng = np.random.default_rng(42)
sample_idx = sample_rng.choice(X.shape[0], size=sample_n, replace=False)
Z = linkage(X[sample_idx], method='ward')

In [None]:
# Draw the dendrogram truncated by level (p=5) and save it.
dendrogram_file = OUT_DIR / "dendrogram_ward_truncated.png"
plt.figure(figsize=(10, 4))
dendrogram(Z, truncate_mode='level', p=5)
plt.title("Dendrogram (ward) - truncated")
plt.tight_layout()
plt.savefig(dendrogram_file, dpi=150)
plt.close()

In [None]:
# Run agglomerative clustering once per linkage at k=best_k, keeping each
# labelling with its silhouette score and saving a PCA scatter per linkage.
linkages = ['ward', 'complete', 'average']
agg_results = {}
for link in linkages:
    # No metric is passed, so the default applies (ward requires euclidean).
    ac = AgglomerativeClustering(n_clusters=best_k, linkage=link)
    labels_ac = ac.fit_predict(X)
    s = silhouette_score(X, labels_ac)
    agg_results[link] = (labels_ac, s)

    # PCA scatter coloured by this linkage's labels.
    agg_plot_file = OUT_DIR / f"agg_{link}_k{best_k}_pca.png"
    plt.figure(figsize=(7,5))
    plt.scatter(X_pca[:,0], X_pca[:,1], c=labels_ac, cmap='tab10', s=12, alpha=0.7)
    plt.title(f"Agglomerative ({link}) k={best_k} silhouette={s:.3f}")
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.tight_layout()
    plt.savefig(agg_plot_file, dpi=150)
    plt.close()

    agg_counts = pd.Series(labels_ac).value_counts()
    print(f"Agglomerative ({link}) silhouette: {s:.4f} counts:\n", agg_counts)

In [None]:
# Save the labels of every linkage (one CSV each) next to the PCA coordinates.
for link in linkages:
    labels_ac, s = agg_results[link]
    agg_points = pd.DataFrame({
        'PC1': X_pca[:,0],
        'PC2': X_pca[:,1],
        f'agg_{link}_label': labels_ac,
    })
    agg_points.to_csv(OUT_DIR / f"agg_{link}_k{best_k}_labels.csv", index=False)

In [None]:
# -------------- DBSCAN: choose eps by k-NN knee + sweep --------------
# Query 5 neighbours for every row and sort the farthest of those distances;
# the knee of the resulting curve suggests an eps value.
# NOTE(review): the query set is the fitted set, so each row presumably appears
# as its own nearest neighbour (distance 0) -- confirm whether the last column
# is meant to be the 5th or the 4th "other" neighbour.
nbrs = NearestNeighbors(n_neighbors=5).fit(X)
distances, _ = nbrs.kneighbors(X)
kth_dist = np.sort(distances[:, -1])

knn_plot_file = OUT_DIR / "knn_5dist_sorted.png"
plt.figure(figsize=(6,4))
plt.plot(kth_dist)
plt.title("Sorted 5-NN distances (knee indicates eps)")
plt.xlabel("sorted points")
plt.ylabel("5-NN distance")
plt.tight_layout()
plt.savefig(knn_plot_file, dpi=150)
plt.close()

In [None]:
# Sweep eps and min_samples, keeping the configuration with the best silhouette.
# The silhouette is computed on non-noise points only (label != -1), so a
# setting that marks most rows as noise can still score well; read the printed
# noise counts alongside the scores.
eps_list = [0.3, 0.5, 0.7, 0.9, 1.1]
min_samples_list = [4, 6, 8]
best_db = None
best_db_score = -1
for eps in eps_list:
    for ms in min_samples_list:
        db = DBSCAN(eps=eps, min_samples=ms)
        labels_db = db.fit_predict(X)
        # Number of real clusters: distinct labels minus the noise label.
        n_clusters = len(set(labels_db)) - (1 if -1 in labels_db else 0)
        if n_clusters <= 1:
            # Silhouette is undefined for fewer than two clusters.
            score = -1
        else:
            mask = labels_db != -1
            try:
                score = silhouette_score(X[mask], labels_db[mask])
            except Exception:
                # Best-effort: a failing configuration is scored as -1. Unlike a
                # bare `except:`, KeyboardInterrupt/SystemExit still propagate.
                score = -1
        print(f"DBSCAN eps={eps}, min_samples={ms} -> clusters={n_clusters}, silhouette={score:.4f}, noise={(labels_db==-1).sum()}")
        # Strict '>' keeps best_db as None when nothing beats the initial -1.
        if score > best_db_score:
            best_db_score = score
            best_db = (eps, ms, labels_db)

In [None]:
# Report, plot and save the best DBSCAN configuration found by the sweep, if any.
if best_db is None:
    print("DBSCAN did not find a stable multi-cluster solution in the tried grid.")
else:
    eps, ms, labels_db = best_db
    print("Best DBSCAN:", eps, ms, "silhouette:", best_db_score)

    # PCA scatter coloured by the DBSCAN labels.
    plt.figure(figsize=(7,5))
    plt.scatter(X_pca[:,0], X_pca[:,1], c=labels_db, cmap='tab10', s=12, alpha=0.7)
    plt.title(f"DBSCAN (eps={eps}, min_samples={ms})")
    plt.tight_layout()
    plt.savefig(OUT_DIR / f"dbscan_eps{eps}_ms{ms}_pca.png", dpi=150)
    plt.close()

    # PCA coordinates with the DBSCAN label of every row.
    dbscan_points = pd.DataFrame({'PC1':X_pca[:,0],'PC2':X_pca[:,1],'dbscan_label':labels_db})
    dbscan_points.to_csv(OUT_DIR / f"dbscan_eps{eps}_ms{ms}_labels.csv", index=False)

In [None]:
# -------------- CLUSTER INTERPRETATION: per-cluster feature means --------------
def summarize_clusters(original_df, labels, name):
    """Compute per-cluster feature means and save them as a CSV in OUT_DIR.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Feature table with one row per entry of ``labels``; its index is
        discarded before the labels are attached.
    labels : array-like
        Cluster label of each row.
    name : str
        Prefix of the output file ``<name>_cluster_feature_means.csv``.

    Returns
    -------
    pandas.DataFrame
        Features as rows, clusters as columns, cell = mean of that feature.
    """
    labelled = original_df.reset_index(drop=True).assign(cluster=labels)
    summary = labelled.groupby('cluster').mean().T
    summary_file = OUT_DIR / f"{name}_cluster_feature_means.csv"
    summary.to_csv(summary_file)
    print(f"Saved cluster means for {name} to {summary_file}")
    return summary

In [None]:
# For kmeans: per-cluster feature means.
# raw_num is a copy of the scaled table (see load_prepared), so these means are
# in scaled units.
km_rows = raw_num.reset_index(drop=True).loc[X_scaled_df.index]
summary_km = summarize_clusters(km_rows, km_labels, f"kmeans_k{best_k}")
print("\nKMeans cluster means (truncated):")
print(summary_km.head())

In [None]:
# For hierarchical (ward): per-cluster feature means of the ward labelling.
ward_labels, ward_s = agg_results['ward']
ward_rows = raw_num.reset_index(drop=True).loc[X_scaled_df.index]
summary_ward = summarize_clusters(ward_rows, ward_labels, "agg_ward")
print("\nAgglomerative(ward) cluster means (truncated):")
print(summary_ward.head())

In [None]:
# Per-cluster feature means for the best DBSCAN run, when the sweep found one.
if best_db is not None:
    db_rows = raw_num.reset_index(drop=True).loc[X_scaled_df.index]
    summary_db = summarize_clusters(db_rows, labels_db, f"dbscan_eps{eps}_ms{ms}")
    print("\nDBSCAN cluster means (truncated):")
    print(summary_db.head())

In [None]:
# -------------- FINAL SUMMARY PRINT --------------
# One line per method with its silhouette score.
print("\n--- FINAL SUMMARY ---")
final_km_silhouette = silhouette_score(X, km_labels)
print(f"KMeans k={best_k} silhouette={final_km_silhouette:.4f}")
for link in linkages:
    link_silhouette = agg_results[link][1]
    print(f"Agglomerative ({link}) silhouette={link_silhouette:.4f}")
if best_db is not None:
    print(f"Best DBSCAN eps={eps}, min_samples={ms} silhouette={best_db_score:.4f}")

In [None]:
# Point the reader at the folder holding every generated plot and CSV.
outputs_location = OUT_DIR
print("All plots and CSV outputs saved to:", outputs_location)