#CLUSTERING ANALYSIS

In [1]:
from google.colab import files
uploaded = files.upload()

Saving EastWestAirlines.xlsx to EastWestAirlines.xlsx


In [33]:
import pandas as pd

# Load the dataset
file_path = 'EastWestAirlines.xlsx'
data = pd.read_excel(file_path, sheet_name=0)  # Load the first sheet

# Display the first few rows to understand its structure
print(data.head())
print(data.columns)


  East-West Airlines is trying to learn more about its customers.  Key issues are their  \
0  flying patterns, earning and use of frequent f...                                      
1  card.  The task is to identify customer segmen...                                      
2                                                NaN                                      
3                                                NaN                                      
4  Source: Based upon real business data; company...                                      

  Unnamed: 1 Unnamed: 2 Unnamed: 3 Unnamed: 4  
0        NaN        NaN        NaN        NaN  
1        NaN        NaN        NaN        NaN  
2        NaN        NaN        NaN        NaN  
3        NaN        NaN        NaN        NaN  
4        NaN        NaN        NaN        NaN  
Index(['East-West Airlines is trying to learn more about its customers.  Key issues are their',
       'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],
     

In [34]:
# Inspect the first few rows to identify how data should be processed
print(data.head(10))
print(data.columns)

# Example cleaning steps based on typical issues:
# Drop columns and rows with all NaN values
data = data.dropna(axis=1, how='all')
data = data.dropna(how='all')

# Rename columns if needed (adjust based on inspection of your dataset)
data.columns = ['ID', 'Number', 'Char1', 'Telcom', 'Unique_ID']  # Adjust based on your dataset

# Keep only the relevant columns
data = data[['Number', 'Char1', 'Telcom']]

# Convert columns to numeric and drop any rows with NaN values
data = data.apply(pd.to_numeric, errors='coerce')
data = data.dropna()

# Check the data after processing
print(data.head())
print(data.shape)


  East-West Airlines is trying to learn more about its customers.  Key issues are their  \
0  flying patterns, earning and use of frequent f...                                      
1  card.  The task is to identify customer segmen...                                      
2                                                NaN                                      
3                                                NaN                                      
4  Source: Based upon real business data; company...                                      
5             (c) 2016 Galit Shmueli and Peter Bruce                                      
6                                                NaN                                      
7                                                NaN                                      
8                                         Field Name                                      
9                                                ID#                                      

In [35]:
# Print data types and non-null counts
print(data.info())

# Verify data content
print(data.describe())


<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Number  0 non-null      float64
 1   Char1   0 non-null      float64
 2   Telcom  0 non-null      float64
dtypes: float64(3)
memory usage: 0.0 bytes
None
       Number  Char1  Telcom
count     0.0    0.0     0.0
mean      NaN    NaN     NaN
std       NaN    NaN     NaN
min       NaN    NaN     NaN
25%       NaN    NaN     NaN
50%       NaN    NaN     NaN
75%       NaN    NaN     NaN
max       NaN    NaN     NaN


In [36]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.neighbors import NearestNeighbors
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt
import numpy as np

# Load the dataset
file_path = 'EastWestAirlines.xlsx'
data = pd.read_excel(file_path, sheet_name=0)  # Load the first sheet

# Inspect initial data
print(data.head())
print(data.columns)

# Drop fully empty columns and rows
data = data.dropna(axis=1, how='all')
data = data.dropna(how='all')

# Renaming columns if necessary
data.columns = ['ID', 'Number', 'Char1', 'Telcom', 'Unique_ID']  # Adjust based on your dataset

# Select only relevant columns for clustering
data = data[['Number', 'Char1', 'Telcom']]

# Convert to numeric and handle missing values
data = data.apply(pd.to_numeric, errors='coerce')
data = data.dropna()

# Verify if the data is empty after preprocessing
if data.empty:
    print("Data is empty after preprocessing. Please check the data and preprocessing steps.")
else:
    # Feature Scaling
    scaler = StandardScaler()
    X = scaler.fit_transform(data)

    # PCA for visualization
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)

    # K-Means Clustering
    inertia = []
    K = range(1, 11)
    for k in K:
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(X)
        inertia.append(kmeans.inertia_)

    plt.figure(figsize=(10, 7))
    plt.plot(K, inertia, 'bx-')
    plt.xlabel('Number of clusters (k)')
    plt.ylabel('Inertia')
    plt.title('Elbow Method for Optimal k')
    plt.show()

    optimal_k = 4  # Update based on the elbow plot
    kmeans = KMeans(n_clusters=optimal_k, random_state=42)
    kmeans_clusters = kmeans.fit_predict(X)

    # Hierarchical Clustering
    plt.figure(figsize=(10, 7))
    Z = linkage(X, 'ward')
    dendrogram(Z)
    plt.title('Dendrogram')
    plt.xlabel('Sample index')
    plt.ylabel('Distance')
    plt.show()

    optimal_k_hierarchical = 4  # Update based on the dendrogram
    hierarchical = AgglomerativeClustering(n_clusters=optimal_k_hierarchical, linkage='ward')
    hierarchical_clusters = hierarchical.fit_predict(X)

    # DBSCAN
    neigh = NearestNeighbors(n_neighbors=5)
    distances, indices = neigh.fit_transform(X), neigh.kneighbors(X)
    distances = np.sort(distances[:, 4], axis=0)
    plt.figure(figsize=(10, 7))
    plt.plot(distances)
    plt.title('K-distance Graph')
    plt.xlabel('Points sorted by distance')
    plt.ylabel('Distance')
    plt.show()

    optimal_eps = 0.5  # Update based on K-distance graph
    dbscan = DBSCAN(eps=optimal_eps, min_samples=5)
    dbscan_clusters = dbscan.fit_predict(X)

    # Visualization
    plt.figure(figsize=(14, 7))

    plt.subplot(1, 3, 1)
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=kmeans_clusters, cmap='viridis')
    plt.title('K-Means Clustering')

    plt.subplot(1, 3, 2)
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=hierarchical_clusters, cmap='viridis')
    plt.title('Hierarchical Clustering')

    plt.subplot(1, 3, 3)
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=dbscan_clusters, cmap='viridis')
    plt.title('DBSCAN Clustering')

    plt.show()

    # Evaluation
    print(f"K-Means Silhouette Score: {silhouette_score(X, kmeans_clusters):.2f}")
    print(f"Hierarchical Silhouette Score: {silhouette_score(X, hierarchical_clusters):.2f}")
    if len(set(dbscan_clusters)) > 1:
        print(f"DBSCAN Silhouette Score: {silhouette_score(X, dbscan_clusters):.2f}")
    else:
        print("DBSCAN did not form any clusters.")

  East-West Airlines is trying to learn more about its customers.  Key issues are their  \
0  flying patterns, earning and use of frequent f...                                      
1  card.  The task is to identify customer segmen...                                      
2                                                NaN                                      
3                                                NaN                                      
4  Source: Based upon real business data; company...                                      

  Unnamed: 1 Unnamed: 2 Unnamed: 3 Unnamed: 4  
0        NaN        NaN        NaN        NaN  
1        NaN        NaN        NaN        NaN  
2        NaN        NaN        NaN        NaN  
3        NaN        NaN        NaN        NaN  
4        NaN        NaN        NaN        NaN  
Index(['East-West Airlines is trying to learn more about its customers.  Key issues are their',
       'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],
     