## Assignment-08 CLUSTERING ANALYSIS 

In [1]:
import pandas as pd
# Load the airline data from the workbook; integer sheet_name values are
# zero-indexed in pandas, so sheet_name=1 reads the second sheet.
df = pd.read_excel("EastWestAirlines.xlsx", sheet_name=1)

In [4]:
# Summarise column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3999 entries, 0 to 3998
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   ID#                3999 non-null   int64
 1   Balance            3999 non-null   int64
 2   Qual_miles         3999 non-null   int64
 3   cc1_miles          3999 non-null   int64
 4   cc2_miles          3999 non-null   int64
 5   cc3_miles          3999 non-null   int64
 6   Bonus_miles        3999 non-null   int64
 7   Bonus_trans        3999 non-null   int64
 8   Flight_miles_12mo  3999 non-null   int64
 9   Flight_trans_12    3999 non-null   int64
 10  Days_since_enroll  3999 non-null   int64
 11  Award?             3999 non-null   int64
dtypes: int64(12)
memory usage: 375.0 KB


In [None]:
# Strip the '#' and '?' characters from the two awkward column names.
clean_names = {'ID#': 'ID', 'Award?': 'Award'}
df = df.rename(columns=clean_names)

In [None]:
# Use the ID column as the row index, then display the resulting frame.
df = df.set_index('ID')
df

## Data exploration 

#### Descriptive Statistics

In [None]:
# Summary statistics, transposed so each column of df becomes one row.
df.describe().transpose()

In [None]:
# The cc*_miles columns have different maxima, so list the distinct values
# each of them actually takes.
for card in ('cc1', 'cc2', 'cc3'):
    print(f'unique_{card}', df[f'{card}_miles'].unique())

In [None]:
# Duplicate check: number of rows that repeat an earlier row.
df.duplicated().sum()

In [None]:
# Show the rows flagged as duplicates of an earlier row.
is_duplicate = df.duplicated()
df.loc[is_duplicate]

In [None]:
# Remove duplicated rows (drop_duplicates keeps the first occurrence by default).
df = df.drop_duplicates()

In [None]:
# Re-check after dropping: the duplicate count is expected to be 0 now.
df.duplicated().sum()

In [None]:
# Missing-value check: count of nulls per column.
df.isna().sum()

## Exploratory data analysis

In [None]:
import plotly.express as px

# One interactive histogram per column of df.
for feature in df.columns:
    histogram = px.histogram(df, x=feature, title=f'{feature} Distribution')
    histogram.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Kernel density estimate of every column, laid out on a 3 x 4 grid.
fig = plt.figure(figsize=(15, 10))

for position, column in enumerate(df.columns, start=1):
    ax = fig.add_subplot(3, 4, position)
    sns.kdeplot(df[column], fill=True, ax=ax)
    ax.set_title(f'Kernel Density Estimation for {column}')
    ax.set_xlabel('Value')
    ax.set_ylabel('Density')

plt.tight_layout()
plt.show()

In [None]:
# One horizontal boxplot per feature (every column except Award), stacked
# vertically, to eyeball outliers. The plots are drawn from a copy of df.
ot = df.copy()
boxplot_columns = [
    'Balance', 'Qual_miles', 'cc1_miles', 'cc2_miles', 'cc3_miles',
    'Bonus_miles', 'Bonus_trans', 'Flight_miles_12mo', 'Flight_trans_12',
    'Days_since_enroll',
]
fig, axes = plt.subplots(len(boxplot_columns), 1, figsize=(16, 16),
                         sharex=False, sharey=False)
for feature_ax, feature in zip(axes, boxplot_columns):
    sns.boxplot(x=feature, data=ot, palette='crest', ax=feature_ax)
plt.tight_layout(pad=2.0)

In [None]:
# All columns side by side on one axis, on their raw scale.
box_fig, box_ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=df, ax=box_ax)

In [None]:
import numpy as np
# Same comparison after a square-root transform of every column.
sqrt_fig, sqrt_ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=np.sqrt(df), ax=sqrt_ax)

## Data Visualization

In [None]:
# Share of rows with Award == 0 and with Award == 1.
# (The count variables keep their original names.)
total_customers = len(df.Award)
countNotermdeposit = int((df.Award == 0).sum())
counthavetermdeposit = int((df.Award == 1).sum())

pct_without_award = (countNotermdeposit / total_customers) * 100
pct_with_award = (counthavetermdeposit / total_customers) * 100
print("Percentage of Customers without an Award: {:.2f}%".format(pct_without_award))
print("Percentage of Customers with an Award: {:.2f}%".format(pct_with_award))

In [None]:
# Row count per Award value, bars ordered from most to least frequent.
award_order = df['Award'].value_counts().index
sns.countplot(x='Award', data=df, order=award_order)
plt.xticks(fontsize=12)
plt.title('Whether the client has a Award or not ')

In [None]:
# Balance: number of miles eligible for award travel.
# Bar chart of Balance for each Award value.
plt.figure(figsize=(5, 5))
Balance = df.loc[:, ['Award', 'Balance']].sort_values(by='Balance', ascending=False)
ax = sns.barplot(x='Award', y='Balance', data=Balance)
ax.set_xlabel('Award')
ax.set_ylabel('Balance')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Pairwise correlations; show every column's correlation with Balance,
# strongest positive first.
corr_matrix = df.corr()
balance_corr = corr_matrix.loc[:, "Balance"]
balance_corr.sort_values(ascending=False)

In [None]:
# Correlation heatmap, annotated with one-decimal coefficients.
f, ax = plt.subplots(figsize=(9, 6))
heatmap_style = dict(annot=True, fmt='.1f', linewidths=.5)
sns.heatmap(df.corr(), ax=ax, **heatmap_style)
plt.show()

In [None]:
# Non-flight bonus transactions for each level of frequent-flyer
# credit-card miles (cc1_miles).
plt.figure(figsize=(5, 5))
sorted_data = df.loc[:, ['cc1_miles', 'Bonus_trans']].sort_values(by='Bonus_trans', ascending=False)
ax = sns.barplot(x='cc1_miles', y='Bonus_trans', data=sorted_data)
ax.set_xlabel('Miles earned with freq. flyer credit card')
ax.set_ylabel('Non-flight bonus transactions')
plt.xticks(rotation=90)
plt.show()

## Data Processing

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every column of df with StandardScaler; std_df holds the
# transformed values.
standard_scaler = StandardScaler()
standard_scaler.fit(df)
std_df = standard_scaler.transform(df)
std_df.shape

In [None]:
# Normalize the data with MinMaxScaler so the results can be compared with the standardized data
from sklearn.preprocessing import MinMaxScaler
# Rescale every column of df with MinMaxScaler; minmax_df holds the result.
minmax = MinMaxScaler()
minmax.fit(df)
minmax_df = minmax.transform(df)
minmax_df.shape

## K-means clustering

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Elbow method: fit K-means for k = 1..10 on the standardized data and
# record the inertia of each fit.
cluster_range = range(1, 11)
inertia = []
for k in cluster_range:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(std_df)
    inertia.append(kmeans.inertia_)

# Inertia against k; the bend in the curve suggests a cluster count.
elbow_fig, elbow_ax = plt.subplots(figsize=(10, 6))
elbow_ax.plot(cluster_range, inertia, marker='o', linestyle='--')
elbow_ax.set_xlabel('Number of Clusters (K)')
elbow_ax.set_ylabel('Inertia')
elbow_ax.set_title('Elbow Method')
plt.show()

In [None]:
# Final K-means model on the standardized data.
# NOTE(review): k = 6 is fixed here, presumably read off the elbow plot; confirm.
k = 6
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(std_df)

In [None]:
# Store the K-means label of each row in a 'Cluster' column (assigned by position).
df['Cluster'] = kmeans.labels_

In [None]:
# Remove the 'Cluster' column again so df holds only the original features.
df.drop(columns='Cluster', inplace=True)

In [None]:
# List the columns currently present in df.
df.columns

In [None]:
 fig = px.scatter(df, x=df.columns[0], y=df.columns[1], color='Kmeans_Cluster',
                 title='K-means Clustering',
                 labels={df.columns[0]: 'Feature 1', df.columns[1]: 'Feature 2', 'Cluster': 'Cluster'},
                 color_continuous_scale='viridis')

# Show the plot
fig.show()

## Hierarchical Clustering Algorithm

In [None]:
from scipy.cluster.hierarchy import linkage
import scipy.cluster.hierarchy as sch

In [None]:
# Complete-linkage hierarchy on the min-max scaled data, Euclidean distance.
z = sch.linkage(minmax_df, method="complete", metric="euclidean")

In [None]:
# Dendrogram of the complete-linkage hierarchy stored in z.
plt.figure(figsize=(15, 8))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Index')
plt.ylabel('Distance')
sch.dendrogram(z, leaf_rotation=0, leaf_font_size=5)
plt.show()

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative clustering into 4 clusters with complete linkage on the
# min-max scaled data; print the resulting label of every row.
h_complete = AgglomerativeClustering(n_clusters=4, linkage='complete')
h_complete.fit(minmax_df)
labels = h_complete.labels_
print(labels)


In [None]:
# Wrap the hierarchical labels in a Series that shares df's index (the ID
# column). A default 0..n-1 index would make a later column assignment align
# on index values instead of row position, shifting labels and leaving NaN.
cluster_labels = pd.Series(h_complete.labels_, index=df.index)

In [None]:
# Assign the labels by row position. Assigning a Series aligns on index, and
# df is indexed by ID while the labels are ordered like the rows that were
# clustered, so the values are converted to a plain array first.
df['Hierar_Cluster'] = cluster_labels.to_numpy()

In [None]:
# Preview the first rows, now including the Hierar_Cluster column.
df.head()

In [None]:
# Mean of every column within each hierarchical cluster.
hierar_groups = df.groupby(df['Hierar_Cluster'])
hierar_groups.mean()

## DBSCAN

In [None]:
import numpy as np
from itertools import product
from itertools import product
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score as sil
from sklearn.datasets import make_blobs





In [None]:
# Grid search over DBSCAN hyperparameters on the standardized airline data.
#
# `std_df` is the StandardScaler output computed in the "Data Processing"
# section and is reused as-is. The airline `df` is deliberately NOT replaced
# with random placeholder data here: the cells further down attach the DBSCAN
# labels to `df` and profile the clusters by its columns.

# Candidate hyperparameters
eps_values = np.arange(0.25, 3, 0.25)  # eps: 0.25, 0.50, ..., 2.75
min_samples = np.arange(3, 23)         # min_samples: 3, 4, ..., 22
DBSCAN_params = list(product(eps_values, min_samples))

no_of_clusters = []
sil_score = []

for eps, n_min in DBSCAN_params:
    DBS_clustering = DBSCAN(eps=eps, min_samples=n_min).fit(std_df)
    labels = DBS_clustering.labels_

    # Count the clusters, excluding noise points (labelled -1)
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    no_of_clusters.append(n_clusters)

    # The silhouette score needs at least 2 clusters. Note that the labels are
    # passed unfiltered, so noise points (-1) are scored as a group of their own.
    # `sil` is the alias under which silhouette_score is imported.
    if n_clusters > 1:
        sil_score.append(sil(std_df, labels))
    else:
        sil_score.append(-1)  # sentinel: no valid clustering for this pair

# Collect the results in one frame for inspection
results_df = pd.DataFrame(DBSCAN_params, columns=['eps', 'min_samples'])
results_df['no_of_clusters'] = no_of_clusters
results_df['silhouette_score'] = sil_score

# Display the first few rows
results_df.head()
        

In [None]:
# Heatmap of the number of clusters found for every (Eps, Min_samples) pair.
tmp = (
    pd.DataFrame.from_records(DBSCAN_params, columns=['Eps', 'Min_samples'])
    .assign(No_of_clusters=no_of_clusters)
)
pivot_1 = tmp.pivot_table(values='No_of_clusters', index='Min_samples', columns='Eps')

fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(pivot_1, annot=True, annot_kws={"size": 16}, cmap="YlGnBu", ax=ax)
ax.set_title('Number of clusters')
print('A heatplot below shows how many clusters were genreated by the algorithm for the respective parameters combinations.')
plt.show()

In [None]:
# Heatmap of the silhouette score recorded for every (Eps, Min_samples) pair.
tmp = (
    pd.DataFrame.from_records(DBSCAN_params, columns=['Eps', 'Min_samples'])
    .assign(Sil_score=sil_score)
)
pivot_1 = tmp.pivot_table(values='Sil_score', index='Min_samples', columns='Eps')

fig, ax = plt.subplots(figsize=(18, 6))
sns.heatmap(pivot_1, annot=True, annot_kws={"size": 10}, cmap="YlGnBu", ax=ax)
plt.show()

In [None]:

import pandas as pd
import seaborn as sns

# Pick the (eps, min_samples) pair with the highest average silhouette score.
#
# The search runs on the `std_df` that is already in scope. It must not be
# replaced with synthetic make_blobs samples here: the fitted labels are later
# written into `df`, so `std_df` has to keep one row per row of `df`.

# Parameter ranges
epsilon = [0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.25, 2.5, 2.75]
min_samples = list(range(3, 23))  # 3, 4, ..., 22

sil_avg = []
max_value = [0, 0, 0, 0]  # [epsilon, min_samples, n_clusters, best_sil_score]

for eps, n_min in product(epsilon, min_samples):
    labels = DBSCAN(eps=eps, min_samples=n_min).fit(std_df).labels_

    # Number of clusters, excluding noise (labelled -1)
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    # The silhouette score is only defined for more than one cluster
    if n_clusters_ <= 1:
        continue

    silhouette_avg = sil(std_df, labels)
    sil_avg.append(silhouette_avg)
    if silhouette_avg > max_value[3]:
        max_value = [eps, n_min, n_clusters_, silhouette_avg]

# Print the best parameters and silhouette score
if max_value[3] > 0:
    print("Best DBSCAN Parameters:")
    print("epsilon =", max_value[0])
    print("min_samples =", max_value[1])
    print("number of clusters =", max_value[2])
    print("average silhouette score = %.4f" % max_value[3])
else:
    # max_value still holds its all-zero placeholder, which is not a usable setting
    print("No parameter combination produced more than one cluster with a positive silhouette score.")

      

In [None]:
# Fit the final DBSCAN model with the best (eps, min_samples) found by the
# silhouette search, stored in max_value as
# [epsilon, min_samples, n_clusters, best_sil_score].
# If the search found no valid configuration (score still at its initial 0),
# fall back to the previously hard-coded eps=2.5, min_samples=21.
if max_value[3] > 0:
    best_eps, best_min_samples = max_value[0], max_value[1]
else:
    best_eps, best_min_samples = 2.5, 21

dbscan = DBSCAN(eps=best_eps, min_samples=best_min_samples)
dbscan.fit(std_df)

In [None]:
# Cluster label of every clustered row, as stored on the fitted model.
dbscan.labels_

In [None]:
# Attach the DBSCAN label of each row to df (assigned by position).
# NOTE(review): this requires std_df, which dbscan was fit on, to have exactly
# one row per row of df; confirm std_df was built from this df.
df['DBSCAN_labels'] = dbscan.labels_
df.head()

In [None]:
# Mean of every column within each DBSCAN label.
df.groupby('DBSCAN_labels').agg(['mean'])

In [None]:
# Bar chart of how many rows fall under each DBSCAN label
# (counted through the non-null entries of the Balance column).
fig, ax = plt.subplots(figsize=(10, 6))
rows_per_label = df.groupby('DBSCAN_labels')['Balance'].count()
rows_per_label.plot(kind='bar', ax=ax)
ax.set_title('DBSCAN Clustering Standard Scaled Data', fontsize='large', fontweight='bold')
ax.set_xlabel('Clusters', fontsize='large', fontweight='bold')
ax.set_ylabel('ID counts', fontsize='large', fontweight='bold')
plt.yticks(fontsize=15)
plt.xticks(fontsize=15)
plt.show()