 ## Import Data and Perform Basic Data Exploration

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering, KMeans
from scipy.cluster.hierarchy import dendrogram, linkage

In [13]:
# Load the dataset
df = pd.read_csv(r"C:\Users\LENOVO\Downloads\Credit_card_dataset.csv")

In [14]:

# Display the first few rows of the dataset
print(df.head())

# Get basic information about the dataset
print(df.info())

# Summary statistics
print(df.describe())

# Check for missing values
print(df.isnull().sum())

# Check for duplicates
print(df.duplicated().sum())

  CUST_ID  BALANCE_FREQUENCY  PURCHASES     PAYMENTS  CREDIT_LIMIT  \
0  C10001           0.818182      95.40   201.802084        1000.0   
1  C10002           0.909091       0.00  4103.032597        7000.0   
2  C10003           1.000000     773.17   622.066742        7500.0   
3  C10004           0.636364    1499.00     0.000000        7500.0   
4  C10005           1.000000      16.00   678.334763        1200.0   

   CASH_ADVANCE  
0      0.000000  
1   6442.945483  
2      0.000000  
3    205.788017  
4      0.000000  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8950 entries, 0 to 8949
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CUST_ID            8950 non-null   object 
 1   BALANCE_FREQUENCY  8950 non-null   float64
 2   PURCHASES          8950 non-null   float64
 3   PAYMENTS           8950 non-null   float64
 4   CREDIT_LIMIT       8949 non-null   float64
 5   CASH_ADVANCE       8950 

In [15]:
## Data Preparation

In [18]:
# Handle missing values (if any)
df.fillna(df.median(), inplace=True)

TypeError: Cannot convert [['C10001' 'C10002' 'C10003' ... 'C19188' 'C19189' 'C19190']] to numeric

In [None]:
# Encode categorical variables (if any)
# For example, if 'CUST_ID' is categorical, we can drop it as it's just an identifier
df.drop('CUST_ID', axis=1, inplace=True)

# Handle outliers (optional, depending on the dataset)
# For example, using the IQR method to detect and handle outliers
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
df = df[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]

# Standardize the data (important for clustering)
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)

In [None]:
## Hierarchical Clustering

In [None]:
# Select the features for clustering
X = df[['PURCHASES', 'CREDIT_LIMIT']]

# Perform hierarchical clustering
linked = linkage(X, method='ward')

# Plot the dendrogram
plt.figure(figsize=(10, 7))
dendrogram(linked, orientation='top', distance_sort='descending', show_leaf_counts=True)
plt.title('Dendrogram')
plt.xlabel('Customers')
plt.ylabel('Euclidean distances')
plt.show()

# Fit the hierarchical clustering model
cluster = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='ward')
df['Cluster'] = cluster.fit_predict(X)

# Plot the clusters
plt.figure(figsize=(10, 7))
sns.scatterplot(x='PURCHASES', y='CREDIT_LIMIT', hue='Cluster', data=df, palette='viridis')
plt.title('Hierarchical Clustering')
plt.show()

In [None]:
## Partitional Clustering (K-Means)

In [None]:
# Perform K-Means clustering
kmeans = KMeans(n_clusters=3, random_state=42)
df['Cluster'] = kmeans.fit_predict(X)

# Plot the clusters
plt.figure(figsize=(10, 7))
sns.scatterplot(x='PURCHASES', y='CREDIT_LIMIT', hue='Cluster', data=df, palette='viridis')
plt.title('K-Means Clustering')
plt.show()

In [None]:
# Elbow Method to find the optimal number of clusters
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

# Plot the Elbow Method graph
plt.figure(figsize=(10, 7))
plt.plot(range(1, 11), wcss, marker='o', linestyle='--')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

# Based on the Elbow Method, choose the best k value (e.g., k=3)
kmeans = KMeans(n_clusters=3, random_state=42)
df['Cluster'] = kmeans.fit_predict(X)

# Plot the clusters with the best k value
plt.figure(figsize=(10, 7))
sns.scatterplot(x='PURCHASES', y='CREDIT_LIMIT', hue='Cluster', data=df, palette='viridis')
plt.title('K-Means Clustering with Optimal K')
plt.show()

Interpret the Results
Hierarchical Clustering: The dendrogram helps us understand the structure of the data and how clusters are formed. The scatter plot shows how customers are grouped based on their purchasing behavior and credit limit.

K-Means Clustering: The Elbow Method helps us determine the optimal number of clusters. The scatter plot with the optimal k value shows the final grouping of customers.

Interpretation: The clusters can be interpreted based on the features used. For example, customers with high purchases and high credit limits might be in one cluster, while those with low purchases and low credit limits might be in another. This can help in targeted marketing strategies.