In [4]:
!pip install yellowbrick
!pip install xlrd
!pip install autoviz

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [7]:
# NOTE(review): yellowbrick is already installed by the first cell; this repeat looks redundant.
!pip install yellowbrick


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import silhouette_samples, silhouette_score, davies_bouldin_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA, KernelPCA
from sklearn.manifold import TSNE
from yellowbrick.cluster import KElbowVisualizer
import plotly.graph_objects as go
import pandas_profiling as pp
from autoviz.AutoViz_Class import AutoViz_Class




## Get Data

In [None]:
# Mount Google Drive at /content/drive; the dataset is read from that mount below.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Location of the raw dataset on the mounted Drive.
data_path = '/content/drive/MyDrive/clustering project/data.csv'
df = pd.read_csv(data_path)

## Overview Data

In [None]:
# Column names, dtypes and non-null counts.
df.info()

**Note**:
As seen above, all features in the data set are numerical except "CUST_ID", which is categorical. This feature contributes nothing to clustering because it is only the customer's ID, so we will remove it from the data set.

In [None]:
# Number of fully duplicated rows.
df.duplicated().sum()

In [None]:
# Missing-value count per column.
df.isna().sum()

**Note**: There are no duplicated rows, but there are null values in two columns: "CREDIT_LIMIT" and "MINIMUM_PAYMENTS".





In [None]:
# Summary statistics of the numeric columns, to compare their scales.
df.describe()

**Note**: As seen above, the features in the data set are on different scales and therefore have to be normalized.

In [None]:
# Generate AutoViz's automatic exploratory charts for the in-memory frame.
# The return value is kept in its own variable so the working `df` that the
# later cells clean and cluster is never replaced by AutoViz's output.
AV = AutoViz_Class()
df_autoviz = AV.AutoViz(
    filename="",          # empty: the data is supplied through `dfte`
    sep=',',
    dfte=df,
    header=0,
    verbose=1,
    lowess=False,
    chart_format='svg',
)

In [None]:
# Build the pandas-profiling report for the frame; as the last expression of the cell it is displayed as output.
pp.ProfileReport(df)

**Conclusion from overview section:**


1.   We must impute the missing data.
2.   The data needs to be normalized.
3.   We need to rename the "ONEOFF_PURCHASES" column.
4.   We need to drop the "CUST_ID" column.



## Preprocessing Data

In [None]:
# CUST_ID is only an identifier, so it is removed before clustering.
df = df.drop(columns='CUST_ID')

In [None]:
# Give the one-off purchases column an underscore-separated name.
column_renames = {'ONEOFF_PURCHASES': 'ONE_OFF_PURCHASES'}
df = df.rename(columns=column_renames)

#### Impute missing data

In [None]:
# Box plot of MINIMUM_PAYMENTS to expose outliers before imputing.
# The series is passed as `x=`: positional data arguments are deprecated in
# seaborn, and recent releases read a positional argument as `data`, not `x`.
sns.boxplot(x=df['MINIMUM_PAYMENTS'])

In [None]:
# Distribution of MINIMUM_PAYMENTS as a density histogram with a KDE overlay.
# histplot replaces the deprecated sns.distplot and is the same plotting call
# that plt_hist uses further down in this notebook.
sns.histplot(df['MINIMUM_PAYMENTS'], kde=True, stat='density')

**Note**
- The data is skewed: a large number of data points act as outliers. Outliers have a significant impact on the mean, so in such cases it is not recommended to use the mean to replace missing values.

- When the data is skewed, it is better to use the median to replace missing values.
- Therefore, we will use the median to fill the NaN values in the MINIMUM_PAYMENTS column.

In [None]:
# Fill missing MINIMUM_PAYMENTS with the median, which is robust to the
# heavy right skew seen above. The result is assigned back instead of calling
# fillna(inplace=True) on the selected column: that chained in-place form is
# deprecated in pandas 2.x and leaves `df` unchanged under copy-on-write.
minimum_payments_median = df['MINIMUM_PAYMENTS'].median()
df['MINIMUM_PAYMENTS'] = df['MINIMUM_PAYMENTS'].fillna(minimum_payments_median)

In [None]:
# Distribution of CREDIT_LIMIT as a density histogram with a KDE overlay.
# histplot replaces the deprecated sns.distplot and is the same plotting call
# that plt_hist uses further down in this notebook.
sns.histplot(df['CREDIT_LIMIT'], kde=True, stat='density')

In [None]:
# Box plot of CREDIT_LIMIT to inspect its outliers.
# The series is passed as `x=`: positional data arguments are deprecated in
# seaborn, and recent releases read a positional argument as `data`, not `x`.
sns.boxplot(x=df['CREDIT_LIMIT'])

In [None]:
# Drop the rows with a missing CREDIT_LIMIT, then rebuild a contiguous
# 0..n-1 index. Without the reset the frame keeps gaps in its index, and any
# later index-aligned join (such as concatenating the cluster labels
# column-wise) pairs rows with the wrong labels.
df = df.dropna(subset=["CREDIT_LIMIT"]).reset_index(drop=True)

### Scaling Data

In [None]:
def plt_hist(df, n_cols=4):
    """Plot a histogram with a KDE overlay for every column of ``df``.

    The subplot grid is sized from the number of columns so that no feature
    is silently left out; a fixed 4x4 grid can show at most 16 columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted, one column per subplot.
    n_cols : int, default 4
        Number of subplots per row.
    """
    features = list(df.columns)
    if not features:
        return  # nothing to plot

    n_rows = -(-len(features) // n_cols)  # ceiling division
    # 5 x 3.75 inches per subplot reproduces the former (20, 15) size for a 4x4 grid
    fig, ax = plt.subplots(
        n_rows, n_cols, figsize=(5 * n_cols, 3.75 * n_rows), squeeze=False
    )
    axes = ax.flatten()
    for feature, subplot in zip(features, axes):
        h = sns.histplot(df[feature], bins = 50, ax = subplot,kde = True)
        # red vertical line marks the mean of the feature
        h.axvline(x = df[feature].mean(),color = 'red')

    # hide the grid cells that have no feature to show
    for subplot in axes[len(features):]:
        subplot.set_visible(False)

    plt.tight_layout()

In [None]:
# Distributions of the features before scaling.
plt_hist(df)

**Note**: 
- Almost all columns are right-skewed, so we need to scale our features.
- The features are roughly Gaussian but have many outliers, which skew them, so the Power Transformer is the most suitable scaler for our features.

In [None]:
# Apply PowerTransformer (default settings) to every feature and rebuild a
# DataFrame with the original row and column labels. Building a new frame
# avoids `X_scaled[:] = ...`, which writes the float results into the existing
# columns and warns or fails for any integer-typed column in recent pandas.
X_scaled = pd.DataFrame(
    PowerTransformer().fit_transform(df),
    columns=df.columns,
    index=df.index,
)
plt_hist(X_scaled)

## Dimensionality Reduction

In [None]:
# Fit a full PCA to see how the explained variance accumulates.
# n_components=None keeps every component, so the cell does not depend on
# the frame having exactly 17 columns.
pca = PCA(n_components=None)
pca.fit(X_scaled)
variance = pca.explained_variance_ratio_

# Cumulative explained variance, in percent.
var = np.cumsum(np.round(variance,3)*100)
plt.figure(figsize=(12,6))
plt.ylabel('%variance Explained')
plt.xlabel('# of Features')
plt.title('PCA Analysis')
plt.ylim(0,100.5)

# Plot against 1..n so the x axis reads as a component count; plotting `var`
# alone would start the axis at 0 and shift every point by one.
plt.plot(np.arange(1, len(var) + 1), var);

In [None]:
# Keep as many principal components as needed to explain 95% of the variance.
pca_95 = PCA(n_components=0.95)
X_pca = pca_95.fit_transform(X_scaled)
X_pca.shape

In [None]:
# Project the scaled features onto 9 RBF kernel principal components.
kpca = KernelPCA(n_components=9, kernel='rbf')
X_kpca = kpca.fit_transform(X_scaled)

## **Clustering**

In [None]:
# Plot the silhouette diagram side by side with the clustering result in t-SNE space

def silhouette_plot(model,range_n_clusters,X_clustering,X_TSNE,**kwargs):
    """Draw a silhouette plot and a 2-D cluster scatter for each cluster count.

    For every ``n_clusters`` in ``range_n_clusters`` the function fits
    ``model(n_clusters, **kwargs)`` on ``X_clustering``, prints the average
    silhouette score and draws one figure with two panels: the per-sample
    silhouette values grouped by cluster (left) and ``X_TSNE`` coloured by
    the predicted labels (right).

    Parameters
    ----------
    model : class
        Clustering estimator class. It is called as
        ``model(n_clusters, **kwargs)`` and must provide ``fit_predict``.
    range_n_clusters : iterable of int
        Cluster counts to evaluate; one figure is produced per value.
    X_clustering : array-like
        Data the estimator is fitted on and the silhouette is computed from.
    X_TSNE : numpy.ndarray
        2-D array for the scatter panel; columns 0 and 1 are plotted.
    **kwargs
        Extra keyword arguments forwarded to ``model`` (for example
        ``random_state`` to make the runs repeatable).
    """
    for n_clusters in range_n_clusters:
        # One figure per cluster count: silhouette plot (left), scatter (right)
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        # Silhouette coefficients range over [-1, 1]; the axis shows [-0.1, 1].
        ax1.set_xlim([-0.1, 1])
        # The (n_clusters+1)*10 is for inserting blank space between silhouette
        # plots of individual clusters, to demarcate them clearly.
        ax1.set_ylim([0, len(X_clustering) + (n_clusters + 1) * 10])

        # Build the clusterer for this n_clusters. No seed is set here; pass
        # one through **kwargs (e.g. random_state=...) for repeatable results.
        clusterer = model(n_clusters,**kwargs)
        cluster_labels = clusterer.fit_predict(X_clustering)

        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the formed
        # clusters
        silhouette_avg = silhouette_score(X_clustering, cluster_labels)
        print(
            "For n_clusters =",
            n_clusters,
            "The average silhouette_score is :",
            silhouette_avg,
        )

        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(X_clustering, cluster_labels)

        y_lower = 10
        for i in range(n_clusters):
            # Aggregate the silhouette scores for samples belonging to
            # cluster i, and sort them
            ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]

            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            # `cm` is reached through pyplot: the notebook never imports
            # matplotlib.cm on its own, so a bare `cm` raises NameError.
            color = plt.cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(
                np.arange(y_lower, y_upper),
                0,
                ith_cluster_silhouette_values,
                facecolor=color,
                edgecolor=color,
                alpha=0.7,
            )

            # Label the silhouette plots with their cluster numbers at the middle
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Compute the new y_lower for next plot
            y_lower = y_upper + 10  # 10 for the 0 samples

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # The vertical line for average silhouette score of all the values
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd plot: the 2-D points coloured by their predicted cluster
        sns.scatterplot(
            x = X_TSNE[:, 0],
            y = X_TSNE[:, 1],
            hue=cluster_labels,
            palette=sns.color_palette("colorblind", len(set(cluster_labels))),
            alpha=0.3,
            legend="full",
            ax = ax2
        )
        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

        plt.suptitle(
            f"Silhouette analysis for {model.__name__} clustering on sample data with n_clusters = {n_clusters:d}",
            fontsize=14,
            fontweight="bold",
        )

    # Render all figures once every cluster count has been processed
    plt.show()

In [None]:
# 2-D t-SNE embedding of the kernel-PCA features; silhouette_plot uses it for the cluster scatter.
# NOTE(review): `n_iter` was renamed `max_iter` in scikit-learn 1.5 - adjust if the installed version rejects it.
tsne = TSNE(
    n_components=2,
    perplexity=3,
    init='pca',
    method='barnes_hut',
    n_jobs=2,
    n_iter=10**4,
    random_state=0,
)
X_TSNE = tsne.fit_transform(X_kpca)

In [None]:
# Elbow plot of KMeans on the kernel-PCA features for the k range (1, 12).
# random_state is fixed so the curve is the same on every run.
kpca_model = KMeans(random_state=0)
visualizer = KElbowVisualizer(kpca_model, k=(1,12))
visualizer.fit(X_kpca)
visualizer.show();

In [None]:
# Silhouette analysis for 4, 5 and 6 clusters. random_state is forwarded to
# KMeans so the plots can be reproduced from a fresh kernel.
silhouette_plot(KMeans,range(4,7),X_kpca,X_TSNE,random_state=0)

In [None]:
# Final model: 5 clusters on the kernel-PCA features, seeded so the cluster
# numbering used by the following cells is stable across runs.
km = KMeans(n_clusters=5, random_state=0)
km.fit(X_kpca)

In [None]:
# Attach each row's cluster label to the unscaled features.
# `assign` pairs the labels with the rows by position. pd.concat(axis=1)
# aligns on the index instead, which mislabels rows and adds NaN-padded rows
# whenever df's index is not exactly 0..n-1 (e.g. after rows were dropped).
clusters = df.assign(CLUSTER=km.labels_)
clusters.head()

In [None]:
# Share of rows that fall into each cluster.
cluster_sizes = clusters['CLUSTER'].value_counts()
cluster_sizes.plot.pie(autopct='%1.0f%%', pctdistance=0.7, labeldistance=1.1)


In [None]:
# Columns to total per cluster.
columns = [
    'BALANCE',
    'PURCHASES',
    'ONE_OFF_PURCHASES',
    'INSTALLMENTS_PURCHASES',
    'CASH_ADVANCE',
    'CREDIT_LIMIT',
    'PAYMENTS',
    'MINIMUM_PAYMENTS',
]
# Sum every selected column within each cluster, then keep the first five rows.
per_cluster_totals = clusters.groupby('CLUSTER')[columns].sum()
clusters_results = per_cluster_totals.iloc[0:5]

In [None]:
# Grouped bar chart: one group per cluster, one bar per column of clusters_results.
# The x positions come from the result's own index (the cluster labels).
# A hard-coded np.arange(1,7) supplies six positions for five clusters and
# relabels cluster 0 as 1, cluster 1 as 2, and so on.
layout = go.Layout(title="Purchace Behaviour of each Cluster", xaxis=dict(title="Clusters"))
figure = go.Figure(
    data=[
        go.Bar(name=column, x=clusters_results.index, y=clusters_results[column].values)
        for column in clusters_results.columns
    ],
    layout=layout,
)

figure.show()