In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk '/kaggle/input' and print the full path of every file found there.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

1. <a name="0"></a><a href="#1">Read the dataset</a>
2. <a href="#2">Data investigation</a>
3. <a href="#3">Data preprocessing </a>
4. <a href="#4">Features transformation </a>
5. <a href="#5">PCA Vs kernel PCA</a>
6. <a href="#6">K means</a>
7. <a href="#7">Hierarchical Clustering</a>
8. <a href="#8">dbscan</a>
9. <a href="#9">IsolationForest</a>
10. <a href="#10">GMM</a>
11. <a href="#11">Comparison</a>
12. <a href="#12">chosen algo with t-sne</a>






In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import davies_bouldin_score
from sklearn.manifold import TSNE
import plotly.express as px
import seaborn as sns



warnings.filterwarnings("ignore")

## 1. <a name="1">Read the dataset</a>
(<a href="#0">Go to top</a>)



In [None]:
# Load the CSV from the Kaggle input directory into `df`.
DATA_PATH = "/kaggle/input/ccdata/CC GENERAL.csv"
df = pd.read_csv(DATA_PATH)

# Report the (rows, columns) shape, then preview the first five rows.
print('The shape of the dataset is:', df.shape)
df.head(5)

## 2. <a name="2">Data investigation</a>
(<a href="#0">Go to top</a>)

in this part you need to check the data quality and assess any issues in the data as:
- null values in each column 
- each column has the proper data type
- outliers
- duplicate rows
- distribution for each column (skewness)
<br>

**comment each issue you find** 

In [None]:
# Let's see the data types and non-null values for each column
# (used to spot columns with missing values or an unexpected dtype).
df.info()

In [None]:
# Percentage of missing values in each column, rounded to two decimals.
round(df.isnull().sum(axis=0)*100/df.shape[0],2)

In [None]:
# Basic statistics for the numerical columns, transposed so that every
# dataset column becomes one row of the summary table.
df.describe().T

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:

# Keep every column whose dtype is not `object`.
numerical_features = [
    column for column, dtype in df.dtypes.items() if dtype != 'object'
]

# One histogram per selected column, laid out on a 6 x 3 grid.
df[numerical_features].hist(layout=(6, 3), figsize=(20, 20), bins=15);


In [None]:
# Correlation heatmap of the numeric columns.
# The frame still contains the non-numeric CUST_ID column here, and
# DataFrame.corr() raises on non-numeric data from pandas 2.0 on, so the
# numeric columns are selected explicitly. The matrix is computed once and reused.
corr = df.select_dtypes(include="number").corr()

fig, ax = plt.subplots(figsize=(20, 20))

# Mask the upper triangle (including the diagonal): the matrix is symmetric,
# so the lower triangle already carries all the information.
mask = np.triu(np.ones_like(corr, dtype=bool))
heatmap = sns.heatmap(corr, mask=mask, vmin=-1, vmax=1, annot=True, cmap='Greens', ax=ax)
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':21}, pad=16);

## 3. <a name="3">Data preprocessing</a>
(<a href="#0">Go to top</a>)


### Define below all the issues that you had found in the previous part
1-CUST_ID  is categorical datatype (suitable to be the index)<br>
2- null values          <br>
3- Normalize numerical values          <br>

In [None]:
# Work on a copy so the frame loaded from disk (`df`) is left untouched by
# the preprocessing steps below.
df_copy=df.copy()

### for each issue adapt this methodology 
- start by defining the solution
- apply this solution on the data
- test the solution to make sure that you have solved the issue

**First issue**:
CUST_ID is a categorical identifier, so it is better suited to be the index.

In [None]:
# Use CUST_ID as the index. The guard keeps the cell re-runnable: once the
# column has become the index, a second set_index('CUST_ID') would raise a
# KeyError. Reassigning (instead of inplace=True) also avoids in-place mutation.
if df_copy.index.name != 'CUST_ID':
    df_copy = df_copy.set_index('CUST_ID')


**Second issue** :fill the null values with the means

In [None]:
# Solution: replace the missing values of each affected column with that
# column's mean.
# The result is assigned back to the frame. Calling fillna(inplace=True) on a
# column selection is chained assignment: it is deprecated in pandas 2.x and,
# with copy-on-write, leaves df_copy unchanged.
for column in ("CREDIT_LIMIT", "MINIMUM_PAYMENTS"):
    df_copy[column] = df_copy[column].fillna(df_copy[column].mean())

In [None]:
# Test: percentage of missing values per column after the imputation above.
# CREDIT_LIMIT and MINIMUM_PAYMENTS should now report 0.0.
round(df_copy.isnull().sum(axis=0)*100/df_copy.shape[0],2)

## 4. <a name="4">Features transformation</a>
(<a href="#0">Go to top</a>)


for more details on different methods for scaling check these links
- https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing
- https://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing
- https://www.analyticsvidhya.com/blog/2020/07/types-of-feature-transformation-and-scaling/

Answer here:

In [None]:
# Copy of the cleaned frame, the list of its columns, and the selection of
# those columns. `features` is the input that each scaler below is fitted on;
# `col_names` is used to write the scaled values back column by column.
df_scaled = df_copy.copy()
col_names =df_scaled.columns
features = df_scaled[col_names]


### Standard Scaler


In [None]:
from sklearn.preprocessing import StandardScaler

# Fit a StandardScaler on the feature values and keep the scaled result in
# its own frame, leaving df_copy unscaled.
scaler = StandardScaler()
standardized_values = scaler.fit_transform(features.values)

df_Standard_Scaler = df_copy.copy()
df_Standard_Scaler[col_names] = standardized_values
df_Standard_Scaler.describe().T


### Robust Scaler

In [None]:
from sklearn.preprocessing import RobustScaler

# Fit a RobustScaler on the feature values and keep the scaled result in its
# own frame, leaving df_copy unscaled.
# (A stray bare `df_Standard_Scaler` expression, which did nothing in the
# middle of the cell, was removed.)
scaler = RobustScaler()

df_RobustScaler = df_copy.copy()
df_RobustScaler[col_names] = scaler.fit_transform(features.values)
df_RobustScaler.describe()

### MinMax Scaler

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit a MinMaxScaler on the feature values and keep the scaled result in its
# own frame, leaving df_copy unscaled.
scaler = MinMaxScaler()
minmax_values = scaler.fit_transform(features.values)

df_MinMaxScaler = df_copy.copy()
df_MinMaxScaler[col_names] = minmax_values
df_MinMaxScaler.describe()


## 5. <a name="5">PCA Vs kernel PCA</a>
(<a href="#0">Go to top</a>)

In [None]:
# PCA() without n_components keeps every principal component.
pca = PCA()
# Project the standardized data onto the principal components.
df_pca = pca.fit_transform(df_Standard_Scaler)

# fit_transform returns an (n_samples, n_components) array, so the first two
# components are columns 0 and 1. Indexing df_pca[0] / df_pca[1] selects the
# first two *rows* (two samples) and plotted them against each other.
fig, ax = plt.subplots()
ax.scatter(df_pca[:, 0], df_pca[:, 1], s=5)
ax.set(title='PCA: first two components', xlabel='Component 1', ylabel='Component 2')
plt.show()

In [None]:
# Two-component kernel PCA with a linear kernel on the standardized data.
kernel_pca = KernelPCA(n_components=2,kernel='linear')
kernel_pca_df = kernel_pca.fit_transform(df_Standard_Scaler)

# The result is an (n_samples, 2) array: the components are its columns.
# Indexing kernel_pca_df[0] / kernel_pca_df[1] selects the first two *rows*
# (two samples) and plotted them against each other.
fig, ax = plt.subplots()
ax.scatter(kernel_pca_df[:, 0], kernel_pca_df[:, 1], s=5)
ax.set(title='Kernel PCA (linear): first two components', xlabel='Component 1', ylabel='Component 2')
plt.show()

## 6. <a name="6">K Means</a>
(<a href="#0">Go to top</a>)

 3. Use elbow method


In [None]:
# 3. Use elbow method
# Fit KMeans for k = 1..9 on the standardized data and record the inertia
# (within-cluster sum of squares) for each k.
cluster_range = range(1, 10)
inertia_list = []

for k in cluster_range:
    # Fixed seed: with n_init=1 every fit starts from a single random
    # initialisation, so an unseeded curve changes from run to run.
    kmean_skl = KMeans(n_clusters=k, n_init=1, max_iter=200, random_state=42)
    kmean_skl.fit(df_Standard_Scaler)
    inertia_list.append(kmean_skl.inertia_)

plt.plot(cluster_range, inertia_list, marker='o')
plt.xlabel('Num_of Clusters')
plt.ylabel('Distortion')
plt.show()

In [None]:
def kmean(df, n_clusters=4, max_iter=100, random_state=None):
    """Cluster ``df`` with KMeans and return one cluster label per row.

    Parameters
    ----------
    df : array-like of shape (n_samples, n_features)
        Data to cluster.
    n_clusters : int, default 4
        Number of clusters to form.
    max_iter : int, default 100
        Maximum number of iterations of a single KMeans run.
    random_state : int or None, default None
        Seed forwarded to KMeans. ``None`` keeps the previous unseeded
        behaviour; pass an integer for reproducible labels.

    Returns
    -------
    numpy.ndarray
        The fitted model's ``labels_``.
    """
    model = KMeans(n_clusters=n_clusters, max_iter=max_iter, random_state=random_state)
    # fit_predict fits the model and returns its labels in one pass. The
    # previous code called fit() and then fit_predict(), clustering twice.
    return model.fit_predict(df)

## 7. <a name="7">Hierarchical Clustering</a>
(<a href="#0">Go to top</a>)


In [None]:
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as shc

def cluster_Hierarchical(df, n_clusters=4):
    """Cluster ``df`` with agglomerative clustering and return one label per row.

    Parameters
    ----------
    df : array-like of shape (n_samples, n_features)
        Data to cluster.
    n_clusters : int, default 4
        Number of clusters to find.

    Returns
    -------
    numpy.ndarray
        The fitted model's ``labels_``.
    """
    # Euclidean distance is the estimator's default, so it is not passed
    # explicitly. The keyword was renamed from `affinity` to `metric` in
    # scikit-learn 1.2 and `affinity` was removed in 1.4; omitting it works
    # on both old and new versions.
    model = AgglomerativeClustering(n_clusters=n_clusters)
    return model.fit_predict(df)
"""
#linkage='complete'
plt.figure(figsize=(10, 7))
plt.title("Counters Dendograms")
dend = shc.dendrogram(shc.linkage(y=df_pca , method='complete',metric='euclidean')) """

## 8. <a name="8">dbscan</a>
(<a href="#0">Go to top</a>)


In [None]:
from sklearn.cluster import DBSCAN
def dbscan(df, eps=8, min_samples=4):
    """Cluster ``df`` with DBSCAN and return one label per row.

    Parameters
    ----------
    df : array-like of shape (n_samples, n_features)
        Data to cluster.
    eps : float, default 8
        Neighbourhood radius passed to DBSCAN.
    min_samples : int, default 4
        Minimum neighbourhood size passed to DBSCAN.

    Returns
    -------
    numpy.ndarray
        The fitted model's ``labels_`` (scikit-learn labels noise points -1).
    """
    # The unused local copy of the labels was dropped.
    return DBSCAN(eps=eps, min_samples=min_samples).fit(df).labels_


# Run DBSCAN on the robust-scaled data and summarise the result.
n = dbscan(df_RobustScaler)
unique, counts = np.unique(n, return_counts=True)
# Print the label -> row-count mapping. As a bare expression in the middle of
# the cell the dict was built but never displayed (same for `n.shape`).
print(dict(zip(unique, counts)))
print('Silhoutte score of dbscan is ' , silhouette_score(df_RobustScaler,n))

In [None]:
from sklearn.cluster import DBSCAN
# dbscan() is already defined in the previous cell, so it is not redefined
# here (a second definition would silently shadow the first).
n = dbscan(df_RobustScaler)
unique, counts = np.unique(n, return_counts=True)
print('Silhoutte score of dbscan is ' , silhouette_score(df_RobustScaler,n))
# Label -> number of rows carrying that label (displayed as the cell output).
dict(zip(unique, counts))


## 9. <a name="9">IsolationForest</a>
(<a href="#0">Go to top</a>)


In [None]:
from sklearn.ensemble import IsolationForest 
def IForest(df):
    """Fit an IsolationForest on ``df`` and return its prediction for every row.

    The model is created with default settings, fitted on ``df`` and then
    asked to predict on the same data; the resulting array is returned.
    """
    detector = IsolationForest()
    detector.fit(df)
    return detector.predict(df)

## 10. <a name="10">EM</a>
(<a href="#0">Go to top</a>)


In [None]:
from sklearn import mixture
def gmm(df):
    """Fit a 3-component Gaussian mixture on ``df`` and return a label per row.

    The mixture uses full covariance matrices, random initialisation and at
    most 100 EM iterations; the labels are the model's predictions on ``df``.
    """
    model = mixture.GaussianMixture(
        n_components=3,
        covariance_type="full",
        max_iter=100,
        init_params="random",
    )
    return model.fit(df).predict(df)


## 11. <a name="11">Comparison </a>
#### (K_means,EM,Hierarchical Clustering, IsolationForest,dbscan)
PCA Vs kernel PCA 

MinMaxScaler Vs RobustScaler Vs Standard_Scaler

using the Silhouette score and the Davies-Bouldin score 

(<a href="#0">Go to top</a>)

#### First, let's see the scores of these algorithms with PCA and different data transformations
* df_MinMaxScaler
* df_RobustScaler 
* df_Standard_Scaler

In [None]:
# Compare the clustering algorithms on the PCA projection of each scaled frame.
# One loop replaces three copy-pasted blocks that differed only in the input frame.
scaled_frames = {
    "MinMaxScaler": df_MinMaxScaler,
    "RobustScaler": df_RobustScaler,
    "Standard_Scaler": df_Standard_Scaler,
}
metrics = (
    ("Silhoutte", silhouette_score),
    ("davies bouldin", davies_bouldin_score),
)

for position, (scaler_name, scaled) in enumerate(scaled_frames.items()):
    df_pca = pca.fit_transform(scaled)

    # Same fitting order as before: kmean, Hierarchical, EM, dbscan, IsolationForest.
    labels_kmean = kmean(df_pca)
    labels_cluster_Hierarchical = cluster_Hierarchical(df_pca)
    gmm_labels = gmm(df_pca)
    cluster_dbscan_labels = dbscan(df_pca)
    lforest_labels = IForest(df_pca)

    labels_by_algo = {
        "kmean": labels_kmean,
        "Hierarchical": labels_cluster_Hierarchical,
        "EM": gmm_labels,
        "IsolationForest": lforest_labels,
        "dbscan": cluster_dbscan_labels,
    }

    print(("\n" if position else "") + f"scores of pca with {scaler_name}")
    for metric_name, metric in metrics:
        for algo, labels in labels_by_algo.items():
            # Both scores need at least two distinct labels. Previously the
            # dbscan score was simply commented out for MinMaxScaler; this
            # guard reports the case instead of crashing or hiding it.
            if np.unique(labels).size < 2:
                print(f'{metric_name} score of {algo} is not defined (single label)')
                continue
            print(f'{metric_name} score of {algo} is ' , metric(df_pca, labels))




#### Second, let's see the scores of these algorithms with kernel PCA and different data transformations
* df_MinMaxScaler
* df_RobustScaler 
* df_Standard_Scaler

In [None]:
# Compare the clustering algorithms on the kernel-PCA projection of each
# scaled frame. One loop replaces three copy-pasted blocks that differed only
# in the input frame.
scaled_frames = {
    "MinMaxScaler": df_MinMaxScaler,
    "RobustScaler": df_RobustScaler,
    "Standard_Scaler": df_Standard_Scaler,
}
metrics = (
    ("Silhoutte", silhouette_score),
    ("davies bouldin", davies_bouldin_score),
)

for position, (scaler_name, scaled) in enumerate(scaled_frames.items()):
    df_pca = kernel_pca.fit_transform(scaled)

    # Same fitting order as before: kmean, Hierarchical, EM, dbscan, IsolationForest.
    labels_kmean = kmean(df_pca)
    labels_cluster_Hierarchical = cluster_Hierarchical(df_pca)
    gmm_labels = gmm(df_pca)
    cluster_dbscan_labels = dbscan(df_pca)
    lforest_labels = IForest(df_pca)

    labels_by_algo = {
        "kmean": labels_kmean,
        "Hierarchical": labels_cluster_Hierarchical,
        "EM": gmm_labels,
        "IsolationForest": lforest_labels,
        "dbscan": cluster_dbscan_labels,
    }

    print(("\n\n" if position else "") + f"scores of kernel_pca with {scaler_name}")
    for metric_index, (metric_name, metric) in enumerate(metrics):
        if metric_index:
            # Blank line between the Silhouette and the Davies-Bouldin section.
            print()
        for algo, labels in labels_by_algo.items():
            # Both scores need at least two distinct labels. Previously the
            # dbscan score was simply commented out for MinMaxScaler; this
            # guard reports the case instead of crashing or hiding it.
            if np.unique(labels).size < 2:
                print(f'{metric_name} score of {algo} is not defined (single label)')
                continue
            print(f'{metric_name} score of {algo} is ' , metric(df_pca, labels))





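> These results show that **kernel PCA** scores better. Kernel PCA can capture more complex data patterns that are not visible under linear transformations alone; note, however, that `kernel_pca` here uses `kernel='linear'` with `n_components=2` while `pca` keeps all components, so part of the difference may come from the number of retained components rather than from the kernel.

>These results also show that the **RobustScaler** is better for this data.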

>The **Silhouette** score is used to study the separation distance between the resulting clusters: **kmean with kernel_pca and RobustScaler shows the best score, 0.73** (higher values indicate better clustering).

>The **Davies-Bouldin** score is defined as the average similarity measure of each cluster with its most similar cluster, where similarity is the ratio of within-cluster distances to between-cluster distances. Thus, clusters which are farther apart and less dispersed will result in a better score. **Hierarchical Clustering with kernel_pca and RobustScaler** shows the best score, **0.67**, followed by K-means (lower values indicate better clustering).



### For the  anomaly detection algorithm
>DBSCAN shows the best results:
* dbscan with kernel_pca and Standard_Scaler get davies bouldin score of dbscan = 0.08825825697115738
* dbscan with kernel_pca and RobustScaler get Silhoutte score = 0.929918036046784

## 12. <a name="12">chosen algo with t-sne</a>
(<a href="#0">Go to top</a>)
>The algorithms with the best score shown in the last section I will visualize it using t-sne


### 1- kmean with   RobustScaler 

In [None]:
# Embed the robust-scaled features in 2-D with t-SNE, then cluster the embedding.
# The "cluster" column written below is excluded from the input, so re-running
# this cell does not feed the (string) labels of a previous run into t-SNE.
feature_columns = df_RobustScaler.columns.drop("cluster", errors="ignore")
tsne = TSNE(n_components=2).fit_transform(df_RobustScaler[feature_columns])
labels_kmean = kmean(tsne)
df_RobustScaler["cluster"] = labels_kmean.astype(str)

# One row of per-cluster histograms for every feature. The loop iterates over
# the feature columns only; iterating over the whole frame also drew a
# histogram of the label column itself.
for c in feature_columns:
    grid = sns.FacetGrid(df_RobustScaler, col='cluster')
    grid.map(plt.hist, c)

In [None]:
# Human-readable description for each KMeans cluster label.
# NOTE(review): KMeans label numbers are arbitrary and can change between
# runs unless the clustering is seeded, so these descriptions have to be
# re-checked against the histograms above after every re-run.
cluster_descriptions = {
    "0": "People mostly doesn't pay by Cash in Advance mostly Purchase in installment and Purchase Frequently",
    "1": "People  with high Purchase Frequently and use all types of payments ",
    "2": "People mostly pay by Cash in Advance with high palance and less Purchase Frequently",
    "3": "People with less Purchases and mostly doesn't Purchase in installment",
}
# One description per row; a dict lookup replaces the if/elif chain, which
# silently skipped unknown labels and would have left `cl` shorter than the data.
cl = [cluster_descriptions[label] for label in df_RobustScaler['cluster']]

# x and y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
sns.scatterplot(x=tsne[:,0], y=tsne[:,1], hue=cl, s=10, palette="Set2")

### 2- Hierarchical Clustering with   RobustScaler 

In [None]:
# Cluster the t-SNE embedding hierarchically and colour each point by its cluster.
labels_cluster_Hierarchical = cluster_Hierarchical(tsne)
# x and y are passed by keyword (seaborn 0.12+ rejects positional x/y), and the
# labels go through `hue` so that the "Set2" palette is actually applied;
# with `c=` the palette argument had no effect.
sns.scatterplot(x=tsne[:,0], y=tsne[:,1], hue=labels_cluster_Hierarchical, s=10, palette="Set2")

### 3- dbscan with   RobustScaler

In [None]:
# Run DBSCAN on the t-SNE embedding and colour each point by its label.
cluster_dbscan_labels = dbscan(tsne)
tsne_x, tsne_y = tsne[:, 0], tsne[:, 1]
plt.scatter(tsne_x, tsne_y, s=100, c=cluster_dbscan_labels)

### 4- dbscan with Standard_Scaler

In [None]:
# t-SNE embedding of the standard-scaled data, clustered with DBSCAN and
# coloured by the resulting label.
embedder = TSNE(n_components=2)
tsne_ss = embedder.fit_transform(df_Standard_Scaler)
cluster_dbscan_labels = dbscan(tsne_ss)
plt.scatter(tsne_ss[:, 0], tsne_ss[:, 1], s=10, c=cluster_dbscan_labels)

>#### The last two DBSCAN graphs show that DBSCAN with RobustScaler doesn't detect any anomalies, as RobustScaler is robust to outliers; that is why it gets a high Silhouette score. DBSCAN with Standard Scaler, on the other hand, does show anomalies.