In [25]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ccdata/CC GENERAL.csv


In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
df = pd.read_csv('/kaggle/input/ccdata/CC GENERAL.csv')

In [None]:
df.head(10)

In [None]:
df.shape

In [None]:
from pandas_profiling import ProfileReport

ProfileReport(df).to_notebook_iframe()

In [None]:
# Download the report: write the profiling report to report.html (relative path)
from pandas_profiling import ProfileReport

profile = ProfileReport(df, title="Pandas Profiling Report")
profile.to_file("report.html")

In [None]:
# Delete the irrelevant features.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: running it again after the columns are already gone no longer
# raises KeyError.
df = df.drop(columns=['CUST_ID', 'TENURE'], errors='ignore')

In [None]:
# See Null Values
df.isnull().sum().sort_values(ascending=False)

In [None]:
from sklearn.impute import KNNImputer

# Columns with missing values
null_columns = df.columns[df.isnull().any()].tolist()

# Fill null values with a 5-nearest-neighbours estimate computed on the
# incomplete columns. The guard makes the cell safe to re-run: once nothing is
# missing, null_columns is empty and KNNImputer cannot be fitted on zero features.
if null_columns:
    imputer = KNNImputer(n_neighbors=5)
    # fillna aligns on index labels, so the imputed frame must carry df's own
    # index (a default RangeIndex only matches a frame whose rows were never filtered).
    df_imp = pd.DataFrame(
        imputer.fit_transform(df[null_columns]),
        columns=null_columns,
        index=df.index,
    )
    df = df.fillna(df_imp)

In [None]:
# Kernel density estimate of every column, one panel per column on a 9 x 2 grid
fig = plt.figure(figsize=(20, 35))

for position, column_name in enumerate(df.columns, start=1):
    panel = fig.add_subplot(9, 2, position)
    sns.kdeplot(df[column_name], ax=panel)
    panel.set_xlabel(column_name)

plt.show()

In [None]:
# Feature Scaling
# NOTE: Normalizer rescales each sample (row) to unit L2 norm; it does not
# standardize the individual features (columns).
from sklearn.preprocessing import Normalizer

scaler = Normalizer(norm='l2')

In [None]:
# Reduce dimensions
from sklearn.decomposition import PCA

pca = PCA(n_components=2, random_state=1234)

In [None]:
# Preprocessing
from sklearn.pipeline import Pipeline

# Pipeline
pipe_normalizedscaled_pca = Pipeline([('scaling', scaler), ('pca', pca)])

In [None]:
# Transformed dataset
pd.DataFrame(
    pipe_normalizedscaled_pca.fit_transform(df),
    columns=['x', 'y']
)

In [None]:
# Libs
from sklearn.cluster import KMeans
import plotly.express as px
import seaborn as sns

# Function
def Visualize_Cluster(df, pipeline, n_clusters):
    '''
    Project the data with the pipeline, cluster the projection with KMeans and show it as a scatter plot

        Parameters:
                df (pandas.core.frame.DataFrame): Data passed to the pipeline's fit_transform
                pipeline (sklearn.pipeline.Pipeline): Transformation whose output is labelled 'x' and 'y'
                n_clusters (int): Number of clusters requested from KMeans

        Returns:
                None
    '''

    # The two transformed columns become the plot axes
    projected = pd.DataFrame(pipeline.fit_transform(df), columns=['x', 'y'])

    model = KMeans(n_clusters=n_clusters, n_init=10, max_iter=300, verbose=False, random_state=1234)

    # Attach the predicted label of each row as a third column
    plotted = projected.assign(Cluster=model.fit_predict(projected))

    px.scatter(plotted, x='x', y='y', color='Cluster').show()

In [None]:
Visualize_Cluster(df, pipe_normalizedscaled_pca, n_clusters=5)

In [None]:
Visualize_Cluster(df, pipe_normalizedscaled_pca, n_clusters=10)

In [None]:
# Preprocess the data

# Feature Scaling
from sklearn.preprocessing import Normalizer
# Pipeline
from sklearn.pipeline import Pipeline
# Reduce dimensions
from sklearn.decomposition import PCA

pipe_normalizedscaled_pca = Pipeline([('scaling', Normalizer(norm='l2')), ('pca', PCA(n_components=2, random_state=1234))])
pipe_normalized = Pipeline([('scaling', Normalizer(norm='l2'))])

In [None]:
# Preprocess the data
# Feature Scaling
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import plotly.express as px
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

In [None]:
# Pipeline
# Candidate preprocessing pipelines: three scalers, each followed by a 2-component PCA,
# plus pipe_normalized, which only applies Normalizer (no dimensionality reduction).
pipe_standardscaled_pca = Pipeline([('scaling', StandardScaler()), ('pca', PCA(n_components=2, random_state=1234))])
pipe_minmaxscaled_pca = Pipeline([('scaling', MinMaxScaler()), ('pca', PCA(n_components=2, random_state=1234))])
pipe_normalizedscaled_pca = Pipeline([('scaling', Normalizer()), ('pca', PCA(n_components=2, random_state=1234))])
pipe_normalized = Pipeline([('scaling', Normalizer())])

def Vizualize_Cluster(df, pipeline, n_clusters=6):
    '''
    Transform the data, fit KMeans on the result, show a scatter plot of the clusters and print the scores

        Parameters:
                df (pandas.core.frame.DataFrame): Dataframe that will be used in the Pipeline and train the KMeans cluster
                pipeline (sklearn.pipeline.Pipeline): Transform the Dataframe; its output needs at least two columns
                n_clusters (int): Number of clusters for KMeans (defaults to 6, the value that used to be hard-coded)

        Returns:
                None

        Raises:
                ValueError: If the transformed data has fewer than two columns
    '''

    data = pd.DataFrame(pipeline.fit_transform(df))

    n_dims = data.shape[1]
    if n_dims < 2:
        raise ValueError(f'The pipeline must output at least 2 columns to be plotted, got {n_dims}')

    # The first two columns are the plotted axes. A pipeline without a 2-component PCA
    # (e.g. scaling only) returns one column per input feature, so the remaining columns
    # get generated names instead of failing on a fixed two-name list.
    data.columns = ['x', 'y'] + [f'dim_{i}' for i in range(2, n_dims)]

    kmeans = KMeans(n_clusters=n_clusters, n_init=10, max_iter=300, verbose=False, random_state=1234)
    # KMeans is fitted on every transformed column, not only on the two that are plotted.
    # The labels stay a 1-D array, which is what the scoring functions expect.
    labels = kmeans.fit_predict(data)

    clusters_data = data.assign(Cluster=labels)

    fig = px.scatter(clusters_data, x='x', y='y', color='Cluster')
    fig.show()

    # NOTE(review): a later cell rebinds the name `metrics` to a DataFrame; this call
    # relies on `metrics` still being the scoring function when it runs.
    metrics(data, labels)
    

def metrics(data, labels):
    '''
    Print the silhouette, Davies-Bouldin and Calinski-Harabasz scores of a clustering

        Parameters:
                data: Samples that were clustered
                labels: Cluster label assigned to each sample

        Returns:
                None
    '''
    # Each score is computed right before it is printed, in this order
    scorers = (
        ('Silhouette: ', lambda: silhouette_score(data, labels, metric='euclidean')),
        ('Davies Bouldin: ', lambda: davies_bouldin_score(data, labels)),
        ('Calinski Harabasz: ', lambda: calinski_harabasz_score(data, labels)),
    )
    for caption, compute in scorers:
        print(caption, compute())
    print('\n')
    
    
# print('StandardScaled and PCA: ')
# Vizualize_Cluster(df, pipe_standardscaled_pca)
#print('MinMaxScaled and PCA: ')
#Vizualize_Cluster(df, pipe_minmaxscaled_pca)
#print('Normalized and PCA: ')
#Vizualize_Cluster(df, pipe_normalizedscaled_pca)
print('Normalized: ')
Vizualize_Cluster(df, pipe_normalized)

# Find the ideal number of clusters

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

pipe_normalizer = Pipeline([('normalization', Normalizer())])

In [None]:
# Normalized copy of df that keeps the original column labels
data = pd.DataFrame(pipe_normalizer.fit_transform(df), columns=df.columns)

def cluster_algorithm(n_clusters, dataset):
    '''
    Fit KMeans on the dataset and score the resulting clustering

        Parameters:
                n_clusters (int): Number of clusters requested from KMeans
                dataset: Samples to cluster (one row per sample)

        Returns:
                tuple: (silhouette, davies_bouldin, calinski_harabasz) scores of the fitted labels
    '''
    # n_init is set explicitly, like every other KMeans in this notebook, so the
    # result does not depend on the default of the installed scikit-learn version.
    kmean = KMeans(n_clusters=n_clusters, n_init=10, random_state=1234)
    labels = kmean.fit_predict(dataset)

    si = silhouette_score(dataset, labels, metric='euclidean', random_state=1234)
    db = davies_bouldin_score(dataset, labels)
    ch = calinski_harabasz_score(dataset, labels)

    return si, db, ch


# Score KMeans on the normalized data for every cluster count from 2 to 100
n_clusters = list(range(2, 101))
scores_per_k = [cluster_algorithm(k, data) for k in n_clusters]
si_list, db_list, ch_list = (list(column) for column in zip(*scores_per_k))

# NOTE: this rebinds the name `metrics`, which an earlier cell defined as a function
metrics = pd.DataFrame(
    np.column_stack((n_clusters, si_list, db_list, ch_list)),
    columns=['N° Clusters', 'silhouette', 'davies_bouldin', 'calinski_harabasz'],
)

In [None]:
import plotly.express as px
# Silhouette
fig = px.line(metrics, x='N° Clusters', y='silhouette', title='Silhouette')
fig.show()

In [None]:
fig = px.line(metrics, x='N° Clusters', y='davies_bouldin', title='Davies Bouldin')
fig.show()

In [None]:
fig = px.line(metrics, x='N° Clusters', y='calinski_harabasz', title='Calinski Harabasz')
fig.show()

In [None]:
# A notebook cell only renders its last expression, so the rankings are passed to
# display() explicitly; as bare expressions the first three tables were discarded.
up_to_8_clusters = metrics[metrics['N° Clusters'] <= 8]

# Silhouette: sorted from highest to lowest
display(up_to_8_clusters[['N° Clusters', 'silhouette']].sort_values(by='silhouette', ascending=False))

# Davies-Bouldin: sorted from lowest to highest
display(up_to_8_clusters[['N° Clusters', 'davies_bouldin']].sort_values(by='davies_bouldin', ascending=True))

# Calinski-Harabasz: sorted from highest to lowest
display(up_to_8_clusters[['N° Clusters', 'calinski_harabasz']].sort_values(by='calinski_harabasz', ascending=False))

# n_cluster = 5
metrics[metrics['N° Clusters'] == 5].reset_index(drop=True)

In [None]:
# Baseline: uniform random values with the same shape as the normalized dataset.
# A seeded generator makes the baseline scores reproducible between runs, and
# taking the shape from `data` keeps the baseline in sync with the real dataset.
rng = np.random.default_rng(1234)
dataset = rng.random(data.shape)

si, db, ch = cluster_algorithm(5, dataset)

# Same column labels as the `metrics` table ('calinski_harabasz' was misspelled here)
pd.DataFrame(data=[[5, si, db, ch]], columns=['N° Clusters', 'silhouette', 'davies_bouldin', 'calinski_harabasz'])

In [None]:
# Show the scores for 5 clusters on the real data next to the scores on the random baseline.
# The baseline table uses the same column labels as `metrics`
# ('calinski_harabasz' was misspelled as 'calinsk_harabasz').
print('Metrics of the cluster with a normal dataset:')
display(metrics[metrics['N° Clusters'] == 5].reset_index(drop=True))
print('Metrics of the cluster with a random dataset:')
display(pd.DataFrame(data=[[5, si, db, ch]], columns=['N° Clusters', 'silhouette', 'davies_bouldin', 'calinski_harabasz']))

In [None]:
# Stability check: score KMeans (5 clusters) separately on five consecutive slices of the rows.
# The slices are taken from the normalized frame `data`, the same input used for every other
# score in this notebook, so the numbers are comparable; the raw `df` values are on a
# different scale.
# Row positions are split and then selected with iloc, which avoids calling np.array_split
# directly on a DataFrame.
n_sections = 5
row_groups = np.array_split(np.arange(len(data)), n_sections)
splited_df = [data.iloc[rows] for rows in row_groups]

section, si_list, db_list, ch_list = list(), list(), list(), list()

for i, part in enumerate(splited_df, start=1):
    si, db, ch = cluster_algorithm(5, part)
    section.append(i)
    si_list.append(si)
    db_list.append(db)
    ch_list.append(ch)


# Building the frame from a dict keeps the section numbers as integers
pd.DataFrame({
    'Section N°': section,
    'silhouette': si_list,
    'davies_bouldin': db_list,
    'calinski_harabasz': ch_list,
})

In [None]:
# Final model: KMeans with 5 clusters fitted on the normalized frame `data`
kmeans = KMeans(n_clusters=5, n_init=10, max_iter=300, verbose=False, random_state=1234)
kmeans.fit(data)
labels = pd.DataFrame({'CLUSTER': kmeans.labels_})

# Attach each row's cluster label to the original (unnormalized) features
clusters_data = pd.concat([df, labels], axis=1)
clusters_data

In [None]:
clusters_data['CLUSTER'].value_counts()

In [None]:
fig = px.scatter(clusters_data, x='PURCHASES', y='PAYMENTS', color='CLUSTER')
fig.show()

In [None]:
fig = px.scatter(clusters_data, x='PURCHASES', y='CASH_ADVANCE', color='CLUSTER')
fig.show()

In [None]:
sns.pairplot(clusters_data, hue='CLUSTER', palette='bright')

In [None]:
clusters_data.groupby(by='CLUSTER').describe()

In [None]:
centroids = kmeans.cluster_centers_
centroids

In [None]:
# Variance of each feature's centroid coordinate across the clusters
for feature_name, coordinates in zip(clusters_data.columns, centroids.T):
    print(f'{feature_name}:', f'{coordinates.var():.4f}')

In [None]:
# One variance per feature: how much that feature's centroid coordinate differs between clusters
centroids_list = [centroids[:, position].var() for position in range(centroids.shape[1])]

# The last column of clusters_data is the cluster label, so it is left out of the index
centroids_var = pd.DataFrame(
    centroids_list,
    index=clusters_data.columns[:-1],
    columns=['Variance of the Centroids'],
)
centroids_var

In [None]:
# Keep only the features whose centroid variance is greater than 0.009.
# NOTE(review): 0.009 is a hard-coded cut-off with no stated rationale — confirm how it was chosen.
significant_centroids_var = centroids_var[centroids_var['Variance of the Centroids'] > 0.009]
significant_centroids_var

In [None]:
from scipy import stats

stats.probplot(clusters_data['BALANCE'], plot=plt)
plt.show()

The dataset summarizes each account's usage over the last 6 months. Suppose that, at the start of those 6 months, the amount available in the account is VALUE,
where: VALUE = PURCHASES + BALANCE.
PURCHASES is then the amount of money the account spent,
and BALANCE is the money left over from VALUE.

In [None]:
balance = significant_centroids_var.index[0]

clusters_data[[balance, 'CLUSTER']].groupby(by='CLUSTER').describe()

CLUSTER 0: 

CLUSTER 1: sobrou uma boa quantidade para comprar

CLUSTER 2: maior valor que sobrou para comprar

CLUSTER 3: menor valor que sobrou para comprar

CLUSTER 4: sobrou uma quantidade mediana para comprar

In [None]:
purchases = significant_centroids_var.index[1]

clusters_data[[purchases, 'CLUSTER']].groupby(by='CLUSTER').describe()

CLUSTER 0: foco deles é comprar, tem muito dinheiro 

CLUSTER 1: compram uma boa quantidade e ainda sobra uma boa quantidade para comprar

CLUSTER 2: compram muito pouco, tem muito dinheiro

CLUSTER 3: compram uma quantidade considerável em compras, não tem muito dinheiro

CLUSTER 4: compram pouquíssimo, tem uma quantidade considerável de dinheiro

In [None]:
cash_advance = significant_centroids_var.index[2]

clusters_data[[cash_advance, 'CLUSTER']].groupby(by='CLUSTER').describe()

CLUSTER 0: quase não paga antecipadamente

CLUSTER 1: paga pouco antecipadamente

CLUSTER 2: bom valor que pagam antecipadamente

CLUSTER 3: quase não paga antecipadamente

CLUSTER 4: maior valor que pagam antecipadamente

In [None]:
credit_limit = significant_centroids_var.index[3]

clusters_data[[credit_limit, 'CLUSTER']].groupby(by='CLUSTER').describe()

CLUSTER 0: limite alto

CLUSTER 1: menor limite

CLUSTER 2: limite alto

CLUSTER 3: maior limite

CLUSTER 4: limite mediano

In [None]:
payments = significant_centroids_var.index[4]

clusters_data[[payments, 'CLUSTER']].groupby(by='CLUSTER').describe()

CLUSTER 0: pagam uma boa quantidade de dinheiro, são os que mais tendem a pagar a fatura completa

CLUSTER 1: pagam uma quantidade considerável, porém quase não pagam o que gastam

CLUSTER 2: pagam pouco, quase não pagam o que gastam 

CLUSTER 3: o que menos paga, porém até que paga o que gasta

CLUSTER 4: o que mais pagam, tendem a pagar toda fatura

CLUSTER 0: foco deles é gastar, quase não paga antecipadamente porém os que mais pagam completa, limite alto: os melhores clientes

CLUSTER 1: sobra uma boa quantidade para comprar, pagam uma quantidade considerável, paga pouco antecipadamente, menor limite

CLUSTER 2: tem dinheiro mas não gasta, bom valor que pagam antecipadamente porém quase não pagam a fatura inteira, limite alto: tem mais caloteiros

CLUSTER 3: não tem muito dinheiro, porém gastam uma boa parte desse dinheiro que não é muito, quase não paga antecipadamente, maior limite, o que menos pagam porém até que paga o que gasta: os piores clientes

CLUSTER 4: possuem uma quantidade considerável de dinheiro porém gastam pouquíssimo, maior valor que pagam antecipadamente e são os melhores pagadores, limite mediano: clientes com alto potencial

In [None]:
clusters_data[['PRC_FULL_PAYMENT', 'CLUSTER']].groupby(by='CLUSTER').describe()

In [None]:
# Feature at position 5 of the high-variance selection.
# NOTE(review): the variable name assumes this position holds MINIMUM_PAYMENTS — verify
# against the contents of significant_centroids_var.
minimum_payments = significant_centroids_var.index[5]

# The lookup previously used the misspelled name `minimu_payments`, which raised NameError
clusters_data[[minimum_payments, 'CLUSTER']].groupby(by='CLUSTER').describe()

CLUSTER 0: pagamento mínimo baixo

CLUSTER 1: pagamento mínimo extremamente alto

CLUSTER 2: pagamento mínimo bom

CLUSTER 3: o menor pagamento mínimo

CLUSTER 4: pagamento mínimo mediano

In [None]:
clusters_data['CLUSTER'].value_counts()

http://benalexkeen.com/feature-scaling-with-scikit-learn/

https://www.quora.com/Which-one-is-better-before-clustering-standardization-or-normalization

https://stats.stackexchange.com/questions/183236/what-is-the-relation-between-k-means-clustering-and-pca

https://machinelearningmastery.com/a-gentle-introduction-to-normality-tests-in-python/

https://datascience.stackexchange.com/questions/76930/the-impact-of-using-different-scaling-strategy-with-clustering