In [41]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, OneHotEncoder, StandardScaler, PolynomialFeatures, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedShuffleSplit, ShuffleSplit
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA, TruncatedSVD
from kmodes.kprototypes import KPrototypes
import matplotlib.pyplot as plt
import os

df = pd.read_csv("data.csv")

X = df

cat_var = ['state', 'id', 'zipcode', 'city', "last_purchase"]
num_var = ['total', 'freq', 'no_purchase_period', 'recense', 'montant']

In [42]:
cat_transformer = OneHotEncoder(handle_unknown='ignore')

num_transformer = StandardScaler()

f_encoder = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_var),
        ('num', num_transformer, num_var)
    ]
)

encoder = ColumnTransformer(
    transformers=[
        #('cat', cat_transformer, cat_var),
        ('num', num_transformer, num_var)
    ]
)

pipeline = Pipeline([
    ('enc', encoder),
])

f_pipeline = Pipeline([
    ('enc', f_encoder),
])

In [43]:
best = {}
k = 5
model = KPrototypes(n_clusters=k, init="Cao", verbose=2, n_jobs=3, n_init=5, max_iter=50, random_state=42)

df_nor = df.copy()
df_nor[num_var] = pipeline.fit_transform(df_nor[num_var])

test = list(df.columns.get_indexer_for(cat_var))
clusters = model.fit(df, categorical=test)
best[k] = model.cost_

Initialization method and algorithm are deterministic. Setting n_init to 1.
Init: initializing centroids
Init: initializing centroids
Init: initializing centroids
Init: initializing clusters
Init: initializing clusters
Init: initializing clusters
Starting iterations...
Init: initializing centroids
Starting iterations...
Init: initializing clusters
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 3, iteration: 1/50, moves: 28769, ncost: 2866613199.644403
Run: 2, iteration: 1/50, moves: 28338, ncost: 2222439215.8615856
Run: 1, iteration: 1/50, moves: 29063, ncost: 2376834816.109366
Run: 3, iteration: 2/50, moves: 16366, ncost: 2422722393.755911
Run: 2, iteration: 2/50, moves: 12223, ncost: 2034067844.6901484
Run: 1, iteration: 2/50, moves: 16127, ncost: 2090057110.0359616
Run: 2, iteration: 3/50, moves: 5315, ncost: 1966699973.041371
Run: 3, iteration: 3/50, moves: 11282, ncost: 2186474034.8762355
Run: 1, iteration: 3/50, moves: 10059, ncost: 194076715

In [44]:
df["label"] = model.labels_

In [51]:
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn.preprocessing import StandardScaler

features = list(df.drop('label', axis=1).columns)

pca = PCA(n_components=2)
components = pca.fit_transform(f_pipeline.fit_transform(df.drop("label", axis=1)))

loadings = pca.components_.T * np.sqrt(pca.explained_variance_)

fig = px.scatter(components, x=0, y=1, color=df['label'])

for i, feature in enumerate(features):
    fig.add_annotation(
        ax=0, ay=0,
        axref="x", ayref="y",
        x=loadings[i, 0],
        y=loadings[i, 1],
        showarrow=True,
        arrowsize=2,
        arrowhead=2,
        xanchor="right",
        yanchor="top"
    )
    fig.add_annotation(
        x=loadings[i, 0],
        y=loadings[i, 1],
        ax=0, ay=0,
        xanchor="center",
        yanchor="bottom",
        text=feature,
        yshift=5,
    )
fig.show()

TypeError: PCA does not support sparse input. See TruncatedSVD for a possible alternative.

In [53]:
f_pipeline.fit_transform(df)

<95407x201873 sparse matrix of type '<class 'numpy.float64'>'
	with 954070 stored elements in Compressed Sparse Row format>

In [59]:
f_pipeline.fit_transform(df)

AttributeError: to_dense not found