# Trials notebook

First trials consumption.

<a name="installs"></a>
## Installs

In [None]:
!pip uninstall helpers -y

In [None]:
!pip install git+https://github.com/Xmaster6y/ML-Engineer@develop 

In [None]:
!pip install kneed

<a name="imports"></a>
## Imports

In [None]:
import os
from copy import deepcopy
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from kneed import KneeLocator
from numpy.random import default_rng

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, KFold

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler, MinMaxScaler
from sklearn.impute import KNNImputer

from sklearn.dummy import DummyRegressor

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, BisectingKMeans, Birch
from sklearn.neighbors import NearestNeighbors

from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer

from sklearn.metrics import r2_score, d2_absolute_error_score

In [None]:
import helpers

In [None]:
# List the names exposed by the imported `helpers` module.
dir(helpers)

<a name="data-loading"></a>
## Data loading

In [None]:
cleaned_dataset_path = 'data_cleaned.csv'
if not os.path.exists(cleaned_dataset_path):
    !wget "https://drive.google.com/uc?export=download&id=14ehoYTN8BvRPCK-fOEc08CfBcms0gjFr" -q --show-progress -O "$cleaned_dataset_path"
!head -2 $cleaned_dataset_path

In [None]:
# Load the cleaned CSV into a DataFrame and print its column/dtype/null summary.
df_cleaned = pd.read_csv(cleaned_dataset_path)
df_cleaned.info()

## Initial comparison

In [None]:
# Algorithm labels forwarded to the comparison helper; the embedded "\n" is part of each label.
algorithms_to_take = [
    "KMeans",
    "Bisecting\nKMeans",
    "Agglomerative\nClustering",
    "DBSCAN",
    "BIRCH",
]
# Boolean mask with six entries — presumably one flag per toy dataset known to the helper (TODO confirm).
datasets_to_take = [True, False, True, True, False, True]

# Comparison plot produced by the project's `helpers` package.
helpers.plot.clustering.sklearn_comparison(
    algorithms_to_take=algorithms_to_take,
    datasets_to_take=datasets_to_take
)


## RFM

### Preprocessing

In [None]:
# RFM feature columns and their preprocessing: standard scaling only.
rfm_cols = ["recency", "frequency", "amount"]

rfm_tr = Pipeline(steps=[("scaler", StandardScaler())])
rfm_prep = ColumnTransformer(transformers=[("rfm_tr", rfm_tr, rfm_cols)])

In [None]:
# Fit the RFM preprocessing on the full cleaned frame and keep the transformed matrix.
X_rfm = rfm_prep.fit_transform(df_cleaned)

In [None]:
# Draw a reproducible subsample of N rows (without replacement) from the cleaned frame.
N = 8000
rng = default_rng(seed=42)
numbers = rng.choice(df_cleaned.shape[0], size=N, replace=False)
# `numbers` holds row positions in [0, n_rows), so they are selected with .iloc;
# .loc would interpret them as index labels and only works with a default RangeIndex.
# reset_index() keeps the original row labels in an "index" column.
df_cleaned_sub = df_cleaned.iloc[numbers].reset_index()
# Note: this re-fits the shared `rfm_prep` transformer on the subsample.
X_rfm_sub = rfm_prep.fit_transform(df_cleaned_sub)

### Agglomerative Clustering

#### Clustering

In [None]:
# Elbow search for agglomerative clustering on the RFM subsample (default scoring metric).
model = AgglomerativeClustering()
visualizer = KElbowVisualizer(model, k=(2,11), timings=False)

visualizer.fit(X_rfm_sub)
# `poof()` is deprecated since Yellowbrick 1.0; `show()` is its replacement.
visualizer.show()

In [None]:
# Same elbow search, scored with the Calinski-Harabasz metric.
model = AgglomerativeClustering()
visualizer = KElbowVisualizer(model, k=(2,11), metric='calinski_harabasz', timings=False)

visualizer.fit(X_rfm_sub)
# `poof()` is deprecated since Yellowbrick 1.0; `show()` is its replacement.
visualizer.show()

#### Cluster Analysis

In [None]:
# Two-cluster agglomerative segmentation of the subsample, wrapped with the RFM preprocessing
# and handed to the project's `cluster_analysis` helper together with the feature columns.
N_C = 2
model = AgglomerativeClustering(n_clusters=N_C)
pipe = Pipeline([
    ("prep", rfm_prep),
    ("clustering", model)
])
helpers.plot.clustering.cluster_analysis(df_cleaned_sub, pipe, N_C, rfm_cols)

In [None]:
# Five-cluster variant. Unlike the two-cluster cell, this passes the already scaled array and
# the bare model instead of a DataFrame + pipeline.
# NOTE(review): confirm `cluster_analysis` accepts a NumPy array as its first argument.
N_C = 5
model = AgglomerativeClustering(n_clusters=N_C)
helpers.plot.clustering.cluster_analysis(X_rfm_sub, model, N_C, rfm_cols)

### KMeans

#### Clustering

In [None]:
# Manual elbow curve: KMeans inertia on the full RFM matrix for every k in n_cluster.
n_cluster = range(2, 11)
inertia_value = []
for i in n_cluster:
    model = KMeans(n_clusters=i, n_init='auto').fit(X_rfm)
    inertia_value.append(model.inertia_)

# Single-axes figure showing inertia against the number of clusters.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(n_cluster, inertia_value, marker="o")
ax.set(xlabel="Number of Segments", ylabel="Inertia Value")
fig.tight_layout()

In [None]:
# Elbow search for KMeans on the full RFM matrix (default scoring metric).
model = KMeans(n_init='auto')
visualizer = KElbowVisualizer(model, k=(2,11), timings=False)

visualizer.fit(X_rfm)
# `poof()` is deprecated since Yellowbrick 1.0; `show()` is its replacement.
visualizer.show()

In [None]:
# Same KMeans elbow search, scored with the Calinski-Harabasz metric.
model = KMeans(n_init='auto')
visualizer = KElbowVisualizer(model, k=(2,11), metric='calinski_harabasz', timings=False)

visualizer.fit(X_rfm)
# `poof()` is deprecated since Yellowbrick 1.0; `show()` is its replacement.
visualizer.show()

In [None]:
# Silhouette plot for a 4-cluster KMeans on the full RFM matrix.
model = KMeans(4, n_init='auto')
visualizer = SilhouetteVisualizer(model)

visualizer.fit(X_rfm)
# `poof()` is deprecated since Yellowbrick 1.0; `show()` is its replacement.
visualizer.show()

#### Cluster Analysis

In [None]:
# Four-cluster KMeans analysed on the scaled RFM subsample (array + bare model).
# NOTE(review): the elbow/silhouette cells for KMeans used the full X_rfm — confirm the subsample is intended here.
N_C = 4
model = KMeans(N_C, n_init='auto')
helpers.plot.clustering.cluster_analysis(X_rfm_sub, model, N_C, rfm_cols)

In [None]:
# Three-cluster KMeans analysed on the scaled RFM subsample (array + bare model).
N_C = 3
model = KMeans(N_C, n_init='auto')
helpers.plot.clustering.cluster_analysis(X_rfm_sub, model, N_C, rfm_cols)

### Bisecting KMeans

#### Clustering

In [None]:
# Elbow search for BisectingKMeans on the full RFM matrix (default scoring metric).
model = BisectingKMeans()
visualizer = KElbowVisualizer(model, k=(2,11), timings=False)

visualizer.fit(X_rfm)
# `poof()` is deprecated since Yellowbrick 1.0; `show()` is its replacement.
visualizer.show()

In [None]:
# Same BisectingKMeans elbow search, scored with the Calinski-Harabasz metric.
model = BisectingKMeans()
visualizer = KElbowVisualizer(model, k=(2,11), metric='calinski_harabasz', timings=False)

visualizer.fit(X_rfm)
# `poof()` is deprecated since Yellowbrick 1.0; `show()` is its replacement.
visualizer.show()

#### Cluster Analysis

In [None]:
# Three-cluster BisectingKMeans behind the RFM preprocessing, analysed on the full frame.
# `frac=0.1` is forwarded to the helper — presumably a sampling fraction for plotting (TODO confirm).
N_C = 3
model = BisectingKMeans(N_C)
pipe = Pipeline([
    ("prep", rfm_prep),
    ("clustering", model)
])
helpers.plot.clustering.cluster_analysis(df_cleaned, pipe, N_C, rfm_cols, frac=0.1)

In [None]:
# Four-cluster BisectingKMeans behind the RFM preprocessing, analysed on the full frame.
N_C = 4
model = BisectingKMeans(N_C)
pipe = Pipeline([
    ("prep", rfm_prep),
    ("clustering", model)
])
helpers.plot.clustering.cluster_analysis(df_cleaned, pipe, N_C, rfm_cols, frac=0.1)

### Birch

#### Clustering

In [None]:
# Elbow search for Birch on the full RFM matrix (default scoring metric).
model = Birch()
visualizer = KElbowVisualizer(model, k=(2,11), timings=False)

visualizer.fit(X_rfm)
# `poof()` is deprecated since Yellowbrick 1.0; `show()` is its replacement.
visualizer.show()

In [None]:
# Same Birch elbow search, scored with the Calinski-Harabasz metric.
model = Birch()
visualizer = KElbowVisualizer(model, k=(2,11), metric='calinski_harabasz', timings=False)

visualizer.fit(X_rfm)
# `poof()` is deprecated since Yellowbrick 1.0; `show()` is its replacement.
visualizer.show()

#### Cluster Analysis

In [None]:
# Three-cluster Birch behind the RFM preprocessing, analysed on the full frame.
# `frac=0.1` is forwarded to the helper — presumably a sampling fraction for plotting (TODO confirm).
N_C = 3
model = Birch(n_clusters=N_C)
pipe = Pipeline([
    ("prep", rfm_prep),
    ("clustering", model)
])
helpers.plot.clustering.cluster_analysis(df_cleaned, pipe, N_C, rfm_cols, frac=0.1)

In [None]:
# Four-cluster Birch behind the RFM preprocessing, analysed on the full frame.
N_C = 4
model = Birch(n_clusters=N_C)
pipe = Pipeline([
    ("prep", rfm_prep),
    ("clustering", model)
])
helpers.plot.clustering.cluster_analysis(df_cleaned, pipe, N_C, rfm_cols, frac=0.1)

### DBSCAN

#### Clustering

In [None]:
# Parameter tuning for DBSCAN's eps on the RFM subsample (k-distance heuristic).
# The query set equals the fitted set, so each point is typically its own nearest neighbour:
# with n_neighbors=11, column 10 is the distance to the 10th other point.
nearest_neighbors = NearestNeighbors(n_neighbors=11)
neighbors = nearest_neighbors.fit(X_rfm_sub)
distances, indices = neighbors.kneighbors(X_rfm_sub)
distances = np.sort(distances[:,10], axis=0)

# Locate the knee of the sorted k-distance curve.
i = np.arange(len(distances))
knee = KneeLocator(i, distances, S=1, curve='convex', direction='increasing', interp_method='polynomial')
# plot_knee() opens its own figure, so the size is passed to it directly instead of
# creating a separate figure beforehand (which stayed empty).
knee.plot_knee(figsize=(5, 5))
plt.xlabel("Points")
plt.ylabel("Distance")
# Distance at the knee: the candidate eps.
print(distances[knee.knee])

In [None]:
# Parameter tuning for DBSCAN's eps on the full RFM matrix (k-distance heuristic).
# The query set equals the fitted set, so each point is typically its own nearest neighbour:
# with n_neighbors=11, column 10 is the distance to the 10th other point.
nearest_neighbors = NearestNeighbors(n_neighbors=11)
neighbors = nearest_neighbors.fit(X_rfm)
distances, indices = neighbors.kneighbors(X_rfm)
distances = np.sort(distances[:,10], axis=0)

# Locate the knee of the sorted k-distance curve.
i = np.arange(len(distances))
knee = KneeLocator(i, distances, S=1, curve='convex', direction='increasing', interp_method='polynomial')
# plot_knee() opens its own figure, so the size is passed to it directly instead of
# creating a separate figure beforehand (which stayed empty).
knee.plot_knee(figsize=(5, 5))
plt.xlabel("Points")
plt.ylabel("Distance")
# Distance at the knee: the candidate eps.
print(distances[knee.knee])

In [None]:
# Fit DBSCAN on the full scaled RFM matrix and keep the per-row labels in `y`.
# NOTE(review): eps=0.05 and min_samples=20 are hard-coded, while the k-distance cells
# looked at the 10th neighbour — confirm these are the intended values.
min_samples = 20
model = DBSCAN(eps=0.05, min_samples=min_samples)
y = model.fit_predict(X_rfm)

In [None]:
# Reproducible 1000-row sample of the scaled RFM features, paired with their DBSCAN labels.
N = 1000
rng = default_rng(seed=42)
numbers = rng.choice(X_rfm.shape[0], size=N, replace=False)

df_seg = pd.DataFrame(X_rfm[numbers], columns=rfm_cols).assign(cluster=y[numbers])
df_seg.head()

In [None]:
# Number of sampled rows per cluster label.
df_seg["cluster"].value_counts()

In [None]:
# Collapse every label above 0 into the single label 1; labels <= 0 are left untouched.
# Note: this mutates `df_seg` in place, so the value counts displayed earlier no longer match.
df_seg.loc[df_seg["cluster"]>0,"cluster"] = 1

#### Cluster Analysis

In [None]:
# Pairwise scatter plots of the sampled features, coloured by the (collapsed) cluster label.
sns.pairplot(df_seg, hue="cluster")

## Review

### Preprocessing

In [None]:
# Feature columns for the review segmentation: RFM plus the satisfaction score.
rev_cols = ["recency", "frequency", "amount", "least_satisfaction"]

In [None]:
# Standard-scale the review feature columns.
num_tr = Pipeline(steps=[("scaler", StandardScaler())])
rev_prep = ColumnTransformer(transformers=[("num", num_tr, rev_cols)])

In [None]:
# Fit the review preprocessing on the full cleaned frame and keep the transformed matrix.
X_rev = rev_prep.fit_transform(df_cleaned)

### KMeans

#### Clustering

In [None]:
# Elbow search for KMeans on the review features (default scoring metric).
model = KMeans(n_init=3)
visualizer = KElbowVisualizer(model, k=(2,11), timings=False)

visualizer.fit(X_rev)
# `poof()` is deprecated since Yellowbrick 1.0; `show()` is its replacement.
visualizer.show()

In [None]:
# Same KMeans elbow search on the review features, scored with the Calinski-Harabasz metric.
model = KMeans(n_init=3)
visualizer = KElbowVisualizer(model, k=(2,11), metric='calinski_harabasz', timings=False)

visualizer.fit(X_rev)
# `poof()` is deprecated since Yellowbrick 1.0; `show()` is its replacement.
visualizer.show()

#### Cluster Analysis

In [None]:
# Five-cluster KMeans behind the review preprocessing, analysed on the full frame.
# `frac=0.1` is forwarded to the helper — presumably a sampling fraction for plotting (TODO confirm).
N_C = 5
model = KMeans(n_clusters=N_C, n_init=3)
pipe = Pipeline([
    ("prep", rev_prep),
    ("clustering", model)
])
helpers.plot.clustering.cluster_analysis(df_cleaned, pipe, N_C, rev_cols, frac=0.1)

In [None]:
# Four-cluster KMeans behind the review preprocessing, analysed on the full frame.
N_C = 4
model = KMeans(n_clusters=N_C, n_init=3)
pipe = Pipeline([
    ("prep", rev_prep),
    ("clustering", model)
])
helpers.plot.clustering.cluster_analysis(df_cleaned, pipe, N_C, rev_cols, frac=0.1)

In [None]:
# Three-cluster KMeans behind the review preprocessing, analysed on the full frame.
N_C = 3
model = KMeans(n_clusters=N_C, n_init=3)
pipe = Pipeline([
    ("prep", rev_prep),
    ("clustering", model)
])
helpers.plot.clustering.cluster_analysis(df_cleaned, pipe, N_C, rev_cols, frac=0.1)

### Bisecting KMeans

#### Clustering

In [None]:
# Elbow search for BisectingKMeans on the review features (default scoring metric).
model = BisectingKMeans(n_init=3)
visualizer = KElbowVisualizer(model, k=(2,11), timings=False)

visualizer.fit(X_rev)
# `poof()` is deprecated since Yellowbrick 1.0; `show()` is its replacement.
visualizer.show()

In [None]:
# Same BisectingKMeans elbow search on the review features, scored with Calinski-Harabasz.
model = BisectingKMeans(n_init=3)
visualizer = KElbowVisualizer(model, k=(2,11), metric='calinski_harabasz', timings=False)

visualizer.fit(X_rev)
# `poof()` is deprecated since Yellowbrick 1.0; `show()` is its replacement.
visualizer.show()

#### Cluster Analysis

In [None]:
# Five-cluster BisectingKMeans behind the review preprocessing, analysed on the full frame.
# `frac=0.1` is forwarded to the helper — presumably a sampling fraction for plotting (TODO confirm).
N_C = 5
model = BisectingKMeans(n_clusters=N_C, n_init=3)
pipe = Pipeline([
    ("prep", rev_prep),
    ("clustering", model)
])
helpers.plot.clustering.cluster_analysis(df_cleaned, pipe, N_C, rev_cols, frac=0.1)

In [None]:
# Four-cluster BisectingKMeans behind the review preprocessing, analysed on the full frame.
N_C = 4
model = BisectingKMeans(n_clusters=N_C, n_init=3)
pipe = Pipeline([
    ("prep", rev_prep),
    ("clustering", model)
])
helpers.plot.clustering.cluster_analysis(df_cleaned, pipe, N_C, rev_cols, frac=0.1)

In [None]:
# Three-cluster BisectingKMeans behind the review preprocessing, analysed on the full frame.
N_C = 3
model = BisectingKMeans(n_clusters=N_C, n_init=3)
pipe = Pipeline([
    ("prep", rev_prep),
    ("clustering", model)
])
helpers.plot.clustering.cluster_analysis(df_cleaned, pipe, N_C, rev_cols, frac=0.1)

## Review + delay

### Preprocessing

In [None]:
# Feature columns for the review + delay segmentation.
rd_cols = ["recency", "amount", "least_satisfaction", "delay"]

In [None]:
# Standard-scale the review + delay feature columns.
num_tr = Pipeline(steps=[("scaler", StandardScaler())])
rd_prep = ColumnTransformer(transformers=[("num", num_tr, rd_cols)])

In [None]:
# Fit the review + delay preprocessing on the full cleaned frame and keep the transformed matrix.
X_rd = rd_prep.fit_transform(df_cleaned)

### KMeans

#### Clustering

In [None]:
# Elbow search for KMeans on the review + delay features (default scoring metric).
model = KMeans(n_init=3)
visualizer = KElbowVisualizer(model, k=(2,11), timings=False)

visualizer.fit(X_rd)
# `poof()` is deprecated since Yellowbrick 1.0; `show()` is its replacement.
visualizer.show()

In [None]:
# Same KMeans elbow search on the review + delay features, scored with Calinski-Harabasz.
model = KMeans(n_init=3)
visualizer = KElbowVisualizer(model, k=(2,11), metric='calinski_harabasz', timings=False)

visualizer.fit(X_rd)
# `poof()` is deprecated since Yellowbrick 1.0; `show()` is its replacement.
visualizer.show()

#### Cluster Analysis

In [None]:
# Four-cluster KMeans behind the review + delay preprocessing, analysed on the full frame.
# `frac=0.1` is forwarded to the helper — presumably a sampling fraction for plotting (TODO confirm).
N_C = 4
model = KMeans(n_clusters=N_C, n_init=3)
pipe = Pipeline([
    ("prep", rd_prep),
    ("clustering", model)
])
helpers.plot.clustering.cluster_analysis(df_cleaned, pipe, N_C, rd_cols, frac=0.1)

In [None]:
# Three-cluster KMeans behind the review + delay preprocessing, analysed on the full frame.
N_C = 3
model = KMeans(n_clusters=N_C, n_init=3)
pipe = Pipeline([
    ("prep", rd_prep),
    ("clustering", model)
])
helpers.plot.clustering.cluster_analysis(df_cleaned, pipe, N_C, rd_cols, frac=0.1)

## MSDQ

### Preprocessing

In [None]:
# Feature columns for the MSDQ segmentation.
rdq_cols = ["amount", "least_satisfaction", "delay", "quantity"]

In [None]:
# Standard-scale the MSDQ feature columns.
num_tr = Pipeline(steps=[("scaler", StandardScaler())])
rdq_prep = ColumnTransformer(transformers=[("num", num_tr, rdq_cols)])

In [None]:
# Fit the MSDQ preprocessing on the full cleaned frame and keep the transformed matrix.
X_rdq = rdq_prep.fit_transform(df_cleaned)

### KMeans

#### Clustering

In [None]:
# Elbow search for KMeans on the MSDQ features (default scoring metric).
model = KMeans(n_init=3)
visualizer = KElbowVisualizer(model, k=(2,11), timings=False)

visualizer.fit(X_rdq)
# `poof()` is deprecated since Yellowbrick 1.0; `show()` is its replacement.
visualizer.show()

In [None]:
# Same KMeans elbow search on the MSDQ features, scored with the Calinski-Harabasz metric.
model = KMeans(n_init=3)
visualizer = KElbowVisualizer(model, k=(2,11), metric='calinski_harabasz', timings=False)

visualizer.fit(X_rdq)
# `poof()` is deprecated since Yellowbrick 1.0; `show()` is its replacement.
visualizer.show()

#### Cluster Analysis

In [None]:
# Five-cluster KMeans behind the MSDQ preprocessing, analysed on the full frame.
# `frac=0.1` is forwarded to the helper — presumably a sampling fraction for plotting (TODO confirm).
N_C = 5
model = KMeans(n_clusters=N_C, n_init=3)
pipe = Pipeline([
    ("prep", rdq_prep),
    ("clustering", model)
])
helpers.plot.clustering.cluster_analysis(df_cleaned, pipe, N_C, rdq_cols, frac=0.1)

In [None]:
# Four-cluster KMeans behind the MSDQ preprocessing, analysed on the full frame.
N_C = 4
model = KMeans(n_clusters=N_C, n_init=3)
pipe = Pipeline([
    ("prep", rdq_prep),
    ("clustering", model)
])
helpers.plot.clustering.cluster_analysis(df_cleaned, pipe, N_C, rdq_cols, frac=0.1)

## RFM - Localisation

### Preprocessing

In [None]:
# Column groups for the RFM + localisation segmentation: one categorical, three numeric.
rfm_loc_cat_cols = ["localisation"]
rfm_loc_num_cols = ["recency", "frequency", "amount"]

# Numeric columns first, then the categorical one.
rfm_loc_cols = [*rfm_loc_num_cols, *rfm_loc_cat_cols]

In [None]:
# Numeric columns are standard-scaled; the categorical column is one-hot encoded.
num_tr = Pipeline(steps=[("scaler", StandardScaler())])
cat_tr = Pipeline(steps=[("encoder", OneHotEncoder())])
rfm_loc_prep = ColumnTransformer(transformers=[
    ("num", num_tr, rfm_loc_num_cols),
    ("cat", cat_tr, rfm_loc_cat_cols),
])

In [None]:
# Fit the RFM + localisation preprocessing on the full cleaned frame and keep the transformed matrix.
X_rfm_loc = rfm_loc_prep.fit_transform(df_cleaned)

### KMeans

#### Clustering

In [None]:
# Elbow search for KMeans on the RFM + localisation features (default scoring metric).
model = KMeans(n_init=3)
visualizer = KElbowVisualizer(model, k=(2,11), timings=False)

visualizer.fit(X_rfm_loc)
# `poof()` is deprecated since Yellowbrick 1.0; `show()` is its replacement.
visualizer.show()

In [None]:
# Same KMeans elbow search on the RFM + localisation features, scored with Calinski-Harabasz.
model = KMeans(n_init=3)
visualizer = KElbowVisualizer(model, k=(2,11), metric='calinski_harabasz', timings=False)

visualizer.fit(X_rfm_loc)
# `poof()` is deprecated since Yellowbrick 1.0; `show()` is its replacement.
visualizer.show()

#### Cluster Analysis

In [None]:
# Three-cluster KMeans behind the RFM + localisation preprocessing, analysed on the full frame.
N_C = 3
model = KMeans(n_clusters=N_C, n_init=3)
pipe = Pipeline([
    ("prep", rfm_loc_prep),
    ("clustering", model)
])
# Seed NumPy's global RNG right before the analysis — presumably so the run is repeatable.
np.random.seed(0)
# Only the numeric columns are passed as features; the categorical column goes through `cat_col`.
helpers.plot.clustering.cluster_analysis(df_cleaned, pipe, N_C, rfm_loc_num_cols, frac=0.1, cat_plot=True, cat_col="localisation")

In [None]:
# Five-cluster KMeans behind the RFM + localisation preprocessing, analysed on the full frame.
N_C = 5
model = KMeans(n_clusters=N_C, n_init=3)
pipe = Pipeline([
    ("prep", rfm_loc_prep),
    ("clustering", model)
])
# Seed NumPy's global RNG right before the analysis — presumably so the run is repeatable.
np.random.seed(0)
helpers.plot.clustering.cluster_analysis(df_cleaned, pipe, N_C, rfm_loc_num_cols, frac=0.1, cat_plot=True, cat_col="localisation")

In [None]:
# Four-cluster KMeans behind the RFM + localisation preprocessing, analysed on the full frame.
N_C = 4
model = KMeans(n_clusters=N_C, n_init=3)
pipe = Pipeline([
    ("prep", rfm_loc_prep),
    ("clustering", model)
])
# Seed NumPy's global RNG like the 3- and 5-cluster cells of this section do, so the
# result does not depend on which cells ran before this one.
np.random.seed(0)
helpers.plot.clustering.cluster_analysis(df_cleaned, pipe, N_C, rfm_loc_num_cols, frac=0.1, cat_plot=True, cat_col="localisation")

## MSDQ - Localisation

### Preprocessing

In [None]:
# Column groups for the MSDQ + localisation segmentation: one categorical, four numeric.
msdq_loc_cat_cols = ["localisation"]
msdq_loc_num_cols = ["amount", "least_satisfaction", "delay", "quantity"]

# Numeric columns first, then the categorical one.
msdq_loc_cols = [*msdq_loc_num_cols, *msdq_loc_cat_cols]

In [None]:
# Numeric columns are standard-scaled; the categorical column is one-hot encoded.
num_tr = Pipeline(steps=[("scaler", StandardScaler())])
cat_tr = Pipeline(steps=[("encoder", OneHotEncoder())])
msdq_loc_prep = ColumnTransformer(transformers=[
    ("num", num_tr, msdq_loc_num_cols),
    ("cat", cat_tr, msdq_loc_cat_cols),
])

In [None]:
# Fit the MSDQ + localisation preprocessing on the full cleaned frame and keep the transformed matrix.
X_msdq_loc = msdq_loc_prep.fit_transform(df_cleaned)

### KMeans

#### Clustering

In [None]:
# Elbow search for KMeans on the MSDQ + localisation features (default scoring metric).
model = KMeans(n_init=3)
visualizer = KElbowVisualizer(model, k=(2,11), timings=False)

visualizer.fit(X_msdq_loc)
# `poof()` is deprecated since Yellowbrick 1.0; `show()` is its replacement.
visualizer.show()

In [None]:
# Same KMeans elbow search on the MSDQ + localisation features, scored with Calinski-Harabasz.
model = KMeans(n_init=3)
visualizer = KElbowVisualizer(model, k=(2,11), metric='calinski_harabasz', timings=False)

visualizer.fit(X_msdq_loc)
# `poof()` is deprecated since Yellowbrick 1.0; `show()` is its replacement.
visualizer.show()

#### Cluster Analysis

In [None]:
# Five-cluster KMeans behind the MSDQ + localisation preprocessing, analysed on the full frame.
# Only the numeric columns are passed as features; the categorical column goes through `cat_col`.
N_C = 5
model = KMeans(n_clusters=N_C, n_init=3)
pipe = Pipeline([
    ("prep", msdq_loc_prep),
    ("clustering", model)
])
helpers.plot.clustering.cluster_analysis(df_cleaned, pipe, N_C, msdq_loc_num_cols, frac=0.1, cat_plot=True, cat_col="localisation")

In [None]:
# Four-cluster KMeans behind the MSDQ + localisation preprocessing, analysed on the full frame.
N_C = 4
model = KMeans(n_clusters=N_C, n_init=3)
pipe = Pipeline([
    ("prep", msdq_loc_prep),
    ("clustering", model)
])
helpers.plot.clustering.cluster_analysis(df_cleaned, pipe, N_C, msdq_loc_num_cols, frac=0.1, cat_plot=True, cat_col="localisation")

In [None]:
# Three-cluster KMeans behind the MSDQ + localisation preprocessing, analysed on the full frame.
N_C = 3
model = KMeans(n_clusters=N_C, n_init=3)
pipe = Pipeline([
    ("prep", msdq_loc_prep),
    ("clustering", model)
])
helpers.plot.clustering.cluster_analysis(df_cleaned, pipe, N_C, msdq_loc_num_cols, frac=0.1, cat_plot=True, cat_col="localisation")

In [None]:
# Six-cluster KMeans behind the MSDQ + localisation preprocessing, analysed on the full frame.
N_C = 6
model = KMeans(n_clusters=N_C, n_init=3)
pipe = Pipeline([
    ("prep", msdq_loc_prep),
    ("clustering", model)
])
helpers.plot.clustering.cluster_analysis(df_cleaned, pipe, N_C, msdq_loc_num_cols, frac=0.1, cat_plot=True, cat_col="localisation")