In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "0aStgWSO0E0E"
   },
   "source": [
    "# Cluster"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "1eLEkw5O0ECa"
   },
   "source": [
    "## Objectives\n",
    "\n",
    "* Fit and evaluate a cluster model to group similar data\n",
    "* Understand the profile for each cluster\n",
    "\n",
    "\n",
    "## Inputs\n",
    "\n",
    "* outputs/datasets/collection/TelcoCustomerChurn.csv\n",
    "* Instructions on which variables to use for data cleaning and feature engineering. They are found in their respective notebooks.\n",
    "\n",
    "## Outputs\n",
    "\n",
    "* Cluster Pipeline\n",
    "* Train Set\n",
    "* Most important features to define a cluster plot\n",
    "* Clusters Profile Description\n",
    "* Cluster Silhouette\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "9uWZXH9LwoQg"
   },
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Change working directory"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We need to change the working directory from its current folder to its parent folder\n",
    "* We access the current directory with os.getcwd()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "current_dir = os.getcwd()\n",
    "current_dir"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We want to make the parent of the current directory the new current directory\n",
    "* os.path.dirname() gets the parent directory\n",
    "* os.chdir() defines the new current directory"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.chdir(os.path.dirname(current_dir))\n",
    "print(\"You set a new current directory\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Confirm the new current directory"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "current_dir = os.getcwd()\n",
    "current_dir"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pXKlJFX0iuM5"
   },
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "-mavJ8DibrcQ"
   },
   "source": [
    "# Load Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "Xk7DU_ekbtX8"
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "df = (pd.read_csv(\"outputs/datasets/collection/TelcoCustomerChurn.csv\")\n",
    "      .drop(['customerID', 'TotalCharges', 'Churn', 'tenure'], axis=1)\n",
    "      )\n",
    "print(df.shape)\n",
    "df.head(3)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "krjAk78Tbyhv"
   },
   "source": [
    "# Cluster Pipeline with all data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "NZWZHhpYaDjf"
   },
   "source": [
    "##  ML Cluster Pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "C6keis6ao8LA"
   },
   "outputs": [],
   "source": [
    "from sklearn.pipeline import Pipeline\n",
    "\n",
    "# Feature Engineering\n",
    "from feature_engine.encoding import OrdinalEncoder\n",
    "from feature_engine.selection import SmartCorrelatedSelection\n",
    "\n",
    "# Feat Scaling\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "# PCA\n",
    "from sklearn.decomposition import PCA\n",
    "\n",
    "# ML algorithm\n",
    "from sklearn.cluster import KMeans\n",
    "\n",
    "\n",
    "def PipelineCluster(n_components=50, n_clusters=50):\n",
    "    \"\"\"Build a new, unfitted clustering pipeline.\n",
    "\n",
    "    Steps, in order: ordinal encoding of the categorical variables,\n",
    "    correlated-feature removal, standard scaling, PCA and KMeans.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    n_components : int, default=50\n",
    "        Forwarded to ``PCA(n_components=...)``.\n",
    "    n_clusters : int, default=50\n",
    "        Forwarded to ``KMeans(n_clusters=...)``.\n",
    "\n",
    "    The defaults are the values that used to be hard-coded here, so\n",
    "    ``PipelineCluster()`` builds the same pipeline as before.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    sklearn.pipeline.Pipeline\n",
    "    \"\"\"\n",
    "    categorical_variables = ['gender', 'Partner', 'Dependents', 'PhoneService',\n",
    "                             'MultipleLines', 'InternetService', 'OnlineSecurity',\n",
    "                             'OnlineBackup', 'DeviceProtection', 'TechSupport',\n",
    "                             'StreamingTV', 'StreamingMovies', 'Contract',\n",
    "                             'PaperlessBilling', 'PaymentMethod']\n",
    "\n",
    "    pipeline_base = Pipeline([\n",
    "        (\"OrdinalCategoricalEncoder\", OrdinalEncoder(encoding_method='arbitrary',\n",
    "                                                     variables=categorical_variables)),\n",
    "\n",
    "        (\"SmartCorrelatedSelection\", SmartCorrelatedSelection(variables=None, method=\"spearman\",\n",
    "                                                              threshold=0.6, selection_method=\"variance\")),\n",
    "\n",
    "        (\"scaler\", StandardScaler()),\n",
    "\n",
    "        (\"PCA\", PCA(n_components=n_components, random_state=0)),\n",
    "\n",
    "        (\"model\", KMeans(n_clusters=n_clusters, random_state=0)),\n",
    "    ])\n",
    "    return pipeline_base\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Mrr31sD9DyvY"
   },
   "source": [
    "## Principal Component Analysis (PCA)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "es49S65qqvRw"
   },
   "outputs": [],
   "source": [
    "pipeline_cluster = PipelineCluster()\n",
    "pipeline_pca = Pipeline(pipeline_cluster.steps[:-2])\n",
    "df_pca = pipeline_pca.fit_transform(df)\n",
    "\n",
    "print(df_pca.shape,'\\n', type(df_pca))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WlABEj9Iw6Jr"
   },
   "source": [
    "Apply PCA separately to the scaled data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "cM_Xsqxsrt5M"
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "sns.set_style(\"whitegrid\")\n",
    "\n",
    "n_components = 12\n",
    "\n",
    "\n",
    "def pca_components_analysis(df_pca, n_components):\n",
    "    \"\"\"Fit a PCA and report the variance explained by each component.\n",
    "\n",
    "    Prints the total percentage explained by the ``n_components``\n",
    "    components and draws a line plot with the per-component and the\n",
    "    accumulated explained variance ratio (in %).\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df_pca : array-like\n",
    "        Data the PCA is fitted on.\n",
    "    n_components : int\n",
    "        Number of principal components to fit and report.\n",
    "    \"\"\"\n",
    "    # only the fitted explained_variance_ratio_ is needed; the data is not projected\n",
    "    pca = PCA(n_components=n_components).fit(df_pca)\n",
    "\n",
    "    ComponentsList = [\"Component \" + str(number)\n",
    "                      for number in range(n_components)]\n",
    "    dfExplVarRatio = pd.DataFrame(\n",
    "        data=np.round(100 * pca.explained_variance_ratio_, 3),\n",
    "        index=ComponentsList,\n",
    "        columns=['Explained Variance Ratio (%)'])\n",
    "\n",
    "    dfExplVarRatio['Accumulated Variance'] = dfExplVarRatio['Explained Variance Ratio (%)'].cumsum()\n",
    "\n",
    "    PercentageOfDataExplained = dfExplVarRatio['Explained Variance Ratio (%)'].sum()\n",
    "\n",
    "    print(\n",
    "        f\"* The {n_components} components explain {round(PercentageOfDataExplained,2)}% of the data \\n\")\n",
    "    plt.figure(figsize=(9, 6))\n",
    "    sns.lineplot(data=dfExplVarRatio,  marker=\"o\")\n",
    "    plt.xticks(rotation=90)\n",
    "    plt.yticks(np.arange(0, 110, 10))\n",
    "    plt.show()\n",
    "\n",
    "\n",
    "pca_components_analysis(df_pca=df_pca, n_components=n_components)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pca_components_analysis(df_pca=df_pca,n_components=6)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def PipelineCluster(n_components=6, n_clusters=50):\n",
    "    \"\"\"Build a new, unfitted clustering pipeline.\n",
    "\n",
    "    Steps, in order: ordinal encoding of the categorical variables,\n",
    "    correlated-feature removal, standard scaling, PCA and KMeans.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    n_components : int, default=6\n",
    "        Forwarded to ``PCA(n_components=...)``.\n",
    "    n_clusters : int, default=50\n",
    "        Forwarded to ``KMeans(n_clusters=...)``.\n",
    "\n",
    "    The defaults are the values that used to be hard-coded in this cell.\n",
    "    \"\"\"\n",
    "    categorical_variables = ['gender', 'Partner', 'Dependents', 'PhoneService',\n",
    "                             'MultipleLines', 'InternetService', 'OnlineSecurity',\n",
    "                             'OnlineBackup', 'DeviceProtection', 'TechSupport',\n",
    "                             'StreamingTV', 'StreamingMovies', 'Contract',\n",
    "                             'PaperlessBilling', 'PaymentMethod']\n",
    "\n",
    "    pipeline_base = Pipeline([\n",
    "        (\"OrdinalCategoricalEncoder\", OrdinalEncoder(encoding_method='arbitrary',\n",
    "                                                     variables=categorical_variables)),\n",
    "\n",
    "        (\"SmartCorrelatedSelection\", SmartCorrelatedSelection(variables=None, method=\"spearman\",\n",
    "                                                              threshold=0.6, selection_method=\"variance\")),\n",
    "\n",
    "        (\"scaler\", StandardScaler()),\n",
    "\n",
    "        # we update n_components to 6 (now the default of the parameter)\n",
    "        (\"PCA\", PCA(n_components=n_components, random_state=0)),\n",
    "\n",
    "        (\"model\", KMeans(n_clusters=n_clusters, random_state=0)),\n",
    "    ])\n",
    "    return pipeline_base\n",
    "\n",
    "\n",
    "PipelineCluster()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Uw9NtDj4EtEJ"
   },
   "source": [
    "## Elbow Method and Silhouette Score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "JVaMnb9vGyBw"
   },
   "outputs": [],
   "source": [
    "pipeline_cluster = PipelineCluster()\n",
    "pipeline_analysis = Pipeline(pipeline_cluster.steps[:-1])\n",
    "df_analysis = pipeline_analysis.fit_transform(df)\n",
    "\n",
    "print(df_analysis.shape,'\\n', type(df_analysis))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "TZBcHjt7EwFT"
   },
   "outputs": [],
   "source": [
    "from yellowbrick.cluster import KElbowVisualizer\n",
    "\n",
    "visualizer = KElbowVisualizer(KMeans(random_state=0), k=(1,11)) # 11 is not inclusive, it will plot until 10\n",
    "visualizer.fit(df_analysis) \n",
    "visualizer.show() \n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from yellowbrick.cluster import SilhouetteVisualizer\n",
    "\n",
    "# 6 is not inclusive, it will stop at 5\n",
    "n_cluster_start, n_cluster_stop = 2, 6\n",
    "\n",
    "print(\"=== Average Silhouette Score for different number of clusters ===\")\n",
    "visualizer = KElbowVisualizer(KMeans(random_state=0), k=(\n",
    "    n_cluster_start, n_cluster_stop), metric='silhouette')\n",
    "visualizer.fit(df_analysis)\n",
    "visualizer.show()\n",
    "plt.show()\n",
    "print(\"\\n\")\n",
    "\n",
    "\n",
    "for n_clusters in np.arange(start=n_cluster_start, stop=n_cluster_stop):\n",
    "\n",
    "    print(f\"=== Silhouette plot for {n_clusters} Clusters ===\")\n",
    "    visualizer = SilhouetteVisualizer(estimator=KMeans(n_clusters=n_clusters, random_state=0),\n",
    "                                      colors='yellowbrick')\n",
    "    visualizer.fit(df_analysis)\n",
    "    visualizer.show()\n",
    "    plt.show()\n",
    "    print(\"\\n\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def PipelineCluster(n_components=6, n_clusters=3):\n",
    "    \"\"\"Build a new, unfitted clustering pipeline.\n",
    "\n",
    "    Steps, in order: ordinal encoding of the categorical variables,\n",
    "    correlated-feature removal, standard scaling, PCA and KMeans.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    n_components : int, default=6\n",
    "        Forwarded to ``PCA(n_components=...)``.\n",
    "    n_clusters : int, default=3\n",
    "        Forwarded to ``KMeans(n_clusters=...)``.\n",
    "\n",
    "    The defaults are the values that used to be hard-coded in this cell.\n",
    "    \"\"\"\n",
    "    categorical_variables = ['gender', 'Partner', 'Dependents', 'PhoneService',\n",
    "                             'MultipleLines', 'InternetService', 'OnlineSecurity',\n",
    "                             'OnlineBackup', 'DeviceProtection', 'TechSupport',\n",
    "                             'StreamingTV', 'StreamingMovies', 'Contract',\n",
    "                             'PaperlessBilling', 'PaymentMethod']\n",
    "\n",
    "    pipeline_base = Pipeline([\n",
    "        (\"OrdinalCategoricalEncoder\", OrdinalEncoder(encoding_method='arbitrary',\n",
    "                                                     variables=categorical_variables)),\n",
    "\n",
    "        (\"SmartCorrelatedSelection\", SmartCorrelatedSelection(variables=None, method=\"spearman\",\n",
    "                                                              threshold=0.6, selection_method=\"variance\")),\n",
    "\n",
    "        (\"scaler\", StandardScaler()),\n",
    "\n",
    "        (\"PCA\", PCA(n_components=n_components, random_state=0)),\n",
    "\n",
    "        # we update n_clusters to 3 (now the default of the parameter)\n",
    "        (\"model\", KMeans(n_clusters=n_clusters, random_state=0)),\n",
    "    ])\n",
    "    return pipeline_base\n",
    "\n",
    "\n",
    "PipelineCluster()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "YQBjAlRsHhU4"
   },
   "source": [
    "## Fit Cluster Pipeline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "kpxaylKk-6CQ"
   },
   "source": [
    "Quick recap of our data for training cluster pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "zfKHc63v-6Zm"
   },
   "outputs": [],
   "source": [
    "X = df.copy()\n",
    "print(X.shape)\n",
    "X.head(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "NfRpKC4Ykreg"
   },
   "source": [
    "Fit Cluster pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "MAiyUpTWHjQh"
   },
   "outputs": [],
   "source": [
    "pipeline_cluster = PipelineCluster()\n",
    "pipeline_cluster.fit(X)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "0L0iMkjJHXSI"
   },
   "source": [
    "## Add cluster predictions to dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "ZKT5IjmTmei8"
   },
   "source": [
    "We add a column \"`Clusters`\" (with the cluster pipeline predictions) to the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "ow8B0xVdmlgK"
   },
   "outputs": [],
   "source": [
    "X['Clusters'] = pipeline_cluster['model'].labels_\n",
    "print(X.shape)\n",
    "X.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "eAVrYJEqxYyG"
   },
   "outputs": [],
   "source": [
    "print(f\"* Clusters frequencies \\n{ X['Clusters'].value_counts(normalize=True).to_frame().round(2)} \\n\\n\")\n",
    "X['Clusters'].value_counts().sort_values().plot(kind='bar')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "sns.set_style(\"whitegrid\")\n",
    "plt.figure(figsize=(10, 6))\n",
    "sns.scatterplot(x=df_analysis[:, 0], y=df_analysis[:, 1],\n",
    "                hue=X['Clusters'], palette='Set1', alpha=0.6)\n",
    "plt.scatter(x=pipeline_cluster['model'].cluster_centers_[:, 0], y=pipeline_cluster['model'].cluster_centers_[:, 1],\n",
    "            marker=\"x\", s=169, linewidths=3, color=\"black\")\n",
    "plt.xlabel(\"PCA Component 0\")\n",
    "plt.ylabel(\"PCA Component 1\")\n",
    "plt.title(\"PCA Components colored by Clusters\")\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "MnjHhYjXng2r"
   },
   "source": [
    "We save the cluster predictions from this pipeline to use in the future. We will get back to that in a later stage."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "FWgb0kPOWtMa"
   },
   "outputs": [],
   "source": [
    "cluster_predictions_with_all_variables = X['Clusters']\n",
    "cluster_predictions_with_all_variables"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "sTWTf1rgkQ7b"
   },
   "source": [
    "## Fit a classifier, where the target is the cluster predictions and the features are the remaining variables"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "hP6sGUn0XyDm"
   },
   "source": [
    "We copy `X` to a DataFrame `df_clf`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "OeLq81sm2yAg"
   },
   "outputs": [],
   "source": [
    "df_clf = X.copy()\n",
    "print(df_clf.shape)\n",
    "df_clf.head(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "4b3Ei6Os5X3s"
   },
   "source": [
    "Split Train and Test sets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "cgHXehCVyzUl"
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    df_clf.drop(['Clusters'], axis=1),\n",
    "    df_clf['Clusters'],\n",
    "    test_size=0.2,\n",
    "    random_state=0\n",
    ")\n",
    "\n",
    "print(X_train.shape, X_test.shape)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "6EZUk-uV5aN8"
   },
   "source": [
    "Create classifier pipeline steps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Feat Selection\n",
    "from sklearn.feature_selection import SelectFromModel\n",
    "\n",
    "# ML algorithm\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "\n",
    "\n",
    "def PipelineClf2ExplainClusters():\n",
    "    \"\"\"Return a new, unfitted classifier pipeline used to explain the clusters.\n",
    "\n",
    "    Steps, in order: ordinal encoding of the categorical variables,\n",
    "    correlated-feature removal, standard scaling, feature selection driven\n",
    "    by a gradient-boosting model, and a gradient-boosting classifier.\n",
    "    \"\"\"\n",
    "    categorical_variables = ['gender', 'Partner', 'Dependents', 'PhoneService',\n",
    "                             'MultipleLines', 'InternetService', 'OnlineSecurity',\n",
    "                             'OnlineBackup', 'DeviceProtection', 'TechSupport',\n",
    "                             'StreamingTV', 'StreamingMovies', 'Contract',\n",
    "                             'PaperlessBilling', 'PaymentMethod']\n",
    "\n",
    "    encoder = OrdinalEncoder(encoding_method='arbitrary',\n",
    "                             variables=categorical_variables)\n",
    "    correlation_filter = SmartCorrelatedSelection(variables=None, method=\"spearman\",\n",
    "                                                  threshold=0.6, selection_method=\"variance\")\n",
    "    feature_selector = SelectFromModel(GradientBoostingClassifier(random_state=0))\n",
    "    classifier = GradientBoostingClassifier(random_state=0)\n",
    "\n",
    "    return Pipeline([\n",
    "        (\"OrdinalCategoricalEncoder\", encoder),\n",
    "        (\"SmartCorrelatedSelection\", correlation_filter),\n",
    "        (\"scaler\", StandardScaler()),\n",
    "        (\"feat_selection\", feature_selector),\n",
    "        (\"model\", classifier),\n",
    "    ])\n",
    "\n",
    "\n",
    "PipelineClf2ExplainClusters()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Fit the classifier to the training data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "3R7xdg1Av0Ce"
   },
   "outputs": [],
   "source": [
    "pipeline_clf_cluster = PipelineClf2ExplainClusters()\n",
    "pipeline_clf_cluster.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "z05LMFoZ4T2K"
   },
   "source": [
    "## Evaluate classifier performance on Train and Test Sets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "M1iqL2Kc544K"
   },
   "outputs": [],
   "source": [
    "from sklearn.metrics import classification_report\n",
    "print(classification_report(y_train, pipeline_clf_cluster.predict(X_train)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "0Oo4xJMZ615p"
   },
   "outputs": [],
   "source": [
    "print(classification_report(y_test, pipeline_clf_cluster.predict(X_test)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "MEwjHBSh5ejG"
   },
   "source": [
    "## Assess the most important Features that define a cluster"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "BG5ztHxsKcd5"
   },
   "outputs": [],
   "source": [
    "# after data cleaning and feature engineering, the feature space changes\n",
    "\n",
    "# how many data cleaning and feature engineering steps does your pipeline have?\n",
    "data_cleaning_feat_eng_steps = 2\n",
    "columns_after_data_cleaning_feat_eng = (Pipeline(pipeline_clf_cluster.steps[:data_cleaning_feat_eng_steps])\n",
    "                                        .transform(X_train)\n",
    "                                        .columns)\n",
    "\n",
    "# boolean mask of the columns kept by the feature-selection step (computed once)\n",
    "selected_features_mask = pipeline_clf_cluster['feat_selection'].get_support()\n",
    "\n",
    "# create DataFrame to display feature importance\n",
    "df_feature_importance = (pd.DataFrame(data={\n",
    "    'Feature': columns_after_data_cleaning_feat_eng[selected_features_mask],\n",
    "    'Importance': pipeline_clf_cluster['model'].feature_importances_})\n",
    "    .sort_values(by='Importance', ascending=False)\n",
    ")\n",
    "\n",
    "# best features, in descending importance order\n",
    "best_features = df_feature_importance['Feature'].to_list()\n",
    "\n",
    "# Most important features statement and plot\n",
    "print(f\"* These are the {len(best_features)} most important features in descending order. \"\n",
    "      f\"The model was trained on them: \\n{best_features} \\n\")\n",
    "df_feature_importance.plot(kind='bar', x='Feature', y='Importance')\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "qgul0EF9nx_E"
   },
   "source": [
    "We will store the best_features to use at a later stage."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "YzyMkwHznyG8"
   },
   "outputs": [],
   "source": [
    "best_features_pipeline_all_variables = best_features\n",
    "best_features_pipeline_all_variables"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "J2ywCxJmkRQn"
   },
   "source": [
    "## Cluster Analysis"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "hZMr-wiudEkb"
   },
   "source": [
    "Load the functions that build a table with a description of each cluster"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "_lpRVDqTdEul"
   },
   "outputs": [],
   "source": [
    "\n",
    "def DescriptionAllClusters(df, decimal_points=3):\n",
    "    \"\"\"Build a table with one descriptive row per cluster.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Must contain a 'Clusters' column; every other column is described.\n",
    "    decimal_points : int, default=3\n",
    "        Forwarded to Clusters_IndividualDescription().\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        Indexed by 'Cluster', rows ordered from the highest cluster label\n",
    "        to the lowest. Empty when ``df`` has no rows.\n",
    "    \"\"\"\n",
    "    described_columns = df.drop(['Clusters'], axis=1).columns\n",
    "\n",
    "    # one description per cluster, collected first and concatenated once\n",
    "    # (growing a DataFrame with concat inside the loop is quadratic)\n",
    "    descriptions = []\n",
    "    for cluster in df.sort_values(by='Clusters')['Clusters'].unique():\n",
    "        EDA_ClusterSubset = df[df['Clusters'] == cluster].drop(['Clusters'], axis=1)\n",
    "        descriptions.append(Clusters_IndividualDescription(\n",
    "            EDA_ClusterSubset, cluster, decimal_points))\n",
    "\n",
    "    if not descriptions:\n",
    "        empty_description = pd.DataFrame(columns=described_columns)\n",
    "        empty_description.index.name = 'Cluster'\n",
    "        return empty_description\n",
    "\n",
    "    # reversed: keeps the row order of the earlier implementation, which\n",
    "    # prepended each new cluster to the accumulated table\n",
    "    return pd.concat(descriptions[::-1]).set_index(['Cluster'])\n",
    "\n",
    "\n",
    "def Clusters_IndividualDescription(EDA_Cluster, cluster, decimal_points):\n",
    "\n",
    "    ClustersDescription = pd.DataFrame(columns=EDA_Cluster.columns)\n",
    "    # for a given cluster, iterate over all columns\n",
    "    # if the variable is numerical, calculate the IQR: display as Q1 -- Q3.\n",
    "    # That will show the range for the most common values for the numerical variable\n",
    "    # if the variable is categorical, count the frequencies and displays the top 3 most frequent\n",
    "    # That will show the most common levels for the category\n",
    "\n",
    "    for col in EDA_Cluster.columns:\n",
    "\n",
    "        try:  # eventually a given cluster will have only missing data for a given variable\n",
    "\n",
    "            if EDA_Cluster[col].dtypes == 'object':\n",
    "\n",
    "                top_frequencies = EDA_Cluster.dropna(\n",
    "                    subset=[col])[[col]].value_counts(normalize=True).nlargest(n=3)\n",
    "                Description = ''\n",
    "\n",
    "                for x in range(len(top_frequencies)):\n",
    "                    freq = top_frequencies.iloc[x]\n",
    "                    category = top_frequencies.index[x][0]\n",
    "                    CategoryPercentage = int(round(freq*100, 0))\n",
    "                    statement = f\"'{category}': {CategoryPercentage}% , \"\n",
    "                    Description = Description + statement\n",
    "\n",
    "                ClustersDescription.at[0, col] = Description[:-2]\n",
    "\n",
    "            elif EDA_Cluster[col].dtypes in ['float', 'int']:\n",
    "                DescStats = EDA_Cluster.dropna(subset=[col])[[col]].describe()\n",
    "                Q1 = round(DescStats.iloc[4, 0], decimal_points)\n",
    "                Q3 = round(DescStats.iloc[6, 0], decimal_points)\n",
    "                Description = f\"{Q1} -- {Q3}\"\n",
    "                ClustersDescription.at[0, col] = Description\n",
    "\n",
    "        except Exception as e:\n",
    "            ClustersDescription.at[0, col] = 'Not available'\n",
    "            print(\n",
    "                f\"** Error Exception: {e} - cluster {cluster}, variable {col}\")\n",
    "\n",
    "    ClustersDescription['Cluster'] = str(cluster)\n",
    "\n",
    "    return ClustersDescription\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "OHo7wmH68AYc"
   },
   "source": [
    "Load a custom function to plot cluster distribution per Variable (absolute and relative levels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "NN23X2dT8AeA"
   },
   "outputs": [],
   "source": [
    "import plotly.express as px\n",
    "\n",
    "\n",
    "def cluster_distribution_per_variable(df, target, renderer='jupyterlab'):\n",
    "    \"\"\"\n",
    "    Plot how the clusters are distributed across the levels of ``target``.\n",
    "\n",
    "    ``df`` needs a 'Clusters' column and a ``target`` column. Two plotly\n",
    "    express figures are shown:\n",
    "    1. a bar plot with the row count per cluster, coloured by target level;\n",
    "    2. a line plot with the relative percentage (%) of each target level\n",
    "       within each cluster.\n",
    "\n",
    "    ``renderer`` is forwarded to ``fig.show()``; it defaults to\n",
    "    'jupyterlab', the value that used to be hard-coded.\n",
    "    \"\"\"\n",
    "    cluster_ticks = df['Clusters'].unique()\n",
    "\n",
    "    # reset_index(name='Count') already yields the columns ['Clusters', target, 'Count']\n",
    "    df_bar_plot = df.groupby(['Clusters', target]).size().reset_index(name='Count')\n",
    "    df_bar_plot[target] = df_bar_plot[target].astype('object')\n",
    "\n",
    "    print(f\"Clusters distribution across {target} levels\")\n",
    "    fig = px.bar(df_bar_plot, x='Clusters', y='Count',\n",
    "                 color=target, width=800, height=500)\n",
    "    fig.update_layout(xaxis=dict(tickmode='array', tickvals=cluster_ticks))\n",
    "    fig.show(renderer=renderer)\n",
    "\n",
    "    # share (%) of each target level inside every cluster: rows sum to 100\n",
    "    df_relative = (df\n",
    "                   .groupby([\"Clusters\", target])\n",
    "                   .size()\n",
    "                   .unstack(fill_value=0)\n",
    "                   .apply(lambda x: 100 * x / x.sum(), axis=1)\n",
    "                   .stack()\n",
    "                   .reset_index(name='Relative Percentage (%)')\n",
    "                   .sort_values(by=['Clusters', target])\n",
    "                   )\n",
    "\n",
    "    print(f\"Relative Percentage (%) of {target} in each cluster\")\n",
    "    fig = px.line(df_relative, x='Clusters', y='Relative Percentage (%)',\n",
    "                  color=target, width=800, height=500)\n",
    "    fig.update_layout(xaxis=dict(tickmode='array', tickvals=cluster_ticks))\n",
    "    fig.update_traces(mode='markers+lines')\n",
    "    fig.show(renderer=renderer)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "73J7J65v4O_d"
   },
   "source": [
    "Create a DataFrame that contains best features and Clusters Predictions since we want to analyse the patterns for each cluster.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "PztdhjGl4Vkg"
   },
   "outputs": [],
   "source": [
    "df_cluster_profile = df_clf.copy()\n",
    "df_cluster_profile = df_cluster_profile.filter(items=best_features + ['Clusters'], axis=1)\n",
    "print(df_cluster_profile.shape)\n",
    "df_cluster_profile.head(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "-mfJRrFc7wzu"
   },
   "source": [
    "We also want to analyse Churn levels."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "fSRSNqiF4mnm"
   },
   "outputs": [],
   "source": [
    "df_churn = pd.read_csv(\"outputs/datasets/collection/TelcoCustomerChurn.csv\").filter(['Churn'])\n",
    "df_churn['Churn'] = df_churn['Churn'].astype('object')\n",
    "df_churn.head(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "KtD0Y3NdJOhm"
   },
   "source": [
    "### Cluster profile based on the best features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "LDhycaSEdORm"
   },
   "outputs": [],
   "source": [
    "pd.set_option('display.max_colwidth', None)\n",
    "clusters_profile = DescriptionAllClusters(df=pd.concat([df_cluster_profile,df_churn], axis=1), decimal_points=0)\n",
    "clusters_profile"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "2SS6CCCb74lH"
   },
   "source": [
    "### Clusters distribution across Churn levels & Relative Percentage of Churn in each cluster"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "kwEUdPI2NHOb"
   },
   "outputs": [],
   "source": [
    "df_cluster_vs_churn=  df_churn.copy()\n",
    "df_cluster_vs_churn['Clusters'] = X['Clusters']\n",
    "cluster_distribution_per_variable(df=df_cluster_vs_churn, target='Churn')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "iEfOvjx8ZhAH"
   },
   "source": [
    "# Fit New Cluster Pipeline with most important features"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "MFno4XnSZlZV"
   },
   "source": [
    "To reduce the feature space, we will study the trade-off between the previous Cluster Pipeline (fitted with all variables) and a new pipeline fitted only with the variables that were most important for defining the clusters in the previous pipeline."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "ROH8cre2PHWx"
   },
   "outputs": [],
   "source": [
    "best_features_pipeline_all_variables"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "cLOL2zr4Jr68"
   },
   "source": [
    "## Define trade-off and metrics to compare new and previous Cluster Pipeline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "eJyxkSowZ9Cm"
   },
   "source": [
    "To evaluate this trade-off we will\n",
    "1. Conduct an elbow method and silhouette analysis and check if the same number of clusters is suggested\n",
    "2. Fit a new cluster pipeline and check whether its predictions are \"equivalent\" to the predictions from the previous pipeline\n",
    "3. Fit a classifier to explain the clusters, and check if performance on the Train and Test sets is similar to the previous pipeline\n",
    "4. Check if the most important features for the classifier are the same as in the previous pipeline\n",
    "5. Compare whether the cluster profiles from both pipelines are \"equivalent\"\n",
    "\n",
    "If we can answer **yes** to all of them, we can use a cluster pipeline built only on the features that best define the clusters from the previous pipeline!\n",
    "* The **gain** is that in real-time (which is the major purpose of Machine Learning) you will need fewer variables for predicting clusters for your prospects."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "DxyTpm4EJx4s"
   },
   "source": [
    "## Subset data with the most relevant variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "lQu9033oZ6r9"
   },
   "outputs": [],
   "source": [
    "df_reduced = df.filter(best_features_pipeline_all_variables)\n",
    "df_reduced.head(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "_ub9_YoIeaS5"
   },
   "source": [
    "## Rewrite Cluster Pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "PrlQuieZeaS6"
   },
   "outputs": [],
   "source": [
    "def PipelineCluster():\n",
    "    pipeline_base = Pipeline([\n",
    "\n",
    "        # we update the pipeline, considering only the most important variables from the previous pipeline\n",
    "        (\"OrdinalCategoricalEncoder\", OrdinalEncoder(encoding_method='arbitrary',\n",
    "                                                     variables=['OnlineBackup', 'PhoneService'])),\n",
    "\n",
    "        # it doesn't need SmartCorrelation\n",
    "\n",
    "        (\"scaler\", StandardScaler()),\n",
    "\n",
    "        # No PCA step needed, since we know which features to consider\n",
    "\n",
    "        (\"model\", KMeans(n_clusters=3, random_state=0)),\n",
    "\n",
    "\n",
    "    ])\n",
    "    return pipeline_base\n",
    "\n",
    "\n",
    "PipelineCluster()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "D57ncdQ7hBXe"
   },
   "source": [
    "## Apply Elbow Method and Silhouette analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "D0wqQOM3hBXr"
   },
   "outputs": [],
   "source": [
    "pipeline_cluster = PipelineCluster()\n",
    "pipeline_analysis = Pipeline(pipeline_cluster.steps[:-1])\n",
    "df_analysis = pipeline_analysis.fit_transform(df_reduced)\n",
    "\n",
    "print(df_analysis.shape,'\\n', type(df_analysis))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "r_1H05FKhBXs"
   },
   "source": [
    "Elbow Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "nsAJW4s0hBXt"
   },
   "outputs": [],
   "source": [
    "from yellowbrick.cluster import KElbowVisualizer\n",
    "visualizer = KElbowVisualizer(KMeans(random_state=0), k=(1,11))\n",
    "visualizer.fit(df_analysis) \n",
    "visualizer.show() \n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from yellowbrick.cluster import SilhouetteVisualizer\n",
    "\n",
    "n_cluster_start, n_cluster_stop = 2, 5\n",
    "\n",
    "print(\"=== Average Silhouette Score for different number of clusters ===\")\n",
    "visualizer = KElbowVisualizer(KMeans(random_state=0), k=(\n",
    "    n_cluster_start, n_cluster_stop), metric='silhouette')\n",
    "visualizer.fit(df_analysis)\n",
    "visualizer.show()\n",
    "plt.show()\n",
    "print(\"\\n\")\n",
    "\n",
    "\n",
    "for n_clusters in np.arange(start=n_cluster_start, stop=n_cluster_stop):\n",
    "\n",
    "    print(f\"=== Silhouette plot for {n_clusters} Clusters ===\")\n",
    "    visualizer = SilhouetteVisualizer(estimator=KMeans(n_clusters=n_clusters, random_state=0),\n",
    "                                      colors='yellowbrick')\n",
    "    visualizer.fit(df_analysis)\n",
    "    visualizer.show()\n",
    "    plt.show()\n",
    "    print(\"\\n\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "f_T1gtprhe8W"
   },
   "source": [
    "## Fit New Cluster Pipeline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "ZEtzurhOhe8W"
   },
   "source": [
    "We set X as our training set for the cluster. It is a copy of df_reduced"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "BIZRR3wChe8X"
   },
   "outputs": [],
   "source": [
    "X = df_reduced.copy()\n",
    "print(X.shape)\n",
    "X.head(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "O-z3ST3nhe8Z"
   },
   "source": [
    "Fit Cluster pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "AAaEPy3Uhe8Z"
   },
   "outputs": [],
   "source": [
    "pipeline_cluster = PipelineCluster()\n",
    "pipeline_cluster.fit(X)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "_Pdrt5bdKLDF"
   },
   "source": [
    "## Add cluster predictions to dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "CdFwZk1ihe8b"
   },
   "source": [
    "We add a column \"`Clusters`\" (with the cluster pipeline predictions) to the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X['Clusters'] = pipeline_cluster['model'].labels_\n",
    "print(X.shape)\n",
    "X.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "wUwR8vCEhe8b"
   },
   "outputs": [],
   "source": [
    "print(f\"* Clusters frequencies \\n{ X['Clusters'].value_counts(normalize=True).to_frame().round(2)} \\n\\n\")\n",
    "X['Clusters'].value_counts().sort_values().plot(kind='bar')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "QUxjeMypKOUe"
   },
   "source": [
    "## Compare current cluster predictions to previous cluster predictions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "w6721kuGiII6"
   },
   "source": [
    "We just fitted a new cluster pipeline and want to compare if its predictions are \"equivalent\" to the previous cluster."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "GAS2dDXQhe8c"
   },
   "source": [
    "These are the predictions from the **previous** cluster pipeline - trained with all variables "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "pbVaUAABhe8c"
   },
   "outputs": [],
   "source": [
    "cluster_predictions_with_all_variables"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "xUiHzLIwimaD"
   },
   "source": [
    "And these are the predictions from **current** cluster pipeline (trained with `['OnlineBackup', 'MonthlyCharges', 'PhoneService']`)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "_kC7cKKCiwD5"
   },
   "outputs": [],
   "source": [
    "cluster_predictions_with_best_features = X['Clusters'] \n",
    "cluster_predictions_with_best_features"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "3Jt-2n1GidBa"
   },
   "source": [
    "We use a confusion matrix to evaluate if the predictions of both pipelines are **\"equivalent\"**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "tLy37N1TiiSx"
   },
   "outputs": [],
   "source": [
    "from sklearn.metrics import confusion_matrix\n",
    "print(confusion_matrix(cluster_predictions_with_all_variables, cluster_predictions_with_best_features))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "fcPfalkwmomc"
   },
   "source": [
    "## Fit a classifier, where the target is cluster predictions and features remaining variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "_XoL6tuRmomf"
   },
   "outputs": [],
   "source": [
    "df_clf = X.copy()\n",
    "print(df_clf.shape)\n",
    "df_clf.head(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "PSfwpL-Fmomf"
   },
   "source": [
    "Split Train and Test sets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "sPyXs27Kmomf"
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    df_clf.drop(['Clusters'], axis=1),\n",
    "    df_clf['Clusters'],\n",
    "    test_size=0.2,\n",
    "    random_state=0\n",
    ")\n",
    "\n",
    "print(X_train.shape, X_test.shape)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "0W-cV2ts8A_N"
   },
   "source": [
    "Rewrite pipeline to explain clusters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "Lm63GRYP8BIV"
   },
   "outputs": [],
   "source": [
    "def PipelineClf2ExplainClusters():\n",
    "    pipeline_base = Pipeline([\n",
    "\n",
    "        (\"OrdinalCategoricalEncoder\", OrdinalEncoder(encoding_method='arbitrary',\n",
    "                                                     variables=['OnlineBackup', 'PhoneService'])),\n",
    "\n",
    "        # it doesn't need SmartCorrelation\n",
    "\n",
    "        (\"scaler\", StandardScaler()),\n",
    "\n",
    "        # we don't consider feature selection step, since we know which features to consider\n",
    "\n",
    "        (\"model\", GradientBoostingClassifier(random_state=0)),\n",
    "\n",
    "    ])\n",
    "    return pipeline_base\n",
    "\n",
    "\n",
    "PipelineClf2ExplainClusters()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "lkaBhgjOmomg"
   },
   "source": [
    "## Fit a classifier, where the target is cluster labels and features remaining variables"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "zU6mwsFYmomg"
   },
   "source": [
    "Create and fit a classifier pipeline to learn the feature importance when defining a cluster"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "w3liI7qjmomg"
   },
   "outputs": [],
   "source": [
    "pipeline_clf_cluster = PipelineClf2ExplainClusters()\n",
    "pipeline_clf_cluster.fit(X_train,y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "6hCk6Swrmomh"
   },
   "source": [
    "## Evaluate classifier performance on Train and Test Sets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "ZjXpF0x8momh"
   },
   "outputs": [],
   "source": [
    "print(classification_report(y_train, pipeline_clf_cluster.predict(X_train)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "l-Obn_Hcmomh"
   },
   "outputs": [],
   "source": [
    "print(classification_report(y_test, pipeline_clf_cluster.predict(X_test)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "G07251XWmomh"
   },
   "source": [
    "## Assess Most Important Features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "IMTUNBYN8fyf"
   },
   "outputs": [],
   "source": [
    "# since we don't have feature selection step in this pipeline, best_features is Xtrain columns\n",
    "best_features = X_train.columns.to_list()\n",
    "\n",
    "# create a DataFrame to display feature importance\n",
    "df_feature_importance = (pd.DataFrame(data={\n",
    "    'Feature': best_features,\n",
    "    'Importance': pipeline_clf_cluster['model'].feature_importances_})\n",
    "    .sort_values(by='Importance', ascending=False)\n",
    ")\n",
    "\n",
    "best_features = df_feature_importance['Feature'].to_list()\n",
    "\n",
    "# Most important features statement and plot\n",
    "print(f\"* These are the {len(best_features)} most important features in descending order. \"\n",
    "      f\"The model was trained on them: \\n{df_feature_importance['Feature'].to_list()}\")\n",
    "\n",
    "df_feature_importance.plot(kind='bar', x='Feature', y='Importance')\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "9q1TJSqdI6xK"
   },
   "source": [
    "## Cluster Analysis"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "C8EMIqE5I6xP"
   },
   "source": [
    "Create a DataFrame that contains the best features and Clusters Predictions: we want to analyse the patterns for each cluster.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "TEg92vdnI6xP"
   },
   "outputs": [],
   "source": [
    "df_cluster_profile = df_clf.copy()\n",
    "df_cluster_profile = df_cluster_profile.filter(items=best_features + ['Clusters'], axis=1)\n",
    "df_cluster_profile.head(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "789CeA0WI6xQ"
   },
   "source": [
    "We want also to analyse Churn levels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "Nx335aqtI6xR"
   },
   "outputs": [],
   "source": [
    "df_churn = pd.read_csv(\"outputs/datasets/collection/TelcoCustomerChurn.csv\").filter(['Churn'])\n",
    "df_churn['Churn'] = df_churn['Churn'].astype('object')\n",
    "df_churn.head(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "f-7jYzhtI6xR"
   },
   "source": [
    "### Cluster profile on most important features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "urBmw5HJI6xS"
   },
   "outputs": [],
   "source": [
    "pd.set_option('display.max_colwidth', None)\n",
    "clusters_profile = DescriptionAllClusters(df= pd.concat([df_cluster_profile,df_churn], axis=1), decimal_points=0)\n",
    "clusters_profile"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "LDJRuBBgI6xS"
   },
   "source": [
    "### Clusters distribution across Churn levels & Relative Percentage of Churn in each cluster"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "ZjMnAqYKI6xS"
   },
   "outputs": [],
   "source": [
    "df_cluster_vs_churn=  df_churn.copy()\n",
    "df_cluster_vs_churn['Clusters'] = X['Clusters']\n",
    "cluster_distribution_per_variable(df=df_cluster_vs_churn, target='Churn')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "4xhPLbC4dwXL"
   },
   "source": [
    "## Which pipeline should I deploy?"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "-8qASh5k1jph"
   },
   "source": [
    "Let's recap the criteria we consider to evaluate the **trade-off**\n",
    "1. Conduct an elbow method and silhouette analysis and check if the same number of clusters is suggested.\n",
    "2. Fit a new cluster pipeline and compare if the predictions from this pipeline are \"equivalent\" to the predictions from the previous pipeline.\n",
    "3. Fit a classifier to explain cluster and check if performance on Train and Test sets is similar to the previous pipeline.\n",
    "4. Check if the most important features for the classifier are the same from the previous pipeline.\n",
    "5. Compare if the cluster profile from both pipelines is \"equivalent\".\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "a4HsuSuqd0g_"
   },
   "outputs": [],
   "source": [
    "pipeline_cluster"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "4zxpjktKd1n6"
   },
   "source": [
    "# Push files to Repo"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "5i9X1oOORAQc"
   },
   "source": [
    "\n",
    "We will generate the following files\n",
    "\n",
    "* Cluster Pipeline\n",
    "* Train Set\n",
    "* Feature importance plot\n",
    "* Clusters Description\n",
    "* Cluster Silhouette\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "5ySBIrV1Q4cY"
   },
   "outputs": [],
   "source": [
    "import joblib\n",
    "import os\n",
    "\n",
    "version = 'v1'\n",
    "file_path = f'outputs/ml_pipeline/cluster_analysis/{version}'\n",
    "\n",
    "try:\n",
    "    os.makedirs(name=file_path)\n",
    "except Exception as e:\n",
    "    print(e)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "6y9-0fisd5cl"
   },
   "source": [
    "## Cluster pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "Xfv9k5xMd7fv"
   },
   "outputs": [],
   "source": [
    "pipeline_cluster"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "IsphnIR84hJ4"
   },
   "outputs": [],
   "source": [
    "joblib.dump(value=pipeline_cluster, filename=f\"{file_path}/cluster_pipeline.pkl\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "_ORnkwG6d74O"
   },
   "source": [
    "## Train Set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "QqcwHaVwd9Ff"
   },
   "outputs": [],
   "source": [
    "print(df_reduced.shape)\n",
    "df_reduced.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "M26MiJ9Y485Q"
   },
   "outputs": [],
   "source": [
    "df_reduced.to_csv(f\"{file_path}/TrainSet.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "eX_lcQVXaV0p"
   },
   "source": [
    "## Most important features plot"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "o9datNfsLCVV"
   },
   "source": [
    "These are the features that define a cluster"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "FYeoH7fjaV8J"
   },
   "outputs": [],
   "source": [
    "df_feature_importance.plot(kind='bar',x='Feature',y='Importance', figsize=(8,4))\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "sr0cVVQsaZqk"
   },
   "outputs": [],
   "source": [
    "df_feature_importance.plot(kind='bar',x='Feature',y='Importance', figsize=(8,4))\n",
    "plt.savefig(f\"{file_path}/features_define_cluster.png\", bbox_inches='tight', dpi=150)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "GX3Z5ivNd9mw"
   },
   "source": [
    "## Cluster Profile"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "tw-mEnI8d_Bv"
   },
   "outputs": [],
   "source": [
    "clusters_profile"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "7G5CsAl738p7"
   },
   "outputs": [],
   "source": [
    "clusters_profile.to_csv(f\"{file_path}/clusters_profile.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "RObeac1HQq5a"
   },
   "source": [
    "## Cluster silhouette plot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "visualizer = SilhouetteVisualizer(Pipeline(pipeline_cluster.steps[-1:])[0] , colors='yellowbrick')\n",
    "visualizer.fit(df_analysis)\n",
    "visualizer.show()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axes = plt.subplots(figsize=(7,5))\n",
    "fig = SilhouetteVisualizer(Pipeline(pipeline_cluster.steps[-1:])[0] , colors='yellowbrick', ax=axes)\n",
    "fig.fit(df_analysis)\n",
    "\n",
    "plt.savefig(f\"{file_path}/clusters_silhouette.png\", bbox_inches='tight',dpi=150)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "TeIEUrWkJ-6_"
   },
   "source": [
    "Good job, clear the cell outputs, run git commands to add, commit and push files to the repo. Next, we will move on to create our dashboard!"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "name": "Modeling and Evaluation - Cluster Sklearn.ipynb",
   "provenance": [],
   "toc_visible": true
  },
  "interpreter": {
   "hash": "8b8334dab9339717f727a1deaf837b322d7a41c20d15cc86be99a8e69ceec8ce"
  },
  "kernelspec": {
   "display_name": "Python 3.8.12 64-bit ('3.8.12': pyenv)",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12 (default, Sep 27 2022, 15:56:02) \n[GCC 9.4.0]"
  },
  "orig_nbformat": 2
 },
 "nbformat": 4,
 "nbformat_minor": 2
}