### Churn Prediction 🏃‍💨

In [None]:
import pandas as pd

In [None]:
# Read the churn dataset from the Kaggle input directory and preview the first rows.
csv_path = "/kaggle/input/churn-modeling-dataset/Churn_Modelling.csv"
df = pd.read_csv(csv_path)
df.head()

<blockquote><p style="font-size:16px; color:#159364; font-family:verdana;">💬 We don't need the <b>RowNumber, CustomerId, Surname</b> columns, <code>so just drop them 🗑</code></p></blockquote>


In [None]:
# Remove the columns the analysis does not need.
# Reassigning the result (instead of inplace=True) together with
# errors="ignore" keeps this cell idempotent: re-running it after the columns
# are already gone no longer raises a KeyError.
df = df.drop(columns=["RowNumber", "CustomerId", "Surname"], errors="ignore")
df.shape

### Missing Values

In [None]:
# Number of missing entries in each column.
missing_per_column = df.isna().sum()
missing_per_column

<blockquote><p style="font-size:16px; color:#159364; font-family:verdana;">💬 Is the data balanced?</p></blockquote>

In [None]:
# Share of each `Exited` label in the dataset (fractions sum to 1).
exited_share = df["Exited"].value_counts(normalize=True)
exited_share

<div class="alert alert-block alert-info" style="font-size:14px; font-family:verdana; line-height: 1.7em;">
    📌 &nbsp; Only about 20% of the <code>Exited</code> labels in the whole dataset are 1, so we need to apply a data-balancing technique. In this notebook we will use SMOTE.
</div>

### Show data by geography

In [None]:
import seaborn as sns

# Bar chart with the number of rows per `Geography` value.
ax = sns.countplot(
    data=df,
    x="Geography",
)

### Split the data into training and validation sets

In [None]:
# Target vector: the `Exited` label.
y = df['Exited']
# Feature matrix: every remaining column.
X = df.drop(columns='Exited')

<div class="alert alert-block alert-info" style="font-size:14px; font-family:verdana; line-height: 1.7em;">
    📌 &nbsp; Note that we have an imbalanced dataset, so "it is desirable to split the dataset into train and test sets in a way that preserves the same proportions of examples in each class as observed in the original dataset". We can do this using the <code> stratify </code> parameter.
</div>

[**Reference**](https://machinelearningmastery.com/train-test-split-for-evaluating-machine-learning-algorithms/)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. `stratify=y` keeps the class
# proportions of `y` in both subsets; the fixed seed makes the split repeatable.
x_train, x_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    train_size=0.8,
    stratify=y,
    random_state=0,
)

<blockquote><p style="font-size:16px; color:#159364; font-family:verdana;">💬 Select Numerical and Categorical columns</p></blockquote>

In [None]:
# Columns handled by the categorical branch of the preprocessing step
categorical_cols = ["Geography", "Gender"]

# Columns handled by the numerical branch of the preprocessing step
numerical_cols = [
    "CreditScore", "Age", "Tenure", "Balance",
    "NumOfProducts", "HasCrCard", "IsActiveMember", "EstimatedSalary",
]

### Using ColumnTransformer

In [None]:
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Bundle preprocessing for numerical and categorical data:
#   - numerical columns are standardized with StandardScaler,
#   - categorical columns are one-hot encoded.
# handle_unknown='ignore' encodes a category that was not seen during fit as
# all zeros instead of raising at predict time (e.g. new data containing an
# unseen Geography value).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ])

### SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

# SMOTE over-sampler with a fixed seed so the resampling is repeatable.
# It is used as the resampling step of the pipeline built further down.
# NOTE(review): in that pipeline SMOTE runs after one-hot encoding, so the
# synthetic rows may carry fractional dummy values — consider SMOTENC; confirm.
smt = SMOTE(random_state=42)

### Define the HistGradientBoostingClassifier model

In [None]:
# HistGradientBoostingClassifier is a stable estimator since scikit-learn 1.0;
# importing the experimental flag there only emits a deprecation warning.
# Fall back to the flag only on older versions where the plain import fails.
try:
    from sklearn.ensemble import HistGradientBoostingClassifier
except ImportError:
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
    from sklearn.ensemble import HistGradientBoostingClassifier

# Fix the seed: the estimator uses randomness for the internal train/validation
# split when early stopping is active, so without it results can differ
# between runs.
model = HistGradientBoostingClassifier(random_state=42)

### Create the Pipeline

<div class="alert alert-block alert-info" style="font-size:14px; font-family:verdana; line-height: 1.7em;">
    📌 &nbsp; The pipeline has the following steps: <b>Preprocessing, SMOTE, Model</b>.
</div>

In [None]:
# Chain the three stages in order: preprocessing -> SMOTE resampling -> classifier.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('smote', smt),
    ('model', model),
]
pipeline = Pipeline(steps=pipeline_steps)

# Fit every stage of the pipeline on the training split.
pipeline.fit(x_train, y_train)

[**Reference**](https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.pipeline.Pipeline.html)

### Predict

In [None]:
# Predict labels for the validation split with the fitted pipeline.
y_pred = pipeline.predict(x_valid)

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the validation split.
# classification_report returns plain text, so it is printed rather than displayed.
report = classification_report(y_valid, y_pred)
print(report)

### New data

In [None]:
# A single hand-written customer record, using the same feature columns
# the pipeline was trained on.
new_customer = {
    'CreditScore': 500,
    'Geography': 'Spain',
    'Gender': 'Female',
    'Age': 30,
    'Tenure': 1,
    'Balance': 0.,
    'NumOfProducts': 2,
    'HasCrCard': 0,
    'IsActiveMember': 1,
    'EstimatedSalary': 10258.2,
}
# index=[0] is required because every value is a scalar.
new_data = pd.DataFrame(new_customer, index=[0])
new_data

In [None]:
# Run the fitted pipeline on the hand-written record; the cell output is the
# predicted `Exited` label for that row.
pipeline.predict(new_data)