# Feature Engineering

---

1. Import packages
2. Load data
3. Feature engineering

---

## 1. Import packages

In [1]:
import numpy as np
import pandas as pd

---
## 2. Load data

In [2]:
df = pd.read_csv('./client_data.csv')
df["date_activ"] = pd.to_datetime(df["date_activ"], format='%Y-%m-%d')
df["date_end"] = pd.to_datetime(df["date_end"], format='%Y-%m-%d')
df["date_modif_prod"] = pd.to_datetime(df["date_modif_prod"], format='%Y-%m-%d')
df["date_renewal"] = pd.to_datetime(df["date_renewal"], format='%Y-%m-%d')

In [3]:
df.head(3)

Unnamed: 0,id,channel_sales,cons_12m,cons_gas_12m,cons_last_month,date_activ,date_end,date_modif_prod,date_renewal,forecast_cons_12m,...,has_gas,imp_cons,margin_gross_pow_ele,margin_net_pow_ele,nb_prod_act,net_margin,num_years_antig,origin_up,pow_max,churn
0,24011ae4ebbe3035111d65fa7c15bc57,foosdfpfkusacimwkcsosbicdxkicaua,0,54946,0,2013-06-15,2016-06-15,2015-11-01,2015-06-23,0.0,...,t,0.0,25.44,25.44,2,678.99,3,lxidpiddsbxsbosboudacockeimpuepw,43.648,1
1,d29c2c54acc38ff3c0614d0a653813dd,MISSING,4660,0,0,2009-08-21,2016-08-30,2009-08-21,2015-08-31,189.95,...,f,0.0,16.38,16.38,1,18.89,6,kamkkxfxxuwbdslkwifmmcsiusiuosws,13.8,0
2,764c75f661154dac3a6c254cd082ea7d,foosdfpfkusacimwkcsosbicdxkicaua,544,0,0,2010-04-16,2016-04-16,2010-04-16,2015-04-17,47.96,...,f,0.0,28.6,28.6,1,6.6,6,kamkkxfxxuwbdslkwifmmcsiusiuosws,13.856,0


---

## 3. Feature engineering

In [4]:
price_df = pd.read_csv('price_data.csv')
price_df["price_date"] = pd.to_datetime(price_df["price_date"], format='%Y-%m-%d')
price_df.head()

Unnamed: 0,id,price_date,price_off_peak_var,price_peak_var,price_mid_peak_var,price_off_peak_fix,price_peak_fix,price_mid_peak_fix
0,038af19179925da21a25619c5a24b745,2015-01-01,0.151367,0.0,0.0,44.266931,0.0,0.0
1,038af19179925da21a25619c5a24b745,2015-02-01,0.151367,0.0,0.0,44.266931,0.0,0.0
2,038af19179925da21a25619c5a24b745,2015-03-01,0.151367,0.0,0.0,44.266931,0.0,0.0
3,038af19179925da21a25619c5a24b745,2015-04-01,0.149626,0.0,0.0,44.266931,0.0,0.0
4,038af19179925da21a25619c5a24b745,2015-05-01,0.149626,0.0,0.0,44.266931,0.0,0.0


In [5]:
price_df.isnull().sum()

id                    0
price_date            0
price_off_peak_var    0
price_peak_var        0
price_mid_peak_var    0
price_off_peak_fix    0
price_peak_fix        0
price_mid_peak_fix    0
dtype: int64

### Sum of price for energy and power along with the difference with respect to the period
The changes in the price from one period to another is a good indicator for churn. The expected thing to notice is that SME who churned will have a positive difference meaning that the prices for energy and power are getting high. The inverse is expected for SME who did not churn.

In [6]:
price_df['price_p1']=price_df['price_off_peak_var']+price_df['price_off_peak_fix']
price_df['price_p2']=price_df['price_peak_var']+price_df['price_peak_fix']
price_df['price_p3']=price_df['price_mid_peak_var']+price_df['price_mid_peak_fix']
price_df['pp12']=price_df['price_p2']-price_df['price_p1']
price_df['pp23']=price_df['price_p3']-price_df['price_p2']
price_df['pp13']=price_df['price_p3']-price_df['price_p1']

In [7]:
price_df.isnull().sum()

id                    0
price_date            0
price_off_peak_var    0
price_peak_var        0
price_mid_peak_var    0
price_off_peak_fix    0
price_peak_fix        0
price_mid_peak_fix    0
price_p1              0
price_p2              0
price_p3              0
pp12                  0
pp23                  0
pp13                  0
dtype: int64

### Difference between off-peak prices in December and preceding January

Below is the code created by your colleague to calculate the feature described above. Use this code to re-create this feature and then think about ways to build on this feature to create features with a higher predictive power.

In [8]:
# Group off-peak prices by companies and month
monthly_price_by_id = price_df.groupby(['id', 'price_date']).agg({'price_off_peak_var': 'mean', 'price_off_peak_fix': 'mean'}).reset_index()

# Get january and december prices
jan_prices = monthly_price_by_id.groupby('id').first().reset_index()
dec_prices = monthly_price_by_id.groupby('id').last().reset_index()

# Calculate the difference
diff_1 = pd.merge(dec_prices.rename(columns={'price_off_peak_var': 'dec_1', 'price_off_peak_fix': 'dec_2'}), jan_prices.rename(columns={'price_off_peak_var': 'jan_1', 'price_off_peak_fix': 'jan_2'}).drop(columns='price_date'), on='id')
diff_1['diff_dec_january_energy_p1'] = diff_1['dec_1'] - diff_1['jan_1']
diff_1['diff_dec_january_power_p1'] = diff_1['dec_2'] - diff_1['jan_2']
diff_1 = diff_1[['id', 'diff_dec_january_energy_p1','diff_dec_january_power_p1']]
diff_1.head()

Unnamed: 0,id,diff_dec_january_energy_p1,diff_dec_january_power_p1
0,0002203ffbb812588b632b9e628cc38d,-0.006192,0.162916
1,0004351ebdd665e6ee664792efc4fd13,-0.004104,0.177779
2,0010bcc39e42b3c2131ed2ce55246e3c,0.050443,1.5
3,0010ee3855fdea87602a5b7aba8e42de,-0.010018,0.162916
4,00114d74e963e47177db89bc70108537,-0.003994,-1e-06


### Difference between peak prices in December and preceding January

In [9]:
# Group peak prices by companies and month
monthly_price_by_id = price_df.groupby(['id', 'price_date']).agg({'price_peak_var': 'mean', 'price_peak_fix': 'mean'}).reset_index()

# Get january and december prices
jan_prices = monthly_price_by_id.groupby('id').first().reset_index()
dec_prices = monthly_price_by_id.groupby('id').last().reset_index()

# Calculate the difference
diff_2 = pd.merge(dec_prices.rename(columns={'price_peak_var': 'dec_1', 'price_peak_fix': 'dec_2'}), jan_prices.rename(columns={'price_peak_var': 'jan_1', 'price_peak_fix': 'jan_2'}).drop(columns='price_date'), on='id')
diff_2['diff_dec_january_energy_p2'] = diff_2['dec_1'] - diff_2['jan_1']
diff_2['diff_dec_january_power_p2'] = diff_2['dec_2'] - diff_2['jan_2']
diff_2 = diff_2[['id', 'diff_dec_january_energy_p2','diff_dec_january_power_p2']]
diff_2.head()

Unnamed: 0,id,diff_dec_january_energy_p2,diff_dec_january_power_p2
0,0002203ffbb812588b632b9e628cc38d,-0.002302,0.097749
1,0004351ebdd665e6ee664792efc4fd13,0.0,0.0
2,0010bcc39e42b3c2131ed2ce55246e3c,0.0,0.0
3,0010ee3855fdea87602a5b7aba8e42de,-0.00512,0.097749
4,00114d74e963e47177db89bc70108537,0.0,0.0


### Difference between mid-peak prices in December and preceding January

In [10]:
# Group off-peak prices by companies and month
monthly_price_by_id = price_df.groupby(['id', 'price_date']).agg({'price_mid_peak_var': 'mean', 'price_mid_peak_fix': 'mean'}).reset_index()

# Get january and december prices
jan_prices = monthly_price_by_id.groupby('id').first().reset_index()
dec_prices = monthly_price_by_id.groupby('id').last().reset_index()

# Calculate the difference
diff_3 = pd.merge(dec_prices.rename(columns={'price_mid_peak_var': 'dec_1', 'price_mid_peak_fix': 'dec_2'}), jan_prices.rename(columns={'price_mid_peak_var': 'jan_1', 'price_mid_peak_fix': 'jan_2'}).drop(columns='price_date'), on='id')
diff_3['diff_dec_january_energy_p3'] = diff_3['dec_1'] - diff_3['jan_1']
diff_3['diff_dec_january_power_p3'] = diff_3['dec_2'] - diff_3['jan_2']
diff_3 = diff_3[['id', 'diff_dec_january_energy_p3','diff_dec_january_power_p3']]
diff_3.head()

Unnamed: 0,id,diff_dec_january_energy_p3,diff_dec_january_power_p3
0,0002203ffbb812588b632b9e628cc38d,0.003487,0.065166
1,0004351ebdd665e6ee664792efc4fd13,0.0,0.0
2,0010bcc39e42b3c2131ed2ce55246e3c,0.0,0.0
3,0010ee3855fdea87602a5b7aba8e42de,0.000763,0.065166
4,00114d74e963e47177db89bc70108537,0.0,0.0


In [11]:
diff = pd.merge(pd.merge(diff_1, diff_2, on='id'), diff_3, on='id')
diff.head()

Unnamed: 0,id,diff_dec_january_energy_p1,diff_dec_january_power_p1,diff_dec_january_energy_p2,diff_dec_january_power_p2,diff_dec_january_energy_p3,diff_dec_january_power_p3
0,0002203ffbb812588b632b9e628cc38d,-0.006192,0.162916,-0.002302,0.097749,0.003487,0.065166
1,0004351ebdd665e6ee664792efc4fd13,-0.004104,0.177779,0.0,0.0,0.0,0.0
2,0010bcc39e42b3c2131ed2ce55246e3c,0.050443,1.5,0.0,0.0,0.0,0.0
3,0010ee3855fdea87602a5b7aba8e42de,-0.010018,0.162916,-0.00512,0.097749,0.000763,0.065166
4,00114d74e963e47177db89bc70108537,-0.003994,-1e-06,0.0,0.0,0.0,0.0


### Tenure: The duration an SME has been a client for PowerCo
Tenure is a very important factor for predicting churn. it is the duration of business between the client and PowerCo.

In [12]:
df['tenure'] = (df['date_end']-df['date_activ']).dt.days/365
df['tenure']

0        3.002740
1        7.030137
2        6.005479
3        6.005479
4        6.150685
           ...   
14601    3.958904
14602    4.002740
14603    4.000000
14604    4.002740
14605    7.002740
Name: tenure, Length: 14606, dtype: float64

### The deviation of last month consumption relative to the mean consumption for one month
This feature aims at detecting any change of pattern in the consumption of the client for the last month.

In [13]:
df['cons_dev']=(df['cons_12m']/12)-df['cons_last_month']
df['cons_dev'].describe()

count     14606.000000
mean      -2821.912564
std       21686.353629
min     -449656.000000
25%        -496.312500
50%         105.125000
75%         757.375000
max      194525.250000
Name: cons_dev, dtype: float64

In [14]:
df['cons_dev'].isnull().sum()

0

### Ratio of consumption for the next year compared to the current year
This feature shows whether the client is expected to have a larger or lower demand for energy and power in the future compared to his current consumption.

In [15]:
df['cons_pattern']=df['forecast_cons_12m']/df['cons_12m']
df['cons_pattern'].describe()

  sqr = _ensure_numeric((avg - values) ** 2)


count    1.450600e+04
mean              inf
std               NaN
min      0.000000e+00
25%      4.496907e-02
50%      1.024782e-01
75%      1.475232e-01
max               inf
Name: cons_pattern, dtype: float64

There are extremely large values. This issue is resolved in the following:

In [16]:
def handleInf(x):
    if x==float('-inf') or x==float('inf'):
        return 0
    else:
        return x

In [17]:
df.cons_pattern=df.cons_pattern.apply(handleInf)

In [18]:
df['cons_pattern'].describe()

count    14506.000000
mean         0.093761
std          0.054838
min          0.000000
25%          0.044747
50%          0.102414
75%          0.147496
max          0.624622
Name: cons_pattern, dtype: float64

In [19]:
df['cons_pattern'].isnull().sum()

100

There are a 100 missing values! Let's see what all of that is about

In [20]:
df[df['cons_pattern'].isnull()][['forecast_cons_12m', 'cons_12m', 'cons_pattern']]

Unnamed: 0,forecast_cons_12m,cons_12m,cons_pattern
0,0.0,0,
135,0.0,0,
268,0.0,0,
344,0.0,0,
392,0.0,0,
...,...,...,...
13791,0.0,0,
13899,0.0,0,
13999,0.0,0,
14144,0.0,0,


It's a division by 0. Missing values will be replaced by 0.

In [21]:
df['cons_pattern'] = df['cons_pattern'].fillna(0)


In [22]:
df.head()

Unnamed: 0,id,channel_sales,cons_12m,cons_gas_12m,cons_last_month,date_activ,date_end,date_modif_prod,date_renewal,forecast_cons_12m,...,margin_net_pow_ele,nb_prod_act,net_margin,num_years_antig,origin_up,pow_max,churn,tenure,cons_dev,cons_pattern
0,24011ae4ebbe3035111d65fa7c15bc57,foosdfpfkusacimwkcsosbicdxkicaua,0,54946,0,2013-06-15,2016-06-15,2015-11-01,2015-06-23,0.0,...,25.44,2,678.99,3,lxidpiddsbxsbosboudacockeimpuepw,43.648,1,3.00274,0.0,0.0
1,d29c2c54acc38ff3c0614d0a653813dd,MISSING,4660,0,0,2009-08-21,2016-08-30,2009-08-21,2015-08-31,189.95,...,16.38,1,18.89,6,kamkkxfxxuwbdslkwifmmcsiusiuosws,13.8,0,7.030137,388.333333,0.040762
2,764c75f661154dac3a6c254cd082ea7d,foosdfpfkusacimwkcsosbicdxkicaua,544,0,0,2010-04-16,2016-04-16,2010-04-16,2015-04-17,47.96,...,28.6,1,6.6,6,kamkkxfxxuwbdslkwifmmcsiusiuosws,13.856,0,6.005479,45.333333,0.088162
3,bba03439a292a1e166f80264c16191cb,lmkebamcaaclubfxadlmueccxoimlema,1584,0,0,2010-03-30,2016-03-30,2010-03-30,2015-03-31,240.04,...,30.22,1,25.46,6,kamkkxfxxuwbdslkwifmmcsiusiuosws,13.2,0,6.005479,132.0,0.15154
4,149d57cf92fc41cf94415803a877cb4b,MISSING,4425,0,526,2010-01-13,2016-03-07,2010-01-13,2015-03-09,445.75,...,44.91,1,47.98,6,kamkkxfxxuwbdslkwifmmcsiusiuosws,19.8,0,6.150685,-157.25,0.100734


Date features will dropped as they are summarized in a more representative feature that is `tenure`.

In [23]:
train = pd.merge(price_df.drop(['price_date', 'price_off_peak_var', 'price_off_peak_fix', 'price_peak_var',
                                'price_peak_fix', 'price_mid_peak_var', 'price_mid_peak_fix'], axis=1), 
                 df.drop(['date_activ', 'date_end', 'date_modif_prod', 'date_renewal'], axis=1), on='id')
train.head()

Unnamed: 0,id,price_p1,price_p2,price_p3,pp12,pp23,pp13,channel_sales,cons_12m,cons_gas_12m,...,margin_net_pow_ele,nb_prod_act,net_margin,num_years_antig,origin_up,pow_max,churn,tenure,cons_dev,cons_pattern
0,038af19179925da21a25619c5a24b745,44.418298,0.0,0.0,-44.418298,0.0,-44.418298,foosdfpfkusacimwkcsosbicdxkicaua,3576,0,...,21.52,1,52.53,3,ldkssxwpmemidmecebumciepifcamkci,13.2,0,3.00274,-332.0,0.14863
1,038af19179925da21a25619c5a24b745,44.418298,0.0,0.0,-44.418298,0.0,-44.418298,foosdfpfkusacimwkcsosbicdxkicaua,3576,0,...,21.52,1,52.53,3,ldkssxwpmemidmecebumciepifcamkci,13.2,0,3.00274,-332.0,0.14863
2,038af19179925da21a25619c5a24b745,44.418298,0.0,0.0,-44.418298,0.0,-44.418298,foosdfpfkusacimwkcsosbicdxkicaua,3576,0,...,21.52,1,52.53,3,ldkssxwpmemidmecebumciepifcamkci,13.2,0,3.00274,-332.0,0.14863
3,038af19179925da21a25619c5a24b745,44.416557,0.0,0.0,-44.416557,0.0,-44.416557,foosdfpfkusacimwkcsosbicdxkicaua,3576,0,...,21.52,1,52.53,3,ldkssxwpmemidmecebumciepifcamkci,13.2,0,3.00274,-332.0,0.14863
4,038af19179925da21a25619c5a24b745,44.416557,0.0,0.0,-44.416557,0.0,-44.416557,foosdfpfkusacimwkcsosbicdxkicaua,3576,0,...,21.52,1,52.53,3,ldkssxwpmemidmecebumciepifcamkci,13.2,0,3.00274,-332.0,0.14863


In [24]:
train = pd.merge(train, diff, on='id')
train.head()

Unnamed: 0,id,price_p1,price_p2,price_p3,pp12,pp23,pp13,channel_sales,cons_12m,cons_gas_12m,...,churn,tenure,cons_dev,cons_pattern,diff_dec_january_energy_p1,diff_dec_january_power_p1,diff_dec_january_energy_p2,diff_dec_january_power_p2,diff_dec_january_energy_p3,diff_dec_january_power_p3
0,038af19179925da21a25619c5a24b745,44.418298,0.0,0.0,-44.418298,0.0,-44.418298,foosdfpfkusacimwkcsosbicdxkicaua,3576,0,...,0,3.00274,-332.0,0.14863,-0.005508,0.177779,0.0,0.0,0.0,0.0
1,038af19179925da21a25619c5a24b745,44.418298,0.0,0.0,-44.418298,0.0,-44.418298,foosdfpfkusacimwkcsosbicdxkicaua,3576,0,...,0,3.00274,-332.0,0.14863,-0.005508,0.177779,0.0,0.0,0.0,0.0
2,038af19179925da21a25619c5a24b745,44.418298,0.0,0.0,-44.418298,0.0,-44.418298,foosdfpfkusacimwkcsosbicdxkicaua,3576,0,...,0,3.00274,-332.0,0.14863,-0.005508,0.177779,0.0,0.0,0.0,0.0
3,038af19179925da21a25619c5a24b745,44.416557,0.0,0.0,-44.416557,0.0,-44.416557,foosdfpfkusacimwkcsosbicdxkicaua,3576,0,...,0,3.00274,-332.0,0.14863,-0.005508,0.177779,0.0,0.0,0.0,0.0
4,038af19179925da21a25619c5a24b745,44.416557,0.0,0.0,-44.416557,0.0,-44.416557,foosdfpfkusacimwkcsosbicdxkicaua,3576,0,...,0,3.00274,-332.0,0.14863,-0.005508,0.177779,0.0,0.0,0.0,0.0


## Modelling
Now, the models are almost ready to be trained. First we import the libraries.

In [25]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, f1_score, precision_score, make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb

In [26]:
train.shape

(175149, 37)

In [27]:
train.drop_duplicates(inplace=True)
train.shape

(56369, 37)

In [28]:
y = train['churn']
X = train.drop(['churn'], axis=1).set_index('id')
X.dtypes

price_p1                          float64
price_p2                          float64
price_p3                          float64
pp12                              float64
pp23                              float64
pp13                              float64
channel_sales                      object
cons_12m                            int64
cons_gas_12m                        int64
cons_last_month                     int64
forecast_cons_12m                 float64
forecast_cons_year                  int64
forecast_discount_energy          float64
forecast_meter_rent_12m           float64
forecast_price_energy_off_peak    float64
forecast_price_energy_peak        float64
forecast_price_pow_off_peak       float64
has_gas                            object
imp_cons                          float64
margin_gross_pow_ele              float64
margin_net_pow_ele                float64
nb_prod_act                         int64
net_margin                        float64
num_years_antig                   

In [29]:
X['has_gas'] = X['has_gas'].map({'f': 0, 't': 1})

`channel_sales` & `origin_up` are encoded using OneHotEncoder.

In [30]:
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [6,24])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

We split the data into training and test sets of sizes 75% and 25% respectively. Making sure that the target variable is similarly distributed in both sets.

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y)

In [32]:
X_train.shape, X_test.shape

((42276, 47), (14093, 47))

In [33]:
y_train.shape, y_test.shape

((42276,), (14093,))

---
The following function will be used to evaluate the model performance. The evaluation metrics chosen are:
- **accuracy:** however it is not representative for the model performance as predicting all 0's will give an accuracy above 90%. This is why the next evaluation metrics are useed.
- **Precision**: it indicates how many of the predicted as `churned` are actually true.
- **recall**: it explains how many of the actual positive cases we were able to detect with our model.
- **f1 score**: It gives a combined idea about Precision and Recall metrics. It is maximum when Precision is equal to Recall.
- **ROC AUC**: The Receiver Operator Characteristic (ROC) is a probability curve that plots the TPR(True Positive Rate) against the FPR(False Positive Rate) at various threshold values and separates the ‘signal’ from the ‘noise’. The Area Under the Curve (AUC) is the measure of the ability of a classifier to distinguish between classes. 

In [34]:
def evaluate(model_, X_test_, y_test_):
    y_pred = model_.predict(X_test_)

    results = pd.DataFrame({"Accuracy"  : [accuracy_score(y_test_, y_pred)],
                            "Precision" : [precision_score(y_test_, y_pred)],
                            "Recall"    : [recall_score(y_test_, y_pred)],
                            "f1"        : [f1_score(y_test_, y_pred)],
                            "ROC AUC"   : [roc_auc_score(y_test, y_pred)]})
    return results

The model used is `XGBClassifier`: it is an ensemble learning method that combines the predictions of multiple weak models to produce a strong prediction. The weak models in XGBoost are decision trees, which are trained using gradient boosting. This means that at each iteration, the algorithm fits a decision tree to the residuals of the previous iteration.

Once the decision trees have been trained, XGBoost makes predictions by combining the predictions of all the trees using a weighted average. The weights for each tree are learned during training using the same objective function. This allows the algorithm to automatically learn which trees are more important and should be given more weight in the final prediction.

We used `GridSearchCV` for hyper parameter tuning.

In [35]:
parameters = {'n_estimators': [256, 512, 1024], 'max_depth': [6, 8, 12], 'learning_rate': [0.03, 0.1]}
model = xgb.XGBClassifier(n_jobs=-1, random_state=44, eval_metric="mlogloss")
gs = GridSearchCV(model, parameters, scoring='f1', cv=5)
gs.fit(X_train, y_train)

In [36]:
pd.DataFrame(gs.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.674786,0.117891,0.031723,0.003105,0.03,6,256,"{'learning_rate': 0.03, 'max_depth': 6, 'n_est...",0.252101,0.255765,0.223404,0.218182,0.237288,0.237348,0.014957,18
1,2.9498,0.127508,0.041943,0.001212,0.03,6,512,"{'learning_rate': 0.03, 'max_depth': 6, 'n_est...",0.442991,0.448372,0.398849,0.456984,0.445896,0.438618,0.020426,17
2,5.596678,0.216057,0.072586,0.009777,0.03,6,1024,"{'learning_rate': 0.03, 'max_depth': 6, 'n_est...",0.687648,0.69538,0.653258,0.699454,0.679365,0.683021,0.016391,14
3,2.43828,0.039297,0.039726,0.008148,0.03,8,256,"{'learning_rate': 0.03, 'max_depth': 8, 'n_est...",0.512086,0.495928,0.44548,0.485507,0.5125,0.4903,0.024621,16
4,4.379186,0.058519,0.054342,0.002471,0.03,8,512,"{'learning_rate': 0.03, 'max_depth': 8, 'n_est...",0.732673,0.722521,0.687598,0.745304,0.729916,0.723603,0.019445,13
5,8.276432,0.045333,0.094783,0.007158,0.03,8,1024,"{'learning_rate': 0.03, 'max_depth': 8, 'n_est...",0.897283,0.876861,0.882823,0.897419,0.895542,0.889986,0.00852,9
6,4.810893,0.136408,0.055963,0.010007,0.03,12,256,"{'learning_rate': 0.03, 'max_depth': 12, 'n_es...",0.830986,0.824113,0.816442,0.843206,0.853809,0.833711,0.013355,12
7,8.067745,0.154111,0.088966,0.011399,0.03,12,512,"{'learning_rate': 0.03, 'max_depth': 12, 'n_es...",0.942675,0.921977,0.91521,0.932476,0.934353,0.929338,0.009661,8
8,13.814154,0.112551,0.150713,0.009926,0.03,12,1024,"{'learning_rate': 0.03, 'max_depth': 12, 'n_es...",0.968983,0.956576,0.94598,0.959248,0.956956,0.957548,0.007332,4
9,1.675408,0.062181,0.03367,0.003452,0.1,6,256,"{'learning_rate': 0.1, 'max_depth': 6, 'n_esti...",0.613466,0.657005,0.611804,0.644481,0.643379,0.634027,0.018118,15


In [37]:
gs.best_score_

0.9599708243882745

In [38]:
gs.best_params_

{'learning_rate': 0.1, 'max_depth': 12, 'n_estimators': 1024}

In [39]:
evaluate(gs, X_test, y_test)

Unnamed: 0,Accuracy,Precision,Recall,f1,ROC AUC
0,0.998368,1.0,0.983369,0.991615,0.991685


As can be seen, the results on the test set are excellent. As 100% (precision) of the clients predicted to churn did actually churn. 