# Feature Engineering

---

1. Import packages
2. Load data
3. Feature engineering

---

## 1. Import packages

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

---
## 2. Load data

In [2]:
# Read the client-level dataset from CSV into client_df
client_df = pd.read_csv('client_data_after_eda.csv')

In [3]:
# Preview the first three client rows
client_df.head(3)

Unnamed: 0,id,channel_sales,cons_12m,cons_gas_12m,cons_last_month,date_activ,date_end,date_modif_prod,date_renewal,forecast_cons_12m,...,has_gas,imp_cons,margin_gross_pow_ele,margin_net_pow_ele,nb_prod_act,net_margin,num_years_antig,origin_up,pow_max,churn
0,24011ae4ebbe3035111d65fa7c15bc57,foosdfpfkusacimwkcsosbicdxkicaua,0,54946,0,2013-06-15,2016-06-15,2015-11-01,2015-06-23,0.0,...,t,0.0,25.44,25.44,2,678.99,3,lxidpiddsbxsbosboudacockeimpuepw,43.648,1
1,d29c2c54acc38ff3c0614d0a653813dd,MISSING,4660,0,0,2009-08-21,2016-08-30,2009-08-21,2015-08-31,189.95,...,f,0.0,16.38,16.38,1,18.89,6,kamkkxfxxuwbdslkwifmmcsiusiuosws,13.8,0
2,764c75f661154dac3a6c254cd082ea7d,foosdfpfkusacimwkcsosbicdxkicaua,544,0,0,2010-04-16,2016-04-16,2010-04-16,2015-04-17,47.96,...,f,0.0,28.6,28.6,1,6.6,6,kamkkxfxxuwbdslkwifmmcsiusiuosws,13.856,0


In [4]:
# Read the price history and parse price_date (ISO 'YYYY-MM-DD' strings) into datetimes
price_df = pd.read_csv('price_data_after_eda.csv').assign(
    price_date=lambda frame: pd.to_datetime(frame["price_date"], format='%Y-%m-%d')
)
price_df.head(3)

Unnamed: 0,id,price_date,price_off_peak_var,price_peak_var,price_mid_peak_var,price_off_peak_fix,price_peak_fix,price_mid_peak_fix
0,038af19179925da21a25619c5a24b745,2015-01-01,0.151367,0.0,0.0,44.266931,0.0,0.0
1,038af19179925da21a25619c5a24b745,2015-02-01,0.151367,0.0,0.0,44.266931,0.0,0.0
2,038af19179925da21a25619c5a24b745,2015-03-01,0.151367,0.0,0.0,44.266931,0.0,0.0


In [5]:
# (rows, columns) of the price table as loaded
price_df.shape

(193002, 8)

---

## 3. Feature engineering

### Difference between off-peak prices in December and preceding January

Below is the code created by your colleague to calculate the feature described above. Use this code to re-create this feature and then think about ways to build on this feature to create features with a higher predictive power.

In [6]:
# Average every price column per client (id) and price date.
# as_index=False keeps 'id' and 'price_date' as ordinary leading columns.
monthly_price_by_id = price_df.groupby(['id', 'price_date'], as_index=False).mean()
monthly_price_by_id

Unnamed: 0,id,price_date,price_off_peak_var,price_peak_var,price_mid_peak_var,price_off_peak_fix,price_peak_fix,price_mid_peak_fix
0,0002203ffbb812588b632b9e628cc38d,2015-01-01,0.126098,0.103975,0.070232,40.565969,24.339581,16.226389
1,0002203ffbb812588b632b9e628cc38d,2015-02-01,0.126098,0.103975,0.070232,40.565969,24.339581,16.226389
2,0002203ffbb812588b632b9e628cc38d,2015-03-01,0.128067,0.105842,0.073773,40.728885,24.437330,16.291555
3,0002203ffbb812588b632b9e628cc38d,2015-04-01,0.128067,0.105842,0.073773,40.728885,24.437330,16.291555
4,0002203ffbb812588b632b9e628cc38d,2015-05-01,0.128067,0.105842,0.073773,40.728885,24.437330,16.291555
...,...,...,...,...,...,...,...,...
192997,ffff7fa066f1fb305ae285bb03bf325a,2015-08-01,0.119916,0.102232,0.076257,40.728885,24.437330,16.291555
192998,ffff7fa066f1fb305ae285bb03bf325a,2015-09-01,0.119916,0.102232,0.076257,40.728885,24.437330,16.291555
192999,ffff7fa066f1fb305ae285bb03bf325a,2015-10-01,0.119916,0.102232,0.076257,40.728885,24.437330,16.291555
193000,ffff7fa066f1fb305ae285bb03bf325a,2015-11-01,0.119916,0.102232,0.076257,40.728885,24.437330,16.291555


In [7]:
# First and last monthly record per client. monthly_price_by_id is the result of a
# groupby on (id, price_date), so rows are ordered by date within each id.
# NOTE(review): first()/last() return the first/last available month for a client,
# which is January/December only when the client has a full year of records.
prices_per_client = monthly_price_by_id.groupby('id')
jan_prices = prices_per_client.first().reset_index()
dec_prices = prices_per_client.last().reset_index()

# Price change over the period: last month minus first month.
# Columns 0 and 1 are 'id' and 'price_date', so only the price columns are subtracted.
diff = (dec_prices.iloc[:, 2:] - jan_prices.iloc[:, 2:]).add_prefix('diff_')

# Put the client id back as the leading column
diff.insert(loc=0, column='id', value=dec_prices.id)

In [8]:
# Also calculate the mean prices & variance in prices for the period

# Aggregate only the price columns. 'price_date' is a datetime column: older pandas
# silently dropped it from mean()/var(), but pandas >= 2.0 no longer drops
# non-numeric columns (var() raises TypeError on datetimes), so the columns to
# aggregate are selected explicitly.
price_cols = [col for col in monthly_price_by_id.columns if col not in ('id', 'price_date')]
prices_by_id = monthly_price_by_id.groupby('id')[price_cols]

# reset_index(drop=True) discards the id index and leaves a 0..n-1 index; the rows
# remain in sorted-id order (groupby sorts its keys by default).
mean_prices = prices_by_id.mean().reset_index(drop=True)
col_list = ['mean_'+col for col in mean_prices.columns]
mean_prices.columns = col_list

var_prices = prices_by_id.var().reset_index(drop=True)
col_list = ['var_'+col for col in var_prices.columns]
var_prices.columns = col_list

In [9]:
# Place the mean_* and var_* feature columns side by side
feature_frames = [mean_prices, var_prices]
mean_var = pd.concat(feature_frames, axis=1)

# Prepend the diff_* features (which carry the 'id' column).
# Note: this rebinds price_df from the monthly price table to the per-client feature table.
price_df = pd.concat([diff, mean_var], axis=1)
price_df

Unnamed: 0,id,diff_price_off_peak_var,diff_price_peak_var,diff_price_mid_peak_var,diff_price_off_peak_fix,diff_price_peak_fix,diff_price_mid_peak_fix,mean_price_off_peak_var,mean_price_peak_var,mean_price_mid_peak_var,mean_price_off_peak_fix,mean_price_peak_fix,mean_price_mid_peak_fix,var_price_off_peak_var,var_price_peak_var,var_price_mid_peak_var,var_price_off_peak_fix,var_price_peak_fix,var_price_mid_peak_fix
0,0002203ffbb812588b632b9e628cc38d,-0.006192,-0.002302,0.003487,0.162916,0.097749,0.065166,0.124338,0.103794,0.073160,40.701732,24.421038,16.280694,0.000016,0.000004,1.871602e-06,4.021438e-03,0.001448,0.000643
1,0004351ebdd665e6ee664792efc4fd13,-0.004104,0.000000,0.000000,0.177779,0.000000,0.000000,0.146426,0.000000,0.000000,44.385450,0.000000,0.000000,0.000005,0.000000,0.000000e+00,7.661891e-03,0.000000,0.000000
2,0010bcc39e42b3c2131ed2ce55246e3c,0.050443,0.000000,0.000000,1.500000,0.000000,0.000000,0.181558,0.000000,0.000000,45.319710,0.000000,0.000000,0.000676,0.000000,0.000000e+00,5.965909e-01,0.000000,0.000000
3,0010ee3855fdea87602a5b7aba8e42de,-0.010018,-0.005120,0.000763,0.162916,0.097749,0.065166,0.118757,0.098292,0.069032,40.647427,24.388455,16.258971,0.000025,0.000007,1.627620e-07,7.238536e-03,0.002606,0.001158
4,00114d74e963e47177db89bc70108537,-0.003994,0.000000,0.000000,-0.000001,0.000000,0.000000,0.147926,0.000000,0.000000,44.266930,0.000000,0.000000,0.000005,0.000000,0.000000e+00,3.490909e-13,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16091,ffef185810e44254c3a4c6395e6b4d8a,-0.050232,-0.038788,-0.022735,-0.335085,-0.400251,-0.432834,0.138863,0.115125,0.080780,40.896427,24.637456,16.507972,0.000688,0.000422,1.563148e-04,3.062232e-02,0.043691,0.051094
16092,fffac626da707b1b5ab11e8431a4d0a2,-0.003778,0.000000,0.000000,0.177779,0.000000,0.000000,0.147137,0.000000,0.000000,44.311375,0.000000,0.000000,0.000004,0.000000,0.000000e+00,6.464760e-03,0.000000,0.000000
16093,fffc0cacd305dd51f316424bbb08d1bd,-0.001760,-0.003707,-0.007326,0.164916,0.099749,0.067166,0.153879,0.129497,0.094842,41.160171,24.895768,16.763569,0.000009,0.000006,1.857770e-05,7.211360e-03,0.002638,0.001196
16094,fffe4f5646aa39c7f97f95ae2679ce64,-0.009391,-0.004937,0.001029,0.162916,0.097749,0.065166,0.123858,0.103499,0.073735,40.606699,24.364017,16.242678,0.000021,0.000006,2.220744e-07,5.428835e-03,0.001954,0.000869


In [10]:
# Count missing values in each column of the price feature table
price_df.isnull().sum()

id                         0
diff_price_off_peak_var    0
diff_price_peak_var        0
diff_price_mid_peak_var    0
diff_price_off_peak_fix    0
diff_price_peak_fix        0
diff_price_mid_peak_fix    0
mean_price_off_peak_var    0
mean_price_peak_var        0
mean_price_mid_peak_var    0
mean_price_off_peak_fix    0
mean_price_peak_fix        0
mean_price_mid_peak_fix    0
var_price_off_peak_var     0
var_price_peak_var         0
var_price_mid_peak_var     0
var_price_off_peak_fix     0
var_price_peak_fix         0
var_price_mid_peak_fix     0
dtype: int64

In [11]:
# Preview the first five client rows
client_df.head()

Unnamed: 0,id,channel_sales,cons_12m,cons_gas_12m,cons_last_month,date_activ,date_end,date_modif_prod,date_renewal,forecast_cons_12m,...,has_gas,imp_cons,margin_gross_pow_ele,margin_net_pow_ele,nb_prod_act,net_margin,num_years_antig,origin_up,pow_max,churn
0,24011ae4ebbe3035111d65fa7c15bc57,foosdfpfkusacimwkcsosbicdxkicaua,0,54946,0,2013-06-15,2016-06-15,2015-11-01,2015-06-23,0.0,...,t,0.0,25.44,25.44,2,678.99,3,lxidpiddsbxsbosboudacockeimpuepw,43.648,1
1,d29c2c54acc38ff3c0614d0a653813dd,MISSING,4660,0,0,2009-08-21,2016-08-30,2009-08-21,2015-08-31,189.95,...,f,0.0,16.38,16.38,1,18.89,6,kamkkxfxxuwbdslkwifmmcsiusiuosws,13.8,0
2,764c75f661154dac3a6c254cd082ea7d,foosdfpfkusacimwkcsosbicdxkicaua,544,0,0,2010-04-16,2016-04-16,2010-04-16,2015-04-17,47.96,...,f,0.0,28.6,28.6,1,6.6,6,kamkkxfxxuwbdslkwifmmcsiusiuosws,13.856,0
3,bba03439a292a1e166f80264c16191cb,lmkebamcaaclubfxadlmueccxoimlema,1584,0,0,2010-03-30,2016-03-30,2010-03-30,2015-03-31,240.04,...,f,0.0,30.22,30.22,1,25.46,6,kamkkxfxxuwbdslkwifmmcsiusiuosws,13.2,0
4,149d57cf92fc41cf94415803a877cb4b,MISSING,4425,0,526,2010-01-13,2016-03-07,2010-01-13,2015-03-09,445.75,...,f,52.32,44.91,44.91,1,47.98,6,kamkkxfxxuwbdslkwifmmcsiusiuosws,19.8,0


In [12]:
# One-hot encode the two categorical client attributes:
# one indicator column per distinct value of channel_sales and of origin_up
dummies_channel_sales, dummies_origin_up = (
    pd.get_dummies(client_df[column]) for column in ('channel_sales', 'origin_up')
)

In [13]:
# Drop the 'MISSING' indicator; rows whose channel is missing are then all zeros.
# The axis is given by keyword (columns=...): passing it positionally, as in
# drop('MISSING', 1), is no longer accepted by pandas >= 2.0.
dummies_channel_sales = dummies_channel_sales.drop(columns='MISSING')

# simplifying column names: 'channel_' + first four characters of the category code

col_list = ['channel_'+col[:4] for col in dummies_channel_sales.columns]
dummies_channel_sales.columns = col_list
dummies_channel_sales

Unnamed: 0,channel_epum,channel_ewpa,channel_fixd,channel_foos,channel_lmke,channel_sddi,channel_usil
0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0
3,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...
14601,0,0,0,1,0,0,0
14602,0,0,0,1,0,0,0
14603,0,0,0,1,0,0,0
14604,0,0,0,1,0,0,0


In [14]:
# Drop the 'MISSING' indicator; rows whose origin is missing are then all zeros.
# The axis is given by keyword (columns=...): passing it positionally, as in
# drop('MISSING', 1), is no longer accepted by pandas >= 2.0.
dummies_origin_up = dummies_origin_up.drop(columns='MISSING')

# simplifying column names: 'origin_' + first four characters of the category code

col_list = ['origin_'+col[:4] for col in dummies_origin_up.columns]
dummies_origin_up.columns = col_list
dummies_origin_up


# The raw categorical columns are replaced by their one-hot encodings,
# so remove them from the client table.
client_df = client_df.drop(columns=['channel_sales', 'origin_up'])

Unnamed: 0,origin_ewxe,origin_kamk,origin_ldks,origin_lxid,origin_usap
0,0,0,0,1,0
1,0,1,0,0,0
2,0,1,0,0,0
3,0,1,0,0,0
4,0,1,0,0,0
...,...,...,...,...,...
14601,0,0,0,1,0
14602,0,0,0,1,0
14603,0,0,0,1,0
14604,0,0,0,1,0


In [17]:
# Encode has_gas ('f'/'t') as 0/1. The result is assigned back to the column rather
# than calling replace(..., inplace=True) on client_df.has_gas, because an in-place
# call on an attribute-accessed Series is not guaranteed to write through to the frame.
client_df['has_gas'] = client_df['has_gas'].replace({'f': 0, 't': 1}).astype('int64')

# convert date columns into just year data
# Each column is parsed from its own values. Previously date_renewal was built from
# date_modif_prod, which overwrote the renewal dates with the product-modification dates.
for col in ['date_activ', 'date_end', 'date_modif_prod', 'date_renewal']:
    client_df[col] = pd.to_datetime(client_df[col], format='%Y-%m-%d').dt.year

In [21]:
# Append the one-hot indicator columns to the client table (column-wise, aligned on the row index)
encoded_parts = [client_df, dummies_channel_sales, dummies_origin_up]
client_df = pd.concat(encoded_parts, axis=1)
client_df

Unnamed: 0,id,cons_12m,cons_gas_12m,cons_last_month,date_activ,date_end,date_modif_prod,date_renewal,forecast_cons_12m,forecast_cons_year,...,channel_fixd,channel_foos,channel_lmke,channel_sddi,channel_usil,origin_ewxe,origin_kamk,origin_ldks,origin_lxid,origin_usap
0,24011ae4ebbe3035111d65fa7c15bc57,0,54946,0,2013,2016,2015,2015,0.00,0,...,0,1,0,0,0,0,0,0,1,0
1,d29c2c54acc38ff3c0614d0a653813dd,4660,0,0,2009,2016,2009,2015,189.95,0,...,0,0,0,0,0,0,1,0,0,0
2,764c75f661154dac3a6c254cd082ea7d,544,0,0,2010,2016,2010,2015,47.96,0,...,0,1,0,0,0,0,1,0,0,0
3,bba03439a292a1e166f80264c16191cb,1584,0,0,2010,2016,2010,2015,240.04,0,...,0,0,1,0,0,0,1,0,0,0
4,149d57cf92fc41cf94415803a877cb4b,4425,0,526,2010,2016,2010,2015,445.75,526,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14601,18463073fb097fc0ac5d3e040f356987,32270,47940,0,2012,2016,2015,2014,4648.01,0,...,0,1,0,0,0,0,0,0,1,0
14602,d0a6f71671571ed83b2645d23af6de00,7223,0,181,2012,2016,2012,2015,631.69,181,...,0,1,0,0,0,0,0,0,1,0
14603,10e6828ddd62cbcf687cb74928c4c2d2,1844,0,179,2012,2016,2012,2015,190.39,179,...,0,1,0,0,0,0,0,0,1,0
14604,1cf20fd6206d7678d5bcafd28c53b4db,131,0,0,2012,2016,2012,2015,19.34,0,...,0,1,0,0,0,0,0,0,1,0


In [22]:
# Attach the engineered price features to each client.
# Left join on 'id': every client row is kept, with or without a matching price row.
client_prices = client_df.merge(price_df, on='id', how='left')
client_prices

Unnamed: 0,id,cons_12m,cons_gas_12m,cons_last_month,date_activ,date_end,date_modif_prod,date_renewal,forecast_cons_12m,forecast_cons_year,...,mean_price_mid_peak_var,mean_price_off_peak_fix,mean_price_peak_fix,mean_price_mid_peak_fix,var_price_off_peak_var,var_price_peak_var,var_price_mid_peak_var,var_price_off_peak_fix,var_price_peak_fix,var_price_mid_peak_fix
0,24011ae4ebbe3035111d65fa7c15bc57,0,54946,0,2013,2016,2015,2015,0.00,0,...,0.066530,40.942265,22.352010,14.901340,0.000061,2.627605e-05,4.402763e-04,1.102785e+00,49.550703,22.022535
1,d29c2c54acc38ff3c0614d0a653813dd,4660,0,0,2009,2016,2009,2015,189.95,0,...,0.000000,44.311375,0.000000,0.000000,0.000005,6.089453e-04,0.000000e+00,6.464779e-03,0.000000,0.000000
2,764c75f661154dac3a6c254cd082ea7d,544,0,0,2010,2016,2010,2015,47.96,0,...,0.000000,44.385450,0.000000,0.000000,0.000006,2.558511e-07,0.000000e+00,7.661891e-03,0.000000,0.000000
3,bba03439a292a1e166f80264c16191cb,1584,0,0,2010,2016,2010,2015,240.04,0,...,0.000000,44.400265,0.000000,0.000000,0.000005,0.000000e+00,0.000000e+00,6.464721e-03,0.000000,0.000000
4,149d57cf92fc41cf94415803a877cb4b,4425,0,526,2010,2016,2010,2015,445.75,526,...,0.072865,40.688156,24.412893,16.275263,0.000015,3.552481e-06,2.522046e-06,5.428942e-03,0.001954,0.000869
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14601,18463073fb097fc0ac5d3e040f356987,32270,47940,0,2012,2016,2015,2014,4648.01,0,...,0.000000,44.370635,0.000000,0.000000,0.000015,0.000000e+00,0.000000e+00,8.380216e-03,0.000000,0.000000
14602,d0a6f71671571ed83b2645d23af6de00,7223,0,181,2012,2016,2012,2015,631.69,181,...,0.070817,59.015674,36.393379,8.345418,0.000013,3.472391e-06,1.638788e-08,1.358132e-02,0.005165,0.000272
14603,10e6828ddd62cbcf687cb74928c4c2d2,1844,0,179,2012,2016,2012,2015,190.39,179,...,0.073160,40.701732,24.421038,16.280694,0.000016,3.957295e-06,1.871602e-06,4.021438e-03,0.001448,0.000643
14604,1cf20fd6206d7678d5bcafd28c53b4db,131,0,0,2012,2016,2012,2015,19.34,0,...,0.000000,44.311375,0.000000,0.000000,0.000005,6.089453e-04,0.000000e+00,6.464769e-03,0.000000,0.000000


In [16]:
# OUTLIER REMOVAL

# using z score method
# from scipy.stats import zscore

# using iqr method
#
# Only continuous numeric features are screened. The target ('churn') and 0/1
# indicator columns are excluded: for a binary column whose minority value occurs in
# fewer than 25% of rows, q1 == q3 so the IQR is 0 and every minority row is flagged.
# Applied to 'churn', that removed every churned client from the result.
# Columns are chosen by dtype / cardinality instead of iloc[:, 2:], which also
# skipped 'cons_12m' (column 1) without saying so.
IQR_MULTIPLIER = 1.5  # width of the fences, in IQRs, beyond q1 / q3

numeric_features = client_prices.select_dtypes(include='number').drop(columns='churn', errors='ignore')
continuous_cols = [col for col in numeric_features.columns if numeric_features[col].nunique() > 2]
screened = numeric_features[continuous_cols]

q1 = screened.quantile(0.25)
q3 = screened.quantile(0.75)
iqr = q3-q1

# A row is an outlier if any screened feature falls outside its fences.
# Comparisons with NaN are False, so missing values never flag a row.
is_outlier = (screened < q1 - IQR_MULTIPLIER*iqr) | (screened > q3 + IQR_MULTIPLIER*iqr)
client_prices_out = client_prices[~is_outlier.any(axis=1)]
client_prices_out.isnull().sum()

NameError: name 'STOP' is not defined

In [23]:
# client_prices_out.groupby('churn').mean()

# List the dtype of every column in the merged client + price feature table
client_prices.dtypes

id                                 object
cons_12m                            int64
cons_gas_12m                        int64
cons_last_month                     int64
date_activ                          int64
date_end                            int64
date_modif_prod                     int64
date_renewal                        int64
forecast_cons_12m                 float64
forecast_cons_year                  int64
forecast_discount_energy          float64
forecast_meter_rent_12m           float64
forecast_price_energy_off_peak    float64
forecast_price_energy_peak        float64
forecast_price_pow_off_peak       float64
has_gas                             int64
imp_cons                          float64
margin_gross_pow_ele              float64
margin_net_pow_ele                float64
nb_prod_act                         int64
net_margin                        float64
num_years_antig                     int64
pow_max                           float64
churn                             

## MODELLING

#### Random Forest Classifiers (RFCs) work on the principle of bagging decision tree models: they aggregate the results of multiple decision trees, each working on a different subset of the data (bagging)
#### Advantages of using a Random Forest Classifier 
#### 1) It is not dependent on encoding of categorical features or scaling of numerical features
#### 2) It is less sensitive to outliers and non-linearity in features
#### 3) It aggregates the results of several models, leading to better overall scores than a single decision tree

In [24]:
# Feature matrix: every column except the client identifier and the target.
# columns=... replaces the positional axis argument (drop([...], 1)), which is no
# longer accepted by pandas >= 2.0.
X = client_prices.drop(columns=['id', 'churn'])
y = client_prices['churn']

from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=10)
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(10224, 52) (4382, 52)
(10224,) (4382,)


In [25]:
from sklearn.ensemble import RandomForestClassifier
# Baseline forest with default hyperparameters. random_state is fixed (same seed as
# the train/test split and the grid-search estimator) so that the fitted model, the
# reports and the feature importances are reproducible on re-run.
rfc = RandomForestClassifier(random_state=10)
rfc.fit(X_train, y_train)

# Predictions on both partitions, used by the classification reports that follow
y_train_pred = rfc.predict(X_train)
y_test_pred = rfc.predict(X_test)

In [26]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 of the training-set predictions in y_train_pred
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9222
           1       1.00      1.00      1.00      1002

    accuracy                           1.00     10224
   macro avg       1.00      1.00      1.00     10224
weighted avg       1.00      1.00      1.00     10224



In [27]:
# Per-class precision / recall / F1 of the held-out test-set predictions in y_test_pred
print(classification_report(y_test, y_test_pred))

# Although our model has accurately predicted the 0's (No churn), the recall for 1's (churned) is very low
# Overfitting is present: the training report is perfect, while on the test data the recall for 1's is
# only about 5% (precision 78%) in the recorded output; exact figures may differ on re-run

              precision    recall  f1-score   support

           0       0.91      1.00      0.95      3965
           1       0.78      0.05      0.09       417

    accuracy                           0.91      4382
   macro avg       0.84      0.52      0.52      4382
weighted avg       0.90      0.91      0.87      4382



In [30]:
# 5-fold cross-validated recall of the baseline forest on the training set.
# These two lines were commented out, so `scores` was undefined on a fresh kernel
# and scores.mean() raised NameError.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(rfc, X_train, y_train, scoring='recall', cv=5)
scores.mean()

0.03793034825870647

In [None]:
# We can use the recall rate for churn = 1 as an evaluation metric for our model
# This is because we need the model to correctly identify as many of the clients that are likely to churn as possible
# Since our dataset is so imbalanced, we expect the model to correctly predict many of the 0's even if it's a poor one
# Hence, we need to use the recall and precision for 1's as the evaluation metric for our model.

## MODEL INTERPRETATION

In [None]:
# Column names grouped by theme, for use when interpreting the model.

# Client identity and contract attributes
client_info_columns = [
    'id',
    'channel_sales',
    'origin_up',
    'nb_prod_act',
    'num_years_antig',
    'has_gas',
]

# Historical consumption
past_cons_columns = [
    'cons_12m',
    'cons_gas_12m',
    'cons_last_month',
]

# Contract dates
date_columns = [
    'date_activ',
    'date_end',
    'date_modif_prod',
    'date_renewal',
]

# Forecast quantities
forecast_columns = [
    'forecast_cons_12m',
    'forecast_cons_year',
    'forecast_price_energy_off_peak',
    'forecast_price_pow_off_peak',
    'forecast_price_energy_peak',
    'forecast_discount_energy',
    'forecast_meter_rent_12m',
]

# Current consumption, subscribed power and margins
present_cons_columns = [
    'imp_cons',
    'pow_max',
    'margin_gross_pow_ele',
    'margin_net_pow_ele',
    'net_margin',
]

In [None]:
# Pair each training feature with the importance the fitted forest assigned to it,
# ordered from most to least important
important_features = (
    pd.DataFrame({'Features': X_train.columns, 'Importance': rfc.feature_importances_})
    .sort_values('Importance', ascending=False)
)

# Horizontal bar chart: one bar per feature, bar length = importance
ax = sns.barplot(data=important_features, x='Importance', y='Features')

# Title and axis labels, all at font size 15
ax.set_title('Feature Importance', fontsize=15)
ax.set_xlabel('Importance', fontsize=15)
ax.set_ylabel('Features', fontsize=15)

# Render the figure
plt.show()

## MODEL FINETUNING

In [None]:
from sklearn.model_selection import GridSearchCV
# Wider grid kept for reference (not run by this cell):
# params = [{'criterion': ['entropy', 'gini'],
#                      'n_estimators': [10, 30, 50, 70, 90],
#                      'max_depth': [10, 15, 20],
# #                      'max_features': ['sqrt', 'log2'],
#                      'min_samples_split': [2, 5, 8, 11],
#                      'min_samples_leaf': [1, 5, 9],
#                      'max_leaf_nodes': [2, 5, 8, 11]}]

# Active grid: only the number of trees is tuned (10, 20, ..., 90)
params = [{'n_estimators': range(10,100,10)}]
random_forest_classification = RandomForestClassifier(random_state = 10)

# Select on recall for the churn class instead of the default accuracy: with this
# class imbalance, accuracy is high even for a model that never predicts churn, and
# recall is the metric already used for the cross-validation of the baseline model.
rf_grid = GridSearchCV(estimator = random_forest_classification, param_grid = params,
                       scoring = 'recall', cv = 5)
rf_grid_model = rf_grid.fit(X_train, y_train)
print('Best parameters for random forest classifier: ', rf_grid_model.best_params_, '\n')

# Recorded result; its keys match the wider commented-out grid, not the
# n_estimators-only search run by this cell:
# Best parameters for random forest classifier:  
# {'criterion': 'entropy', 'max_depth': 10, 'max_leaf_nodes': 2, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 10} 

In [None]:
# Model with chosen optimum hyperparameters
# random_state is fixed (same seed as the grid-search estimator) so the fit and the
# reports printed from it are reproducible.
# Note: max_leaf_nodes = 2 limits every tree to a single split.
rfc_best = RandomForestClassifier(criterion = 'entropy',  max_depth = 10, max_leaf_nodes = 2, min_samples_leaf = 1, 
                              min_samples_split = 2, n_estimators = 10, random_state = 10)
rfc_best.fit(X_train, y_train)

# Rebinds y_train_pred / y_test_pred to the tuned model's predictions
y_train_pred = rfc_best.predict(X_train)
y_test_pred = rfc_best.predict(X_test)

In [None]:
# Training-set report for the predictions currently held in y_train_pred
print(classification_report(y_train, y_train_pred))

In [None]:
# Test-set report for the predictions currently held in y_test_pred
print(classification_report(y_test, y_test_pred))

In [None]:
# One 50-bin histogram per forecast feature, each rendered as its own figure
for feature in forecast_columns:
    ax = client_df[feature].plot.hist(bins=50)
    ax.set_xlabel(feature)
    plt.show()

In [None]:
# 50-bin histogram of the imp_cons column
client_df['imp_cons'].plot.hist(bins=50)