In [None]:
#standard
import os
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil

#sklearn data_preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder,OrdinalEncoder,LabelEncoder
import phik
#sklearn categorical encoding
import category_encoders as ce

#sklearn modelling
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve, train_test_split, KFold, RandomizedSearchCV


#sklearn regressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import ElasticNetCV

#LightGBM
import lightgbm as lgb

#feature slection
from sklearn.feature_selection import RFE
from sklearn.linear_model import LassoCV, LinearRegression, RidgeCV


#warnings
import warnings
warnings.filterwarnings("ignore")

# 1.Introduction

This kernel provides some fundamental EDA to help better understand both the categorical and the numeric features in the dataset. It also introduces some interesting visualizations, including the multivariate correlation between categorical variables computed with the PhiK library.

Then, Lasso and Ridge regression coefficients, together with Recursive Feature Elimination (RFE), will be used to assess the importance of each feature.

The categorical features will be encoded using a Generalized Linear Mixed Model (GLMM) encoder, a member of the target-encoder family.

In the model selection section, multiple LightGBM models with different hyperparameters will be compared.

Keywords: EDA | PhiK | LightGBM | GLMM encoder | RFE | Feature selection | Model selection

In [None]:
# Read the competition train/test tables and key each one by its "id" column.
data = pd.read_csv("../input/tabular-playground-series-feb-2021/train.csv").set_index("id")
test = pd.read_csv("../input/tabular-playground-series-feb-2021/test.csv").set_index("id")

In [None]:
# Summary statistics for every column; include="all" keeps the non-numeric columns in the report too.
data.describe(include = "all")

In [None]:
# Show the dtype pandas assigned to each column.
data.dtypes

In [None]:
# Column layout once "id" is the index: 10 categorical columns first, then the
# continuous columns, and the target as the very last column.
cat_feats = data.columns[:10]
numeric_feats = data.columns[10:-1]
# Features are everything but the last column; the target is the last column.
train = data.iloc[:, :-1]
target = data.iloc[:, -1]

# 2. EDA

## Explore numeric features:

In [None]:
# Preview the continuous feature columns.
train.loc[:, numeric_feats]

First of all, let's have a look at some basic statistical information, such as the multivariate correlation and the distribution of the variables.

In [None]:
def cor_heatmap(cor):
    """Draw an annotated heatmap of the correlation matrix `cor` and show it."""
    plt.figure(figsize=(12, 10))
    # Annotate each cell with its value, formatted with the '.1' format spec.
    heatmap_options = dict(annot=True, cmap=plt.cm.Reds, fmt='.1')
    sns.heatmap(data=cor, **heatmap_options)
    plt.show()


# Pairwise correlations between the numeric features.
numeric_corr = train[numeric_feats].corr()
cor_heatmap(numeric_corr)

In [None]:
def displot_all(df, columns):
    """Plot the distribution of every column in `columns` on a five-wide grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to plot.
    columns : pandas.Index or array of column labels
        One panel is drawn per label; must expose ``.shape``.
    """
    #Prepare figure layout
    n_panels = columns.shape[0]
    rows = int(ceil(n_panels/5))
    sns.set()
    fig, axes = plt.subplots(nrows = rows, ncols=5, figsize=(15,3*rows))
    panels = axes.flatten()

    # Draw one histogram + KDE per column. histplot(kde=True, stat="density")
    # is seaborn's replacement for the deprecated distplot.
    for ax, col in zip(panels, columns):
        sns.histplot(x=df.loc[:,col], kde=True, stat="density", ax=ax)
        ax.set_title(col)
        ax.set_ylabel("")
        for tick in ax.get_xticklabels():
            tick.set_rotation(-25)
    # The grid is padded to a multiple of five: hide the panels that got no column.
    for ax in panels[n_panels:]:
        ax.set_visible(False)
    # Finalize the plot
    plt.subplots_adjust(wspace=0.5,hspace = 0.5)
    fig.suptitle("Dist plots", fontsize=25)
    sns.despine(bottom=True)
    plt.show()

displot_all(train,numeric_feats)

In [None]:
def scter_all(df, columns, target):
    """Scatter every column in `columns` against `target` on a five-wide grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the feature columns and the target column.
    columns : pandas.Index or array of column labels
        One panel is drawn per label (used as the x variable).
    target : str
        Name of the column in `df` plotted on the y axis.
    """
    #Prepare figure layout
    n_panels = columns.shape[0]
    rows = int(ceil(n_panels/5))
    sns.set()
    fig, axes = plt.subplots(nrows = rows, ncols=5, figsize=(15,3*rows))
    panels = axes.flatten()

    # Draw one scatter plot per feature; the x label already names the feature,
    # so no per-panel title is set and the y label is cleared.
    for ax, col in zip(panels, columns):
        sns.scatterplot(x=col, y=target, data=df, ax=ax)
        ax.set_ylabel("")
        for tick in ax.get_xticklabels():
            tick.set_rotation(-25)
    # The grid is padded to a multiple of five: hide the panels that got no column.
    for ax in panels[n_panels:]:
        ax.set_visible(False)
    # Finalize the plot
    plt.subplots_adjust(wspace=0.5,hspace = 0.5)
    # Title fixed: these are scatter plots, not distribution plots.
    fig.suptitle("Scatter plots", fontsize=25)
    sns.despine(bottom=True)
    plt.show()

scter_all(data ,numeric_feats, "target")

### Apply feature selection using Lasso and Ridge

In [None]:
def plot_importance(coef, name, ax):
    """Draw the model coefficients as a sorted horizontal bar chart on `ax`.

    Parameters
    ----------
    coef : pandas.Series
        Coefficient values indexed by feature name.
    name : str
        Model name inserted into the panel title.
    ax : matplotlib.axes.Axes
        Axes to draw on.

    Returns
    -------
    matplotlib.axes.Axes
        The axes that were drawn on.
    """
    # Sort ascending so the bars run from most negative to most positive.
    imp_coef = coef.sort_values()
    sns.barplot(x = imp_coef.values, y = imp_coef.index, orient='h', ax = ax, color='Blue')
    ax.set_title("Feature importance using " + name + " Model")
    return ax

# Fit Lasso and Ridge on the raw numeric features and compare their
# coefficients side by side (shared x axis so magnitudes are comparable).
fig, axes = plt.subplots(nrows = 1, ncols = 2, sharex="all", figsize=(10,6))
models = [LassoCV(), RidgeCV()]
names = ["Lasso","Ridge"]
for model, name, ax in zip(models, names, axes.flatten()):
    reg = model
    reg.fit(X = train[numeric_feats], y = target)
    coef = pd.Series(reg.coef_, index  = train[numeric_feats].columns)
    plot_importance(coef, name, ax)

### Perform RFE to find the optimum number of features

In [None]:
# Candidate numbers of features to keep
nof_list=np.arange(1,13)
# Start below any attainable R^2 so the best candidate is still recorded
# when every score is negative.
high_score=-np.inf
#Variable to store the optimum features
nof=0
score_list =[]

# The hold-out split is seeded and does not depend on the candidate count,
# so it is computed once instead of on every iteration.
X_train, X_test, y_train, y_test = train_test_split(train[numeric_feats],target, test_size = 0.3, random_state = 0)

for n_features in nof_list:
    model = LinearRegression()
    # n_features_to_select is keyword-only in scikit-learn >= 1.0
    rfe = RFE(model, n_features_to_select=int(n_features))
    X_train_rfe = rfe.fit_transform(X_train,y_train)
    X_test_rfe = rfe.transform(X_test)
    # Refit the plain regressor on the retained columns and score it (R^2) on the hold-out set
    model.fit(X_train_rfe,y_train)

    score = model.score(X_test_rfe,y_test)
    score_list.append(score)

    if(score>high_score):
        high_score = score
        nof = n_features
print("Optimum number of features: %d" %nof)
print("Score with %d features: %f" % (nof, high_score))

## Categorical features

Same processes of EDA as numeric features

**Apply Phi_K to measure the correlation coefficient of all features together:**

Phi_K is a new and practical correlation coefficient based on several refinements to Pearson’s hypothesis test of independence of two variables.


In [None]:
# Phi_K correlation matrix between the categorical features. The result gets its
# own name so it does not shadow the imported `phik` module.
phik_corr = train[cat_feats].phik_matrix()

# Mask the upper triangle (diagonal included) so each pair is shown once.
# The builtin `bool` replaces the `np.bool` alias, which NumPy 1.24 removed.
mask = np.zeros_like(phik_corr, dtype=bool)
mask[np.triu_indices_from(mask)]= True

f, ax = plt.subplots(figsize=(10, 15))
heatmap = sns.heatmap(phik_corr,
                      square = True,
                      mask = mask,
                      linewidths = .5,
                      cmap = 'coolwarm',
                      cbar_kws = {'shrink': .6,
                                'ticks' : [-1, -.5, 0, 0.5, 1]},
                      fmt='.2g',
                      vmin = -1,
                      vmax = 1,
                      annot = True,
                      annot_kws = {'size': 10})
#add the column names as labels
ax.set_yticklabels(phik_corr.columns, rotation = 0)
ax.set_xticklabels(phik_corr.columns)
# NOTE(review): set_style is called after the heatmap is drawn, so it presumably
# only affects figures created later - confirm this is intended.
sns.set_style({'xtick.bottom': True}, {'ytick.left': True})

In [None]:
def countplot_all(df, columns):
    """Draw a count plot for every column in `columns` on a five-wide grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to plot.
    columns : pandas.Index or array of column labels
        One panel is drawn per label; must expose ``.shape``.
    """
    #Prepare figure layout
    n_panels = columns.shape[0]
    rows = int(ceil(n_panels/5))
    sns.set()
    fig, axes = plt.subplots(nrows = rows, ncols=5, figsize=(15,3*rows))
    panels = axes.flatten()

    # Draw one count plot (frequency of each category level) per column
    for ax, col in zip(panels, columns):
        sns.countplot(x=df.loc[:,col], ax=ax)
        ax.set_title(col)
        ax.set_ylabel("")
        for tick in ax.get_xticklabels():
            tick.set_rotation(-25)
    # The grid is padded to a multiple of five: hide the panels that got no column.
    for ax in panels[n_panels:]:
        ax.set_visible(False)
    # Finalize the plot
    plt.subplots_adjust(wspace=0.5,hspace = 0.5)
    # Title fixed: these are count plots, not distribution plots.
    fig.suptitle("Count plots", fontsize=25)
    sns.despine(bottom=True)
    plt.show()

countplot_all(train,cat_feats)

In [None]:
def displot_all_hue(df, columns):
    """Plot the histogram of the 'target' column, coloured by each column in `columns`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a 'target' column plus the columns used as hue.
    columns : pandas.Index or array of column labels
        One panel is drawn per label, on a three-wide grid.
    """
    #Prepare figure layout
    n_panels = columns.shape[0]
    rows = int(ceil(n_panels/3))
    sns.set()
    fig, axes = plt.subplots(nrows = rows, ncols=3, figsize=(15,5*rows))
    panels = axes.flatten()

    # Draw one target histogram per column, split by that column's levels
    for ax, col in zip(panels, columns):
        sns.histplot(x='target', hue=col, data=df, ax=ax)
        ax.set_title(col)
        ax.set_ylabel("")
        for tick in ax.get_xticklabels():
            tick.set_rotation(-25)
    # The grid is padded to a multiple of three: hide the panels that got no column.
    for ax in panels[n_panels:]:
        ax.set_visible(False)
    # Finalize the plot
    plt.subplots_adjust(wspace=0.5,hspace = 0.5)
    fig.suptitle("Dist plots", fontsize=25)
    sns.despine(bottom=True)
    plt.show()

displot_all_hue(data, cat_feats)


# 3. Data processing

In this step, we will create a pipeline that encodes the categorical features with a GLMM encoder and then scales all features with StandardScaler.

In [None]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.3,  random_state=2)


def glmmEncode_Scaler_pipeline(X_train, y_train, X_test, scaler = None):
    """GLMM-encode the object-typed columns, then scale every column.

    Both the encoder and the scaler are fitted on the training frame only and
    then applied to the training and the test frame.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features.
    y_train : pandas.Series
        Training target, used to fit the encoder.
    X_test : pandas.DataFrame
        Features to transform with the fitted encoder and scaler.
    scaler : scikit-learn style transformer, optional
        Unfitted scaler exposing ``fit``/``transform``. Defaults to a fresh
        ``StandardScaler()``. Previously this argument was silently ignored
        and ``StandardScaler`` was always used, so the default keeps existing
        results unchanged.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train_scaled, X_test_scaled)`` with the columns and index of the inputs.
    """
    if scaler is None:
        scaler = StandardScaler()

    X_train_encoded = X_train.copy()
    X_test_encoded= X_test.copy()
    # Only object-typed columns are encoded
    feature_to_encode = X_train.columns[X_train.dtypes == 'O'].tolist()
    # Initialise the encoder for a non-binomial target
    GLMMEncoder = ce.glmm.GLMMEncoder(binomial_target=False)
    # fit on the training data only
    GLMMEncoder.fit(X_train[feature_to_encode],y_train)

    # transform training set
    X_train_encoded[feature_to_encode] = GLMMEncoder.transform(X_train[feature_to_encode])
    # transform test set
    X_test_encoded[feature_to_encode] = GLMMEncoder.transform(X_test[feature_to_encode])

    # fit the scaler on the encoded training data only
    scaler.fit(X_train_encoded)
    # transform training set
    X_train_scaled = pd.DataFrame(scaler.transform(X_train_encoded), columns=X_train_encoded.columns, index=X_train_encoded.index)
    # transform test set
    X_test_scaled = pd.DataFrame(scaler.transform(X_test_encoded), columns=X_test_encoded.columns, index=X_test_encoded.index)
    return (X_train_scaled, X_test_scaled)


# Hold-out split, used for model comparison
X_train, X_test =  glmmEncode_Scaler_pipeline(X_train, y_train, X_test)
# Full training data and competition test set, used for tuning and the submission
training_dataset, test_dataset= glmmEncode_Scaler_pipeline(train,target, test)

In [None]:
# Report the shape of each prepared frame.
shape_report = [
    ('Train set:', X_train),
    ('Test set:', X_test),
    ('Whole training set set:', training_dataset),
    ('Whole test set:', test_dataset),
]
for label, frame in shape_report:
    print(label, frame.shape)

In [None]:
   
# Fit Lasso and Ridge on the encoded + scaled categorical columns and chart
# each model's coefficients side by side (shared x axis).
fig, axes = plt.subplots(nrows=1, ncols=2, sharex="all", figsize=(10, 6))
models = [LassoCV(), RidgeCV()]
names = ["Lasso", "Ridge"]
for ax, name, model in zip(axes.flatten(), names, models):
    reg = model
    reg.fit(X=training_dataset[cat_feats], y=target)
    # One coefficient per column, labelled with the column name
    coef = pd.Series(reg.coef_, index=training_dataset[cat_feats].columns)
    # Ascending order: most negative coefficient first
    imp_coef = coef.sort_values()
    g = sns.barplot(
        x=imp_coef.values,
        y=imp_coef.index,
        orient='h',
        ax=ax,
        color='Blue',
    )
    ax.set_title("Feature importance using " + name + " Model")

In [None]:
# Fit Lasso and Ridge on the scaled numeric columns and chart each model's
# coefficients side by side (shared x axis).
fig, axes = plt.subplots(nrows=1, ncols=2, sharex="all", figsize=(10, 6))
models = [LassoCV(), RidgeCV()]
names = ["Lasso", "Ridge"]
for ax, name, model in zip(axes.flatten(), names, models):
    reg = model
    reg.fit(X=training_dataset[numeric_feats], y=target)
    # One coefficient per column, labelled with the column name
    coef = pd.Series(reg.coef_, index=training_dataset[numeric_feats].columns)
    # Ascending order: most negative coefficient first
    imp_coef = coef.sort_values()
    g = sns.barplot(
        x=imp_coef.values,
        y=imp_coef.index,
        orient='h',
        ax=ax,
        color='Blue',
    )
    ax.set_title("Feature importance using " + name + " Model")

# 4. Model selection

In [None]:
# Preview a candidate feature subset: every column except cat4, cont2 and cont3.
np.concatenate([
    cat_feats.drop('cat4').values,
    numeric_feats.drop(['cont2', 'cont3']).values,
])

In [None]:
# Rows whose target is non-zero; the candidate models below are fitted on these rows only.
indx = y_train[y_train != 0].index

# Columns fed to the candidate models. The active line keeps every feature;
# swap in the commented line to drop cat4, cont2 and cont3.
#columns_select = np.concatenate((cat_feats.drop('cat4').values,numeric_feats.drop(['cont2','cont3']).values))
columns_select = np.concatenate((cat_feats.values, numeric_feats.values))
ran_state = 2

# Hand-tuned LightGBM hyper-parameters
params = {'reg_alpha': 6.147694913504962,
 'reg_lambda': 0.002457826062076097,
 'colsample_bytree': 0.3,
 'subsample': 0.8,
 'learning_rate': 0.005,
 'max_depth': 5,
 'num_leaves': 30,
 'min_child_samples': 285,
 'random_state': ran_state,
 'verbose':-1,
 'n_estimators': 15000,
 'metric': 'rmse',
 'cat_smooth': 39}

# Candidate models: LightGBM with default and with tuned hyper-parameters
models = []
models.append(('LightGBM', lgb.LGBMRegressor(boosting_type='gbdt',random_state = ran_state)))
models.append(('LightGBM_tuned',lgb.LGBMRegressor(boosting_type='gbdt',**params)))


def _fit_and_score(model, rows, columns):
    """Fit `model` on the chosen rows/columns of the hold-out training split and score it.

    Reads the notebook-level X_train / y_train / X_test / y_test frames.

    Returns
    -------
    list
        ``[train RMSE, test RMSE, fit time in seconds]``. The RMSEs are computed
        on the full training split and on the test split; the timer covers the
        ``fit`` call only, so the value matches its 'Training time' label.
    """
    begin = time.perf_counter()
    model.fit(X_train.loc[rows, columns], y_train.loc[rows])
    end = time.perf_counter()
    train_score = np.sqrt(mean_squared_error(y_train, model.predict(X_train.loc[:, columns])))
    test_score = np.sqrt(mean_squared_error(y_test, model.predict(X_test.loc[:, columns])))
    return [train_score, test_score, round(end-begin,3)]


scores = {}

# Baseline: default LightGBM fitted on every row and every feature
model = lgb.LGBMRegressor(boosting_type='gbdt',random_state = ran_state)
scores['baseline'] = _fit_and_score(model, X_train.index, X_train.columns)

# Candidates: fitted on the non-zero-target rows and the selected columns
for name, model in models:
    scores[name] = _fit_and_score(model, indx, columns_select)

final_score = pd.DataFrame.from_dict(scores, orient='index',columns=['Train','Test','Training time'])
final_score

**Now, let's use the skopt library to tune the hyperparameters of the LightGBM estimator.**

In [None]:
# Adapted from: https://scikit-optimize.github.io/stable/auto_examples/hyperparameter-optimization.html

from skopt.space import Real, Integer
from skopt.utils import use_named_args


# Search space: one named dimension per LightGBM hyper-parameter being tuned.
# Each entry gives the bounds and the matching estimator parameter name; the
# learning rate is sampled with a 'log-uniform' prior.
space = [
    Integer(1, 10, name='max_depth'),
    Real(0.01, 0.5, name='learning_rate', prior='log-uniform'),
    Integer(2, 100, name='num_leaves'),
]

# Estimator whose parameters are overwritten on every objective evaluation.
reg = lgb.LGBMRegressor(boosting_type='gbdt', n_estimators=100, random_state=2)


# use_named_args lets the objective receive the sampled point as keyword
# arguments, which can be handed straight to set_params.
@use_named_args(space)
def objective(**params):
    """Return the mean 6-fold cross-validated MSE of `reg` configured with `params`."""
    reg.set_params(**params)
    fold_scores = cross_val_score(
        reg,
        training_dataset,
        target,
        cv=6,
        n_jobs=-1,
        scoring="neg_mean_squared_error",
    )
    # The scorer returns negated MSE; flip the sign to get a quantity to minimise.
    return -np.mean(fold_scores)

In [None]:
from skopt import forest_minimize

# Search the space for the point that minimises the objective, using 50 evaluations.
res_gp = forest_minimize(
    objective,
    space,
    n_calls=50,
    random_state=0,
)

# Lowest objective value found during the search
"Best score=%.4f" % (res_gp.fun,)

In [None]:
# Best point found by the search; presumably ordered like `space` (max_depth, learning_rate, num_leaves) - confirm against skopt docs.
res_gp.x

## Submission

In [None]:
# LightGBM hyper-parameters used for the submission model.
params = {
    # regularisation
    'reg_alpha': 6.147694913504962,
    'reg_lambda': 0.002457826062076097,
    # row / column sub-sampling
    'colsample_bytree': 0.3,
    'subsample': 0.8,
    # boosting schedule
    'learning_rate': 0.0045,
    'n_estimators': 15000,
    # tree shape
    'max_depth': 6,
    'num_leaves': 35,
    'min_child_samples': 285,
    # misc
    'random_state': 2,
    'verbose': -1,
    'metric': 'rmse',
    'cat_smooth': 39,
}

In [None]:
# Fit the final model on the whole (encoded + scaled) training set.
model = lgb.LGBMRegressor(boosting_type='gbdt', **params)
model.fit(training_dataset, target)

# Predict the competition test set and write the id/target pairs to disk.
y_predicted = model.predict(test_dataset)
submission = pd.DataFrame(
    {
        'id': test_dataset.index,
        'target': y_predicted,
    }
)
filename = 'predict_GB_GLMM_tunned.csv'
submission.to_csv(filename, index=False)