In [None]:
Some minor changes

# <font size="+3" color=Blue ><b> <center><u>HR Analysis, Prediction and Visualization and Ensemble Model </u></center></b></font><br><a id="top"></a>

# <font size="+2" color=red ><b> <center><u>Using CAT BOOST XGBOOST and LGBM </u></center></b></font><br><a id="top"></a>

![](https://blog.walkme.com/wp-content/uploads/2019/07/2.jpg)

In [None]:
# Warning Libraries :
import warnings
warnings.filterwarnings("ignore")

# Scientific and Data Manipulation Libraries :
import pandas as pd
import numpy as np
import math
import gc
import os

# ML Libraries :
from sklearn.preprocessing            import LabelEncoder, OneHotEncoder 
from sklearn.preprocessing            import StandardScaler, MinMaxScaler, Normalizer, RobustScaler, MaxAbsScaler
from sklearn.model_selection          import KFold, StratifiedKFold, train_test_split, cross_val_score
from sklearn.tree                     import DecisionTreeClassifier
from sklearn.ensemble                 import VotingClassifier, RandomForestClassifier
from sklearn.metrics                  import f1_score, confusion_matrix, classification_report

                    
# Data Visualization Libraries :
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.io as pio
import plotly.graph_objects as go
import plotly.express as px

In [None]:
rs=1331 ##random_state

# Loading the Data

In [None]:
train= pd.read_csv('../input/hranalysis/train.csv')
test= pd.read_csv('../input/hranalysis/test.csv')

In [None]:
test.head()

In [None]:
train.head()

In [None]:
print(train.columns)
print("*"*100)
print(test.columns)

In [None]:
print("Train data shape",train.shape)
print("Test data shape",test.shape)

In [None]:
train.info()

In [None]:
train.describe(include='all')

In [None]:
test.info()

In [None]:
test.describe(include='all')

In [None]:
train.isna().sum()

## Missing value Analysis 

In [None]:
#Using missingno to visualize null values in train data
import missingno as msno
msno.bar(train, color = '#6389df', figsize = (10,8))  


In [None]:
test.isna().sum()

In [None]:
#Using missingno to visualize null values in test data
msno.bar(test, color = '#6389df', figsize = (10,8))  


### Correlation between features through Heatmap

In [None]:
# Pearson correlation between the numeric columns only. Selecting them explicitly keeps
# the cell working on pandas >= 2.0, where DataFrame.corr() no longer drops
# non-numeric (object) columns on its own.
corr=train.select_dtypes(include='number').corr()
plt.figure(figsize=(10,8))
sns.heatmap(corr,square=True,annot=True)

In [None]:
train.dtypes

## UNIVARIATE ANALYSIS 

In [None]:
# Let’s plot the distribution of each feature
def plot_distribution(dataset, cols=5, width=20, height=15, hspace=0.2, wspace=0.5):
    """Draw one subplot per column of ``dataset`` on a single grid figure.

    Object-dtype columns are drawn as horizontal count plots (tick labels cut to
    18 characters); every other column is drawn with ``sns.distplot``.

    Parameters
    ----------
    dataset : DataFrame whose columns are plotted, one subplot each.
    cols : number of subplot columns in the grid; the row count is derived from it.
    width, height : figure size handed to ``plt.figure``.
    hspace, wspace : subplot spacing handed to ``fig.subplots_adjust``.
    """
    # Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*';
    # use whichever name this installation actually provides.
    if 'seaborn-v0_8-whitegrid' in plt.style.available:
        plt.style.use('seaborn-v0_8-whitegrid')
    else:
        plt.style.use('seaborn-whitegrid')
    fig = plt.figure(figsize=(width,height))
    fig.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=wspace, hspace=hspace)
    rows = math.ceil(float(dataset.shape[1]) / cols)
    for i, column in enumerate(dataset.columns):
        ax = fig.add_subplot(rows, cols, i + 1)
        ax.set_title(column)
        # The builtin `object` replaces the `np.object` alias, which NumPy has removed.
        if dataset.dtypes[column] == object:
            g = sns.countplot(y=column, data=dataset)
            # shorten long category names so the tick labels stay readable
            substrings = [s.get_text()[:18] for s in g.get_yticklabels()]
            g.set(yticklabels=substrings)
            plt.xticks(rotation=25)
        else:
            g = sns.distplot(dataset[column])
            plt.xticks(rotation=25)
    
plot_distribution(train,cols=2, width=30, height=60, hspace=0.45, wspace=0.5)

## BIVARIATE ANALYSIS

In [None]:

def plot_bivariate_bar(dataset, hue, cols=5, width=20, height=15, hspace=0.2, wspace=0.5):
    """Count plot of every object-dtype column of ``dataset``, split by ``hue``.

    Parameters
    ----------
    dataset : DataFrame; only its object-dtype columns are plotted.
    hue : grouping variable forwarded to ``sns.countplot`` (the notebook passes
        the ``is_promoted`` Series).
    cols : number of subplot columns in the grid; the row count is derived from it.
    width, height : figure size handed to ``plt.figure``.
    hspace, wspace : subplot spacing handed to ``fig.subplots_adjust``.
    """
    # The builtin `object` replaces the `np.object` alias, which NumPy has removed.
    dataset = dataset.select_dtypes(include=[object])
    # Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*';
    # use whichever name this installation actually provides.
    if 'seaborn-v0_8-whitegrid' in plt.style.available:
        plt.style.use('seaborn-v0_8-whitegrid')
    else:
        plt.style.use('seaborn-whitegrid')
    fig = plt.figure(figsize=(width,height))
    fig.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=wspace, hspace=hspace)
    rows = math.ceil(float(dataset.shape[1]) / cols)
    for i, column in enumerate(dataset.columns):
        ax = fig.add_subplot(rows, cols, i + 1)
        ax.set_title(column)
        # Only object columns are left after select_dtypes, so no per-column dtype test is needed.
        g = sns.countplot(y=column, hue=hue, data=dataset)
        # shorten long category names so the tick labels stay readable
        substrings = [s.get_text()[:10] for s in g.get_yticklabels()]
        g.set(yticklabels=substrings)
            
            
            
plot_bivariate_bar(train, hue=train.is_promoted, cols=1, width=10, height=35, hspace=0.4, wspace=0.5)

In [None]:
#unique value in education feature
train.education.value_counts()

In [None]:
#plotting a pie chart
size = [36669,14925,805]
label=["Bachelor's","Master's & above",'Below Secondary']
color=['#6389df','#1f2b6c','#a3ccf4']
explode = [0.1, 0.2 , 0.3]
plt.figure(figsize=(8,8))
plt.pie(size,labels=label,colors=color,explode=explode,shadow=True,autopct="%.1f%%")
plt.title("Pie Chart of the Employees Degrees", fontsize = 20)
plt.axis('off')
plt.legend(title='Education Degrees')
plt.show()

In [None]:
#unique value in gender feature
train.gender.value_counts()

In [None]:
#plotting a pie chart
size = [38496,16312]
label=["Male","Female"]
color=['#6389df','#1f2b6c']
explode = [0.1, 0.2 ]
plt.figure(figsize=(8,8))
plt.pie(size,labels=label,colors=color,explode=explode,shadow=True,autopct="%.1f%%")
plt.title("Pie Chart of the GenderGap", fontsize = 20)
plt.axis('off')
plt.legend(title='Gender')
plt.show()

In [None]:
plt.subplots(figsize=(15,5))
sns.countplot(x = 'education', data = train, hue = 'gender', palette = 'Paired')
plt.title('Showing Degree & Gender ratio', fontsize = 20)
plt.show()

In [None]:
train['recruitment_channel'].value_counts()

In [None]:
# Donut chart of the recruitment channels.
# The counts are hard-coded, in the same order as `label` below.
# NOTE(review): presumably copied from the value_counts() output of the previous cell - confirm.
size=[30446,23220,1142]
label=["Other","Sourcing",'Referred']
color=['#6389df','#1f2b6c','#a3ccf4']
# NOTE: `explode` is defined here but never passed to plt.pie() in this cell.
explode=[.05,.05,.05]
plt.figure(figsize=(8,8))
plt.pie(size,labels=label,colors=color, startangle=90,shadow=True,autopct="%.2f%%",pctdistance=.85)

# A white circle added on top of the finished pie turns it into a donut.
center_circle=plt.Circle((0,0),.7,fc='white')
fig=plt.gcf()
fig.gca().add_artist(center_circle)

plt.title('A Pie Chart Representing Recruitment_Channel', fontsize = 30)
plt.axis('off')
plt.legend()
plt.show()


In [None]:
#### Check most popular department
from wordcloud import WordCloud
from wordcloud import STOPWORDS

# Size each department name by the number of employees in it. The frequencies come from
# value_counts(), so every row is used and multi-word names stay together.
# str(train['department']) is only the truncated preview of the Series, which covers a
# handful of rows and also contains repr words such as "Name", "dtype" and "object".
department_counts = train['department'].value_counts().to_dict()
wordcloud = WordCloud().generate_from_frequencies(department_counts)

plt.rcParams['figure.figsize'] = (15, 8)
plt.imshow(wordcloud)
plt.title('Most Popular Departments', fontsize = 30)
plt.axis('off')
plt.show()

In [None]:
plt.figure(figsize=(15,5))
sns.distplot(train['age'],color='#6389df')
plt.title('Distribution of Age of Employees', fontsize = 30)
plt.grid(axis='both')
plt.show()

In [None]:
#pie chart for the KPIs_met
train['KPIs_met >80%'].value_counts()

In [None]:
size = [35517, 19291]
labels = "Not Met KPI > 80%", "Met KPI > 80%"
color=['#6389df','#a3ccf4']
explode = [0, 0.1]
plt.figure(figsize=(8,8))
plt.pie(size, labels = labels, colors = color, explode = explode, shadow = True, autopct = "%.2f%%")
plt.title('A Pie Chart Representing Gap in Employees in terms of KPI', fontsize = 30)
plt.axis('off')
plt.legend()
plt.show()

In [None]:
train['awards_won?'].value_counts()

In [None]:
# Counts are hard-coded in descending order, the order value_counts() reports them in.
# The large group (53538) is the employees WITHOUT an award and the small group (1270)
# the award winners, so the labels must follow that order.
# NOTE(review): verify against the value_counts() output of the previous cell.
size = [53538,1270]
labels = "NO Awards Won", "Awards Won"
color=['#6389df','#a3ccf4']
explode = [0, 0.1]

my_circle = plt.Circle((0, 0), 0.7, color = 'white')

plt.figure(figsize=(8,8))
plt.pie(size, labels = labels, colors = color, explode = explode, shadow = True, autopct = "%.2f%%")
# The title previously repeated the KPI chart's title.
plt.title('A Pie Chart Representing Gap in Employees in terms of Awards Won', fontsize = 30)
# Add the white circle after the pie so it is drawn on top and forms a donut.
p = plt.gcf()
p.gca().add_artist(my_circle)
plt.legend()
plt.show()

In [None]:
size = [50140, 4668]
labels = "NOT Promoted ", "Promoted "
color=['#a3ccf4','#6389df']
explode = [0, 0.1]

# Set the default size BEFORE creating the figure: changing rcParams after a figure
# already exists does not resize it.
plt.rcParams['figure.figsize'] = (8, 8)
plt.figure()
plt.pie(size, labels = labels, colors = color, explode = explode, shadow =False, autopct = "%.2f%%",startangle=180)

# Draw the white centre circle AFTER the pie, as the other donut charts in this notebook
# do, so it sits on top of the wedges instead of being added to an empty axes first.
centre_circle = plt.Circle((0,0),0.70,fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)
plt.title('Showing a Percentage of employees who Promoted ' , fontsize = 20)
plt.axis('off')
plt.legend()
plt.show()

In [None]:
## BIVARIATE FEATURE ANALYSIS

In [None]:
train.columns

In [None]:
# Columns whose relationship with the target `is_promoted` is charted below.
a=['department', 'region', 'education', 'gender', 'no_of_trainings', 'age', 'previous_year_rating',
       'length_of_service', 'KPIs_met >80%', 'awards_won?']
for i in a:
  # Count table: one row per value of the column, one column per is_promoted class.
  data = pd.crosstab(train[i],train['is_promoted'])
  # Divide each row by its row total so every stacked bar shows class shares rather than raw counts.
  data.div(data.sum(1).astype('float'), axis = 0).plot(kind = 'bar', stacked = True, figsize = (15, 5), color = ['#a3ccf4','#6389df'])

# These two calls run once, after the loop has created one figure per column.
plt.legend()
plt.show()

In [None]:
import plotly.express as px
fig = px.parallel_categories(train[['department','education','gender','previous_year_rating','KPIs_met >80%',
                                    'recruitment_channel',
                                   'is_promoted']], 
                             color="is_promoted", 
                             color_continuous_scale=px.colors.sequential.Aggrnyl  )
fig.show()

## 3 DATA CLEANING 

### CHECKING DUPLICATES AND REMOVAL

In [None]:
#  Removes Data Duplicates while Retaining the First one - Similar to SQL DISTINCT :
def remove_duplicate(data):
    """Drop fully duplicated rows from ``data`` in place, keeping each first occurrence.

    The row count is printed before and after the clean-up. The frame itself is
    modified rather than returned; the return value is the fixed status string
    "Checked Duplicates".
    """
    rows_before = len(data)
    print("BEFORE REMOVING DUPLICATES - No. of Rows = ",rows_before)

    # in-place on purpose: callers rely on their own frame being de-duplicated
    data.drop_duplicates(inplace=True, keep="first")

    rows_after = len(data)
    print("AFTER REMOVING DUPLICATES  - No. of Rows = ",rows_after)
    return "Checked Duplicates"
# Remove Duplicates from "train" data :
remove_duplicate(train)
# No Duplicates at all !!!

### Checking Missing Value

In [None]:
##missing value function which return an dataframe with total null values and percentage
def missing_data(data):
    """Summarise the missing values of ``data`` per column.

    Returns a DataFrame indexed by column name with two columns: 'Total' (number
    of null cells) and 'Percent' (null cells as a percentage of all rows), each
    sorted from most to least missing.
    """
    null_mask = data.isnull()
    total = null_mask.sum().sort_values(ascending=False)
    percent = (null_mask.sum() / null_mask.count() * 100).sort_values(ascending=False)
    summary = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    return summary
missing_data(train)

In [None]:
missing_data(test)

In [None]:
train.previous_year_rating=train.previous_year_rating.fillna(0)
test.previous_year_rating=test.previous_year_rating.fillna(0)

* `ffill is used to forward fill the missing values in the dataset - https://www.geeksforgeeks.org/python-pandas-dataframe-ffill/`

* `bfill is used to backward fill the missing values in the dataset - https://www.geeksforgeeks.org/python-pandas-dataframe-bfill/`

In [None]:
train['Fresher']=train['previous_year_rating'].apply(lambda x: 'Fresher' if x==0 else 'Experienced')
test['Fresher']=test['previous_year_rating'].apply(lambda x: 'Fresher' if x==0 else 'Experienced')

In [None]:
train['education']=train['education'].ffill(axis=0)
train['education']=train['education'].bfill(axis=0)

test['education']=test['education'].ffill(axis=0)
test['education']=test['education'].bfill(axis=0)

In [None]:
display(missing_data(train))
display(missing_data(test))


## FEATURE ENGINEERING

In [None]:
#BINNING THE AGE FEATURE INTO UP TO 29 , 30-39 , 40-49 AND 50 OR OLDER
sns.distplot(train['age'])

# Open-ended outer bins: with bins=[20, 29, 39, 49] every age of exactly 20, and every
# age above 49, fell outside all intervals and was silently turned into NaN.
age_bins   = [0, 29, 39, 49, np.inf]
age_labels = ['20', '30', '40', '50']

train['age'] = pd.cut( x=train['age'], bins=age_bins, labels=age_labels )
test['age']  = pd.cut( x=test['age'],  bins=age_bins, labels=age_labels )

In [None]:
train.age.value_counts(dropna=False)

In [None]:
train.drop(['employee_id'],axis=1,inplace=True)
test_d=test

In [None]:
test_d.drop(['employee_id'],axis=1,inplace=True)

## Encoding 
`Converting the categorical features into binary or numerical counterparts`

In [None]:
def data_encoding( encoding_strategy , encoding_data , encoding_columns ):
    """Encode the categorical columns of ``encoding_data`` and return the encoded frame.

    Parameters
    ----------
    encoding_strategy : "LabelEncoding" or "OneHotEncoding".
    encoding_data : DataFrame to encode; it is never modified.
    encoding_columns : columns to label-encode. Only the "LabelEncoding" strategy
        uses this list; "OneHotEncoding" hands the whole frame to ``pd.get_dummies``.

    Returns
    -------
    DataFrame : a new encoded frame, or ``encoding_data`` itself when the strategy
    is not recognised.
    """
    if encoding_strategy == "LabelEncoding":
        print("IF LabelEncoding")
        # work on a copy so the caller's frame keeps its original categories
        encoding_data = encoding_data.copy()
        Encoder = LabelEncoder()
        for column in encoding_columns :
            print("column",column )
            encoding_data[ column ] = Encoder.fit_transform(tuple(encoding_data[ column ]))

    elif encoding_strategy == "OneHotEncoding":
        print("ELIF OneHotEncoding")
        encoding_data = pd.get_dummies(encoding_data)

    # The former `encoding_data.astype('float64').dtypes` statement was removed here:
    # its result was discarded, so it never changed the returned frame, but it raised
    # whenever a non-numeric column was still present.
    return encoding_data

In [None]:
encoding_columns  = [ "region", "age","department", "education", "gender", "recruitment_channel" ]
encoding_strategy = [ "LabelEncoding", "OneHotEncoding"]

train_encode = data_encoding( encoding_strategy[1] , train , encoding_columns )
test_encode =  data_encoding( encoding_strategy[1] , test  , encoding_columns )

In [None]:
test_encode.head()

In [None]:
train_encode.head()

## FEATURE SCALING

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer, RobustScaler, MaxAbsScaler
def data_scaling( scaling_strategy , scaling_data , scaling_columns ):
    """Scale ``scaling_columns`` of ``scaling_data`` in place and return that same frame.

    ``scaling_strategy`` names the scaler to use: "RobustScaler", "StandardScaler",
    "MinMaxScaler" or "MaxAbsScaler". A new scaler is fitted on the given columns
    on every call.
    """
    # If any other scaling is sent by mistake, still perform robust scaling
    scaler_class = RobustScaler
    for known_name, candidate in (("StandardScaler", StandardScaler),
                                  ("MinMaxScaler", MinMaxScaler),
                                  ("MaxAbsScaler", MaxAbsScaler)):
        if scaling_strategy == known_name:
            scaler_class = candidate
            break

    selected = scaling_data[scaling_columns]
    scaling_data[scaling_columns] = scaler_class().fit_transform(selected)
    return scaling_data

In [None]:
scaling_st=["RobustScaler" ,"StandardScaler","MinMaxScaler","MaxAbsScaler"]

# Fit ONE scaler on the training features and reuse it for the test set, so both sets
# are transformed with the same statistics (previously the test set was scaled with a
# scaler fitted on the test set itself). The target column is left unscaled.
feature_columns = [column for column in train_encode.columns if column != 'is_promoted']
feature_scaler = RobustScaler().fit(train_encode[feature_columns])

# Scale in place, exactly as data_scaling() does, because later cells read
# train_encode / test_encode and expect them to hold the scaled values.
# NOTE: test_encode must contain every training feature column; a missing one raises KeyError here.
train_encode[feature_columns] = feature_scaler.transform(train_encode[feature_columns])
test_encode[feature_columns]  = feature_scaler.transform(test_encode[feature_columns])
train_scale=train_encode
test_scale=test_encode

## Split target variable and predictors

In [None]:
X=train_scale.drop(['is_promoted'],axis=1)
Y=train.is_promoted

## Test train split

In [None]:
from sklearn.model_selection import train_test_split
# Seed the stratified split with the notebook-wide random state `rs`; without it the
# hold-out set (and every score computed on it) changes on each run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2,stratify=Y,random_state=rs)

## OVERSAMPLING
### Checking whether the data is balanced or imbalanced 

In [None]:
Y_train.value_counts(normalize=True)*100

###  Clearly the data is imbalanced, with only about 8% of 1s, so we have to resample the training data to make it balanced
### using the SMOTETomek method (SMOTE over-sampling followed by Tomek-link cleaning) 

In [None]:
def oversample(X,Y):
    """Balance the classes of ``(X, Y)`` with SMOTETomek and return the resampled pair.

    Parameters
    ----------
    X : feature matrix to resample.
    Y : class labels belonging to ``X``.

    Returns
    -------
    (X_over, Y_over) : the resampled features and labels.
    """
    # SMOTETomek is not imported anywhere else in this notebook, so a fresh kernel
    # hit a NameError here; import it where it is used.
    from imblearn.combine import SMOTETomek

    # seeded with the notebook-wide random state `rs`
    over_sample = SMOTETomek(random_state=rs)
    X_over,Y_over = over_sample.fit_resample(X,Y)
    return X_over,Y_over

In [None]:
X_train_os,Y_train_os=oversample(X_train,Y_train)

### FEATURE SELECTION

In [None]:
# Rank the features with the impurity-based importances of a random forest.
# NOTE: the forest is fitted on X_train / Y_train (the split before oversampling, not
# X_train_os) and uses random_state=0 rather than the notebook-wide `rs`.
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, Y_train)

### FEATURE SCORES 
# One importance per feature column, sorted from most to least important.
feature_scores = pd.Series(clf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print(feature_scores)

### PLOT TO VISUALIZE
sns.barplot(x=feature_scores, y=feature_scores.index)
# Add labels to the graph
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
# Add title to the graph
plt.title("Visualizing Important Features")
# Visualize the graph
plt.show()


## Model Building 

#### Here I have used multiple algorithms, in the following order: 
#### 1.Randomforest 
#### 2.Decision tree
#### 3.CatBoost
#### 4.XG Boost 
#### 5.LGBM 

## 1.RANDOMFOREST CLASSIFIER

In [None]:
from sklearn.ensemble          import RandomForestClassifier
from sklearn.tree              import DecisionTreeClassifier

from sklearn.metrics           import accuracy_score
from sklearn.metrics           import classification_report
from sklearn.metrics           import confusion_matrix

from sklearn.model_selection   import RandomizedSearchCV
from sklearn.model_selection   import KFold,cross_val_score


In [None]:
## PASSING THE TRAIN DATA IN THE CROSS VALIDATION
kf=KFold(n_splits=5,random_state=rs,shuffle=True)
cnt = 1
# split()  method generate indices to split data into training and test set.
for train_index, test_index in kf.split(X_train_os, Y_train_os):
    print(f'Fold:{cnt}, Train set: {len(train_index)}, Test set:{len(test_index)}')
    cnt+=1

In [None]:
score = cross_val_score(RandomForestClassifier(random_state= rs), X_train_os, Y_train_os, cv= kf, scoring="accuracy")
print(f'Scores for each fold are: {score}')
print(f'Average score: {"{:.2f}".format(score.mean())}')

In [None]:
rfc=RandomForestClassifier(random_state=rs)
rfc.fit(X_train_os,Y_train_os)
y_pred_rf=rfc.predict(X_train_os)
print(accuracy_score(y_pred_rf,Y_train_os))

In [None]:
cm = confusion_matrix(Y_train_os, y_pred_rf)
plt.rcParams['figure.figsize'] = (5, 5)
sns.heatmap(cm, annot = True, annot_kws = {'size':15}, cmap = 'PuBu',fmt=".1f")

In [None]:
y_pred_test=rfc.predict(X_test)
print(accuracy_score(y_pred_test,Y_test))

In [None]:
cm = confusion_matrix(Y_test, y_pred_test)
plt.rcParams['figure.figsize'] = (5, 5)
sns.heatmap(cm, annot = True, annot_kws = {'size':15}, cmap = 'PuBu',fmt=".1f")

In [None]:
print(classification_report(Y_test,y_pred_test))

### Hyperparameter Tuning 
- RANDOM FOREST CLASSIFIER


In [None]:
#Randomized Search CV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
# Number of features to consider at every split.
# 'auto' was only an alias of 'sqrt' for classifiers and is rejected by
# scikit-learn >= 1.3, so the search uses two distinct, still-supported settings.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
# max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

In [None]:
random_grid={'n_estimators': n_estimators,
            'max_features': max_features,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf}

In [None]:
rf=RandomForestClassifier()

In [None]:
rf_random = RandomizedSearchCV(estimator = rf,param_distributions = random_grid,
                                scoring='f1',
                                n_iter = 10, cv = 5,
                                verbose=2, random_state=rs,
                                n_jobs = 1)
rf_random.fit(X_train_os,Y_train_os)

In [None]:
rf_random.best_params_

- {'n_estimators': 1100,
- 'min_samples_split': 5,
- 'min_samples_leaf': 1,
- 'max_features': 'auto',
- 'max_depth': 20}

In [None]:
# Random forest rebuilt with the hyper-parameters reported by the randomised search.
# max_features='sqrt' is what 'auto' meant for classifiers; the 'auto' spelling is
# rejected by scikit-learn >= 1.3.
rfc= RandomForestClassifier(random_state=rs,
                            n_estimators=1100,
                            min_samples_split=5,
                            max_features='sqrt',
                            min_samples_leaf= 1,
                            max_depth=20,oob_score=True)
rfc.fit(X_train_os,Y_train_os)

In [None]:
y_pred_rf_ht=rfc.predict(X_train_os)
print(accuracy_score(y_pred_rf_ht,Y_train_os))
cm = confusion_matrix(Y_train_os, y_pred_rf_ht)
plt.rcParams['figure.figsize'] = (5, 5)
sns.heatmap(cm, annot = True, annot_kws = {'size':15}, cmap = 'PuBu',fmt=".1f")

In [None]:
y_pred_test_ht=rfc.predict(X_test)
print(accuracy_score(y_pred_test_ht,Y_test))

In [None]:
cm = confusion_matrix(Y_test, y_pred_test_ht)
plt.rcParams['figure.figsize'] = (5, 5)
sns.heatmap(cm, annot = True, annot_kws = {'size':15}, cmap = 'PuBu',fmt=".1f")

## 2. DECISION TREE


In [None]:
dtc=DecisionTreeClassifier(random_state=rs)
score = cross_val_score(dtc, X_train_os, Y_train_os, cv= kf, scoring="accuracy")
print(f'Scores for each fold are: {score}')
print(f'Average score: {"{:.2f}".format(score.mean())}')

### Hyperparameter Tuning 
- Decision Tree CLASSIFIER


In [None]:
def hyperparameter_tuning(X,Y,rf):
    """Randomised search over tree hyper-parameters; returns the best parameter dict.

    Parameters
    ----------
    X, Y : training features and labels handed to ``RandomizedSearchCV.fit``.
    rf : the estimator to tune (the notebook passes a DecisionTreeClassifier).

    Returns
    -------
    dict : ``best_params_`` of the search (10 sampled candidates, 5-fold CV,
    scored by F1, seeded with the notebook-wide ``rs``).

    Note: a later cell defines another ``hyperparameter_tuning`` with a different
    signature; once that cell has run, this name refers to the later definition.
    """
    # Number of features to consider at every split.
    # 'auto' was only an alias of 'sqrt' for classifiers and is rejected by
    # scikit-learn >= 1.3, so the search uses two distinct, still-supported settings.
    max_features = ['sqrt', 'log2']
    # Maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10, 15, 100]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 5, 10]

    random_grid={
            'max_features': max_features,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf}

    rf_random = RandomizedSearchCV(estimator = rf,param_distributions = random_grid,
                                scoring='f1', 
                                n_iter = 10, cv = 5,
                                verbose=0, random_state=rs,
                                n_jobs = 1)
    rf_random.fit(X,Y)
    return rf_random.best_params_

In [None]:
param_dt=hyperparameter_tuning(X_train_os,Y_train_os,dtc)

In [None]:
## Printing the best parameters obtained after randomizesearch CV or hyperparameter tuning
print(param_dt)

- {'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 25}

In [None]:
# Decision tree rebuilt with the hyper-parameters reported by the randomised search.
# max_features='sqrt' is what 'auto' meant for classifiers; the 'auto' spelling is
# rejected by scikit-learn >= 1.3.
dtc=DecisionTreeClassifier(random_state=rs,
                           min_samples_split=10,
                           min_samples_leaf=2,
                           max_features='sqrt',
                           max_depth=25)
dtc.fit(X_train_os,Y_train_os)
y_pred_train_dc_ht=dtc.predict(X_train_os)
y_pred_test_dt_ht=dtc.predict(X_test)
print('Test Accuracy',accuracy_score(y_pred_test_dt_ht,Y_test))
print('Train Accuracy',accuracy_score(y_pred_train_dc_ht,Y_train_os))

# Boosting algorithms

In [None]:
# Boosting Algorithms :
from xgboost                          import XGBClassifier
from catboost                         import CatBoostClassifier
from lightgbm                         import LGBMClassifier

from scipy.stats                      import randint

### Hyperparameter Tuning 
- CatBoostClassifier


In [None]:
mod= CatBoostClassifier(random_state=rs)

par={'max_depth':[5,10,None],
              'n_estimators':[200,300,400,500,600],'learning_rate':[0.1,0.01,0.001]}
def hyperparameter_tuning(mod,param_d,p,q):
    """Randomised hyper-parameter search for a boosting model, scored by ROC AUC.

    Parameters
    ----------
    mod : the estimator to tune.
    param_d : parameter distributions / candidate lists handed to ``RandomizedSearchCV``.
    p, q : training features and labels.

    Returns
    -------
    (ht_params, ht_score) : ``best_params_`` and ``best_score_`` of the 9-fold search.

    Note: this replaces the earlier ``hyperparameter_tuning(X, Y, rf)`` defined for
    the decision tree, because both cells use the same function name.
    """
    # Seed the candidate sampling with the notebook-wide `rs`, as the earlier
    # searches in this notebook do, so the tuning result is reproducible.
    rdmsearch=  RandomizedSearchCV(mod, param_distributions=param_d,n_jobs=-1,cv=9,scoring='roc_auc',random_state=rs)
    rdmsearch.fit(p,q)
    ht_params = rdmsearch.best_params_
    ht_score = rdmsearch.best_score_
    return ht_params, ht_score


rf_parameters, rf_ht_score = hyperparameter_tuning(mod, par,  X_train_os,Y_train_os)


In [None]:
print(rf_parameters, rf_ht_score)

In [None]:
mod=XGBClassifier(random_state=rs)
param_tuning = {
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5, 7, 10], 
        'min_child_weight': [1, 3, 5],
        'subsample': [0.5, 0.7],
        'colsample_bytree': [0.5, 0.7],
        'n_estimators' : [100, 200, 500]
                }
rf_parameters_xgb, rf_ht_score_xgb = hyperparameter_tuning(mod, param_tuning,  X_train_os,Y_train_os)

In [None]:
print(rf_parameters_xgb, rf_ht_score_xgb)

In [None]:
lgb = LGBMClassifier()
lgb.fit(X_train_os,Y_train_os)

lgb_pred = lgb.predict(X_test)

print("Training Accuracy :", lgb.score(X_train_os, Y_train_os))

In [None]:
# Create a Dictionary (Key->Value Pairs) for "ML Model Name"-> "ML Model Functions with Hyper-Parameters" :

Classifiers = {'0.XGBoost' : XGBClassifier(learning_rate =0.1, 
                                           n_estimators=394, 
                                           max_depth=10, 
                                           subsample = 0.50, 
                                           verbosity = 0,
                                           scale_pos_weight = 2.5,
                                           updater ="grow_histmaker",
                                           base_score  = 0.2,
                                          min_child_weight=1),
                            
               '1.CatBoost' : CatBoostClassifier(learning_rate=0.1, 
                                                 n_estimators=300, 
                                                 subsample=0.085, 
                                                 max_depth=10, 
                                                 scale_pos_weight=2.5),
               
               '2.LightGBM' : LGBMClassifier(subsample_freq = 2, 
                                             objective ="binary",
                                             importance_type = "gain",
                                             verbosity = -1, 
                                             max_bin = 60,
                                             num_leaves = 300,
                                             boosting_type = 'dart',
                                             learning_rate=0.10, 
                                             n_estimators=494,
                                             max_depth=10, 
                                             scale_pos_weight=2.5)
 }

print( list(Classifiers.keys()) )
print("--#--"*25)
print( list(Classifiers.values()) )

**Voting classifier used** - A voting classifier is a machine learning model that trains an ensemble of several models and predicts the output class by combining their individual predictions, choosing the class with the highest combined probability.

 **Soft Voting** -In soft voting, the output class is the prediction based on the average of probability given to that class. Suppose given some input to three models, the prediction probability for class A = (0.30, 0.47, 0.53) and B = (0.20, 0.32, 0.40). So the average for class A is 0.4333 and B is 0.3067, the winner is clearly class A because it had the highest probability averaged by each classifier.


In [None]:
from sklearn.ensemble import VotingClassifier
# Soft-voting ensemble of the three boosted models stored in `Classifiers`;
# the weights give LightGBM slightly more say (5.2) than XGBoost and CatBoost (5 each).
voting_model = VotingClassifier(estimators=[
                                              ('XGBoost_Best', list(Classifiers.values())[0]), 
                                              ('CatBoost_Best', list(Classifiers.values())[1]),
                                              ('LightGBM_Best', list(Classifiers.values())[2]),
                                             ], 
                                              voting='soft',weights=[5,5,5.2])

# Train on the oversampled training split.
voting_model.fit(X_train_os,Y_train_os) 

# Take column 1 of predict_proba for every row of the competition test set.
# NOTE: `test_encode` holds SCALED values here, because data_scaling() assigns the
# scaled columns back into the frame it is given and test_encode was passed to it.
predictions_of_voting = voting_model.predict_proba( test_encode )[::,1]

In [None]:
predictions_of_voting


In [None]:
y_pred_class = [int(round(value)) for value in predictions_of_voting]

In [None]:
### Final ensemble model after hyperparameter tuning 

In [None]:
# Data Visualization Libraries :
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.io as pio
import plotly.graph_objects as go
import plotly.express as px

In [None]:
Classifiers = {'0.XGBoost' : XGBClassifier(learning_rate =0.1, 
                                           n_estimators=394, 
                                           max_depth=5,
                                           subsample = 0.70, 
                                           verbosity = 0,
                                           scale_pos_weight = 2.5,
                                           updater ="grow_histmaker",
                                           base_score  = 0.2),
               
            
               '1.CatBoost' : CatBoostClassifier(learning_rate=0.15, 
                                                 n_estimators=300, 
                                                 max_depth=5, 
                                                 scale_pos_weight=2.5,
                                                verbose=False),
               
               '2.LightGBM' : LGBMClassifier(learning_rate=0.15, 
                                             n_estimators=494,
                                             subsample_freq = 2, 
                                             objective ="binary",
                                             importance_type = "gain",
                                             verbosity = -1, 
                                             max_bin = 60,
                                             num_leaves = 300,
                                             boosting_type = 'dart',                                            
                                             max_depth=5, 
                                             scale_pos_weight=2.5)
                }

print( list(Classifiers.keys()) )

clf1 = list(Classifiers.values())[0]
clf2 =list(Classifiers.values())[1]
clf3 = list(Classifiers.values())[2]

eclf = VotingClassifier(estimators=[('xgboost', clf1), 
                                    ('catboost', clf2), 
                                    ('lgbm', clf3)],
                        voting='soft',
                        weights=[5, 5, 5.2])

# Fit every model on the oversampled HR training data and predict class probabilities
# for the hold-out split. Previously the models were fitted on a four-row toy array
# with labels 1/2, which also overwrote the notebook's feature matrix `X`, so the
# chart below did not describe the promotion data at all.
probas = [c.fit(X_train_os, Y_train_os).predict_proba(X_test) for c in (clf1, clf2, clf3, eclf)]

# get class probabilities for the first sample of the hold-out split
class1_1 = [pr[0, 0] for pr in probas]
class2_1 = [pr[0, 1] for pr in probas]

# plotting

N = 4  # number of groups
ind = np.arange(N)  # group positions
width = 0.35  # bar width

fig, ax = plt.subplots()

# bars for classifier 1-3
p1 = ax.bar(ind, np.hstack(([class1_1[:-1], [0]])), width,
            color='green', edgecolor='k')
p2 = ax.bar(ind + width, np.hstack(([class2_1[:-1], [0]])), width,
            color='lightgreen', edgecolor='k')

# bars for VotingClassifier
p3 = ax.bar(ind, [0, 0, 0, class1_1[-1]], width,
            color='blue', edgecolor='k')
p4 = ax.bar(ind + width, [0, 0, 0, class2_1[-1]], width,
            color='steelblue', edgecolor='k')

# plot annotations
plt.axvline(2.8, color='k', linestyle='dashed')
ax.set_xticks(ind + width)
ax.set_xticklabels(['XGBoost\nweight 5',
                    'CatBoost\nweight 5',
                    'LightGBM\nweight 5.2',
                    'VotingClassifier\n(average probabilities)'],
                   rotation=40,
                   ha='right')
plt.ylim([0, 1])
plt.title('Class probabilities for sample 1 by different classifiers')
plt.legend([p1[0], p2[0]], ['is_promoted=No', 'is_promoted=Yes'], loc='upper right')
plt.tight_layout()
plt.show()

# If you like this kernel please upvote and make this kernel reach more people