![](https://icma.org/sites/default/files/Talent-Development.jpg)

In [None]:
import numpy as np 
import pandas as pd
import plotly as py
import plotly.graph_objs as go
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected = True)
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score

from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

pd.set_option('display.max_columns', None)

In [None]:
# Load aug_train.csv from the Kaggle input directory into the working frame `hr`.
hr = pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/aug_train.csv')

# Basic information

In [None]:
# Preview the first rows to see which columns are available.
hr.head()

In [None]:
# Summarise column dtypes and non-null counts.
hr.info()

# Preparing data for EDA 

The missing values in this data cannot be imputed easily: if they are restored incorrectly, you may see correlations that do not actually exist and, more generally, the logic of the data may be lost. Therefore, the EDA is performed on all available data, while for modeling all rows with missing values are dropped.

In [None]:
# Drop the identifier and city columns, which are not used in the rest of the notebook.
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
hr = hr.drop(columns = ['enrollee_id', 'city'], errors = 'ignore')

In [None]:
# Inspect the raw company_size labels before they are normalised in the next cell.
hr['company_size'].unique()

In [None]:
# Rewrite the slash-separated label '10/49' as '10-49'.
# Series.replace is vectorised and avoids the chained assignment
# (hr[col][i] = ...) that pandas warns about and that does not write
# back to the frame when copy-on-write is enabled.
hr['company_size'] = hr['company_size'].replace({'10/49': '10-49'})

In [None]:
# Inspect the raw experience labels; the next cell recodes '>20' and '<1'.
hr['experience'].unique()

In [None]:
# Recode the open-ended experience labels as numeric strings so the column can
# later be cast to int: '>20' -> '21' and '<1' -> '0'. Missing values are left as-is.
# Vectorised replace instead of a per-row loop with chained assignment.
hr['experience'] = hr['experience'].replace({'>20': '21', '<1': '0'})

In [None]:
# Inspect the raw last_new_job labels; the next cell recodes '>4' and 'never'.
hr['last_new_job'].unique()

In [None]:
# Recode the non-numeric last_new_job labels as numeric strings so the column can
# later be cast to int: '>4' -> '5' and 'never' -> '0'. Missing values are left as-is.
# Vectorised replace instead of a per-row loop with chained assignment.
hr['last_new_job'] = hr['last_new_job'].replace({'>4': '5', 'never': '0'})

In [None]:
# Human-readable labels for the binary target, used throughout the EDA plots.
retarget = {0.0: 'Not looking for job change',
           1.0: 'Looking for job change'}
# Values that are not keys of `retarget` (e.g. labels that were already
# translated) pass through unchanged, so re-running this cell does not turn the
# whole column into NaN the way a plain .map(retarget) would.
hr['target'] = hr['target'].map(lambda value: retarget.get(value, value))

# EDA

In [None]:
# Row count per target label, as a two-column frame (target, count).
target = (
    hr.groupby('target')
      .agg({'target': 'count'})
      .rename(columns = {'target': 'count'})
      .reset_index()
)

# Donut chart of the class balance; the title is positioned inside the hole.
fig = px.pie(target, names = 'target', values = 'count')
fig.update_traces(
    hole = 0.5,
    textinfo = 'percent + label',
    textposition = 'inside',
    marker = dict(
        colors = ['#32384D','#E29930'],
        line = dict(color = 'white', width = 2),
    ),
)
fig.update_layout(
    showlegend = False,
    title = dict(
        text = 'Job search',
        x = 0.5,
        y = 0.53,
        font = dict(size = 32, family = 'Calibri Black', color = 'black'),
    ),
)

fig.show()

# Distribution of job changing by gender, relevant experience, education, enrolled university, major discipline and company type

In [None]:
def count_by_target(frame, column):
    """Count rows for every (column value, target) combination.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data containing `column` and a 'target' column.
    column : str
        Name of the categorical column to break the target down by.

    Returns
    -------
    pandas.DataFrame
        Columns [column, 'target', 'count'], one row per observed combination.
        Rows whose `column` or 'target' is missing are excluded by groupby.
    """
    return frame.groupby([column, 'target']).size().reset_index(name = 'count')


# One aggregated frame per categorical feature, consumed by the bar charts below.
gender = count_by_target(hr, 'gender')
experience = count_by_target(hr, 'relevent_experience')
education_level = count_by_target(hr, 'education_level')
enrolled = count_by_target(hr, 'enrolled_university')
major_discipline = count_by_target(hr, 'major_discipline')
company_type = count_by_target(hr, 'company_type')

In [None]:
# Apply the seaborn style before any axes exist so all six panels share it
# (previously it was set after the first subplot had already been created).
sns.set_style('white')

fig = plt.figure(figsize = (22, 22))
fig.patch.set_facecolor('#fafafa')

# (panel title, aggregated frame, category column), in subplot order.
count_panels = [
    ('Gender', gender, 'gender'),
    ('Relevent experience', experience, 'relevent_experience'),
    ('Education', education_level, 'education_level'),
    ('Enrolled university', enrolled, 'enrolled_university'),
    ('Major discipline', major_discipline, 'major_discipline'),
    ('Company type', company_type, 'company_type'),
]

for position, (title, frame, column) in enumerate(count_panels, start = 1):
    ax = plt.subplot(3, 2, position)
    ax.set_title(title, size = 14)
    sns.barplot(data = frame, x = column, y = 'count', hue = 'target',
                palette = ['#32384D', '#E29930'], ax = ax)

    # Write each bar's count just above the bar.
    for bar in ax.patches:
        height = bar.get_height()
        if not height > 0:
            # Skip empty / NaN-height patches so no stray labels are drawn.
            continue
        ax.annotate(f'{height:g}', (bar.get_x() + bar.get_width() / 2, height),
                    ha = 'center', va = 'center',
                    size = 10,
                    xytext = (0, 5),
                    textcoords = 'offset points')

    ax.grid(color = 'gray', linestyle = ':', axis = 'y', zorder = 0, dashes = (1, 7))
    ax.set_ylabel('')
    ax.set_xlabel('')

    # Only the first panel keeps a legend; the colours mean the same everywhere.
    if position == 1:
        ax.legend(loc = 'upper left')
    elif ax.get_legend() is not None:
        # Remove the legend outright instead of replacing it with an empty one.
        ax.get_legend().remove()

plt.show()

In [None]:
def percent(data):
    """Add a 'percent' column giving each row's share of its group's total count.

    The frame is modified in place. Groups are defined by the values of the
    first column, and counts are read from the third column (the layout
    [category, target, count] of the aggregated frames in this notebook).

    Totals are computed per group rather than by pairing even and odd rows, so
    the result is still correct when a category has only one target value or
    the rows are not ordered in pairs.

    Parameters
    ----------
    data : pandas.DataFrame
        Aggregated frame with the category in column 0 and the count in column 2.

    Returns
    -------
    None
        `data` gains (or has overwritten) a float 'percent' column rounded to
        one decimal place.
    """
    keys = data.iloc[:, 0]
    counts = data.iloc[:, 2]
    group_totals = counts.groupby(keys).transform('sum')
    data['percent'] = (counts / group_totals * 100).round(1)
            
# Add the in-place 'percent' column to every aggregated frame used by the charts.
for aggregated in (gender, experience, education_level,
                   enrolled, major_discipline, company_type):
    percent(aggregated)

In [None]:
# Apply the seaborn style before any axes exist so all six panels share it
# (previously it was set after the first subplot had already been created).
sns.set_style('white')

fig = plt.figure(figsize = (22, 22))
fig.patch.set_facecolor('#fafafa')

# (panel title, aggregated frame, category column), in subplot order.
percent_panels = [
    ('Gender', gender, 'gender'),
    ('Relevent experience', experience, 'relevent_experience'),
    ('Education', education_level, 'education_level'),
    ('Enrolled university', enrolled, 'enrolled_university'),
    ('Major discipline', major_discipline, 'major_discipline'),
    ('Company type', company_type, 'company_type'),
]

for position, (title, frame, column) in enumerate(percent_panels, start = 1):
    ax = plt.subplot(3, 2, position)
    ax.set_title(title, size = 14)
    sns.barplot(data = frame, x = column, y = 'percent', hue = 'target',
                palette = ['#32384D', '#E29930'], ax = ax)

    # Write each bar's percentage just above the bar.
    for bar in ax.patches:
        height = bar.get_height()
        if not height > 0:
            # Skip empty / NaN-height patches so no stray labels are drawn.
            continue
        ax.annotate(f'{height:g}%', (bar.get_x() + bar.get_width() / 2, height),
                    ha = 'center', va = 'center',
                    size = 10,
                    xytext = (0, 5),
                    textcoords = 'offset points')

    ax.grid(color = 'gray', linestyle = ':', axis = 'y', zorder = 0, dashes = (1, 7))
    ax.set_ylabel('')
    ax.set_xlabel('')

    # Only the second panel keeps a legend (top-right), as in the original layout.
    if position == 2:
        ax.legend(loc = 'upper right')
    elif ax.get_legend() is not None:
        # Remove the legend outright instead of replacing it with an empty one.
        ax.get_legend().remove()

plt.show()

# Distribution of job changing by CDI, experience, company size, last job and training hours

In [None]:
# Copies of `hr` restricted to rows where the column is present, with that column
# cast to int for the density plots. dropna(...).astype(...) returns a new frame,
# which avoids assigning into a filtered slice of `hr` (SettingWithCopyWarning).
exp_no_nan = hr.dropna(subset = ['experience']).astype({'experience': 'int'})

lastjob_no_nan = hr.dropna(subset = ['last_new_job']).astype({'last_new_job': 'int'})

In [None]:
# Apply the seaborn style before any axes exist so all four panels share it.
sns.set_style('white')

fig = plt.figure(figsize = (20, 20))
fig.patch.set_facecolor('#fafafa')

# (panel title, source frame, numeric column), in subplot order.
density_panels = [
    ('City Development Index', hr, 'city_development_index'),
    ('Years of experience', exp_no_nan, 'experience'),
    ('Training hours', hr, 'training_hours'),
    ('Years from last job to current job', lastjob_no_nan, 'last_new_job'),
]

# Same colour per target label as in the bar, pie and sunburst charts.
target_colors = {'Looking for job change': '#32384D',
                 'Not looking for job change': '#E29930'}

for position, (title, frame, column) in enumerate(density_panels, start = 1):
    ax = plt.subplot(2, 2, position)
    ax.set_title(title, size = 14)

    # One filled density curve per target label.
    # `fill` replaces the deprecated `shade` argument of kdeplot.
    for label, color in target_colors.items():
        sns.kdeplot(frame.loc[frame['target'] == label, column],
                    color = color, fill = True, label = label, alpha = 0.5, ax = ax)

    ax.grid(color = 'gray', linestyle = ':', axis = 'x', zorder = 0, dashes = (1, 7))
    ax.set_ylabel('')
    ax.set_xlabel('')
    ax.set_yticks([])  # density values are not meaningful on their own here

    # Only the first panel carries the legend.
    if position == 1:
        ax.legend(loc = 'upper left')

plt.show()

In [None]:
# Row count per (target, company_size) pair; rows with a missing company size are
# excluded by groupby.
cs = (
    hr.groupby(['target', 'company_size'])
      .agg({'target': 'count'})
      .rename(columns = {'target': 'count'})
      .reset_index()
)

# Inner ring: target label; outer ring: company size within that label.
fig = px.sunburst(
    cs,
    path = ['target', 'company_size'],
    values = 'count',
    color = 'target',
    color_discrete_map = {'Looking for job change': '#32384D', 'Not looking for job change': '#E29930'},
    width = 700,
    height = 700,
)

# The chart title is drawn as an annotation placed above the plot area.
title_annotation = dict(
    text = 'Affect of company size on the desire to change job',
    x = 0.5,
    y = 1.1,
    showarrow = False,
    font = dict(size = 24, family = 'Calibri Black', color = 'black'),
)
fig.update_layout(annotations = [title_annotation])

# Show each sector's label and its share of the parent sector.
fig.update_traces(textinfo = 'label + percent parent')

fig.show()

# Conclusions of EDA

1. People with no relevant experience are more inclined to search for a new job.
2. Specialists with graduate education are more likely than others to look for a new job.
3. People who signed up for the full time course are more likely than others to look for a new job.
4. People who have a major discipline STEM (Science, Technology, Engineering and Mathematics) are more likely than others to look for a new job.
5. The CDI (City Development Index) plays a big role in the desire to change jobs: more than half of the specialists in cities with a low CDI are looking for a new job, whereas in cities with a high CDI the situation is, unsurprisingly, the opposite — more than half of the specialists are not interested in finding a new job.
6. People working in Data Science for the first 8 years are more likely to look for a new job, and more than half of the specialists working in this field for more than 20 years are not looking for a new job.

# Preparing data for modeling

In [None]:
# Numeric encoding of the target labels used during the EDA.
retarget2 = {'Not looking for job change': 0, 'Looking for job change': 1}

# Build the modelling frame in a single chain:
#   1. drop every row with a missing value,
#   2. turn the target labels back into 0/1 (values that are already numeric pass
#      through unchanged, so re-running the cell does not produce NaN targets),
#   3. cast the recoded experience / last_new_job strings to integers.
hr = (
    hr.dropna()
      .assign(target = lambda frame: frame['target']
                                       .map(lambda value: retarget2.get(value, value))
                                       .astype('int'))
      .astype({'experience': 'int', 'last_new_job': 'int'})
)

Some more EDA :) The correlation map was not made earlier, because there were missing values and it was impossible to convert the necessary columns to a numeric format.

In [None]:
# Correlate numeric columns only: the frame still contains string columns, and
# DataFrame.corr() raises on them in pandas >= 2.0. Compute the matrix once and reuse it.
corr = hr.select_dtypes(include = 'number').corr()

# Mask the upper triangle (diagonal included) so each pair is shown once.
matrix = np.triu(corr)
plt.figure(figsize=(13, 10))
sns.heatmap(corr, annot = True, cmap = 'YlOrBr', fmt=".2f", mask = matrix,
            vmin = -1, vmax = 1, linewidths = 0.1, linecolor = 'white', cbar = False)
plt.show()

In [None]:
# Split the frame into the feature matrix and the target vector.
X = hr.drop(columns = ['target'])
y = hr['target']

# 'number' matches every numeric width (e.g. int32 as well as int64), so integer
# columns are not silently skipped on platforms where the default int is 32-bit.
num_cols = X.select_dtypes(include = 'number').columns.to_list()
cat_cols = X.select_dtypes(include = ['object']).columns.to_list()

def label_encoder(df, columns = None):
    """Replace categorical columns of `df` with integer codes, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are overwritten with LabelEncoder codes.
    columns : list of str, optional
        Columns to encode. Defaults to the notebook-level `cat_cols`.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, returned for convenience.
    """
    if columns is None:
        columns = cat_cols
    for name in columns:
        # A fresh encoder per column: codes are independent between columns.
        df[name] = LabelEncoder().fit_transform(df[name])
    return df

# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split performed later in the notebook, so test-set statistics leak into the
# scaled features. Consider fitting it on the training split only.
sc = StandardScaler()
X[num_cols] = sc.fit_transform(X[num_cols])

X = label_encoder(X)

X.head()

# Modeling

In [None]:
# Class balance of the target after rows with missing values were dropped.
hr['target'].value_counts()

Modeling on imbalanced data is not a good idea, so I will use SMOTE (Synthetic Minority Over-sampling Technique), one of the most commonly used resampling techniques for addressing class imbalance.

In [None]:
from imblearn.over_sampling import SMOTE

# Hold out 20% of the rows for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 2021)

# Oversample the training portion only; the hold-out set keeps its original
# class balance. The seed makes the synthetic samples, and therefore every
# downstream score, reproducible across runs.
smote = SMOTE(random_state = 2021)

X_train_balanced, Y_train_balanced = smote.fit_resample(X_train, y_train)

# Show the class counts after resampling.
Y_train_balanced.value_counts()

**For modeling I will use 5 models:**

1. Logistic Regression
2. Random Forest
3. XGB
4. LGBM
5. CatBoost

I will also check each model with 5-fold cross-validation.

In [None]:
# Empty score table: one column per model, one row per hold-out metric
# (filled by position in the model cells: 0 precision, 1 recall, 2 F1, 3 ROC AUC).
model_names = ['LR', 'RF', 'XGB', 'LGBM', 'CB']
results = pd.DataFrame(index = range(4), columns = model_names)

# Logistic Regression

In [None]:
# Fit on the SMOTE-balanced training data; score on the untouched hold-out split.
lg = LogisticRegression(random_state = 2021)
lg.fit(X_train_balanced, Y_train_balanced)
y_pred = lg.predict(X_test)
y_prob = lg.predict_proba(X_test)[:, 1]  # predicted probability of class 1

# Metrics: column 0 of `results` is LR; rows are precision, recall, F1, ROC AUC.
# The hold-out ROC AUC is computed once and reused for the table and the printout.
test_roc_auc = round(roc_auc_score(y_test, y_prob), 3)
results.iloc[0, 0] = round(precision_score(y_test, y_pred), 2)
results.iloc[1, 0] = round(recall_score(y_test, y_pred), 2)
results.iloc[2, 0] = round(f1_score(y_test, y_pred), 2)
results.iloc[3, 0] = test_roc_auc
lg_cm = confusion_matrix(y_test, y_pred)

print(classification_report(y_test, y_pred))
print(f'ROC AUC score: {test_roc_auc}')
print('\n-----------------------------------------------------\n')
print('Cross-validation scores with 5 folds:')
print('')
# NOTE(review): cross-validation runs on the already oversampled training set, so
# synthetic samples derived from a fold's validation rows can end up in its
# training rows; these scores are likely optimistic compared with the hold-out ones.
for label, scoring, digits in [('ROC AUC', 'roc_auc', 3), ('precision', 'precision', 2),
                               ('recall', 'recall', 2), ('f1', 'f1', 2)]:
    cv_mean = cross_val_score(lg, X_train_balanced, Y_train_balanced, cv = 5, scoring = scoring).mean()
    print(f'{label}: {round(cv_mean, digits)}')

# Visualize confusion matrix
plt.figure(figsize = (8, 5))
sns.heatmap(lg_cm, cmap = 'YlOrBr', annot = True, fmt = 'd', linewidths = 5, cbar = False, annot_kws = {'fontsize': 15},
            yticklabels = ['Not looking for job change', 'Looking for job change'], xticklabels = ['Predicted not looking for job change', 'Predicted looking for job change'])
plt.yticks(rotation = 0)
plt.show()

# Roc curve
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(false_positive_rate, true_positive_rate)

sns.set_theme(style = 'white')
plt.figure(figsize = (8, 8))
plt.plot(false_positive_rate, true_positive_rate, color = '#b01717', label = 'AUC = %0.3f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1], linestyle = '--', color = '#174ab0')  # chance-level diagonal
plt.axis('tight')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# Feature importance: absolute coefficient per feature, largest first.
# Built from the actual column list instead of a hard-coded count of 11 features.
f_imp = pd.DataFrame({'feature': X_train_balanced.columns.to_list(),
                      'importance (abs coef)': np.abs(lg.coef_[0])})
f_imp = f_imp.sort_values('importance (abs coef)', ascending = False)
f_imp.style.background_gradient(cmap = 'YlOrBr')

# Random Forest

In [None]:
# Fit on the SMOTE-balanced training data; score on the untouched hold-out split.
rf = RandomForestClassifier(random_state = 2021, max_depth = 5)
rf.fit(X_train_balanced, Y_train_balanced)
y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)[:, 1]  # predicted probability of class 1

# Metrics: column 1 of `results` is RF; rows are precision, recall, F1, ROC AUC.
# The hold-out ROC AUC is computed once and reused for the table and the printout.
test_roc_auc = round(roc_auc_score(y_test, y_prob), 3)
results.iloc[0, 1] = round(precision_score(y_test, y_pred), 2)
results.iloc[1, 1] = round(recall_score(y_test, y_pred), 2)
results.iloc[2, 1] = round(f1_score(y_test, y_pred), 2)
results.iloc[3, 1] = test_roc_auc
rf_cm = confusion_matrix(y_test, y_pred)

print(classification_report(y_test, y_pred))
print(f'ROC AUC score: {test_roc_auc}')
print('\n-----------------------------------------------------\n')
print('Cross-validation scores with 5 folds:')
print('')
# NOTE(review): cross-validation runs on the already oversampled training set, so
# synthetic samples derived from a fold's validation rows can end up in its
# training rows; these scores are likely optimistic compared with the hold-out ones.
for label, scoring, digits in [('ROC AUC', 'roc_auc', 3), ('precision', 'precision', 2),
                               ('recall', 'recall', 2), ('f1', 'f1', 2)]:
    cv_mean = cross_val_score(rf, X_train_balanced, Y_train_balanced, cv = 5, scoring = scoring).mean()
    print(f'{label}: {round(cv_mean, digits)}')

# Visualize confusion matrix
plt.figure(figsize = (8, 5))
sns.heatmap(rf_cm, cmap = 'YlOrBr', annot = True, fmt = 'd', linewidths = 5, cbar = False, annot_kws = {'fontsize': 15},
           yticklabels = ['Not looking for job change', 'Looking for job change'], xticklabels = ['Predicted not looking for job change', 'Predicted looking for job change'])
plt.yticks(rotation = 0)
plt.show()

# Roc curve
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(false_positive_rate, true_positive_rate)

sns.set_theme(style = 'white')
plt.figure(figsize = (8, 8))
plt.plot(false_positive_rate, true_positive_rate, color = '#b01717', label = 'AUC = %0.3f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1], linestyle = '--', color = '#174ab0')  # chance-level diagonal
plt.axis('tight')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# Feature importance: the model's feature_importances_ per feature, largest first.
# Built from the actual column list instead of a hard-coded count of 11 features.
f_imp2 = pd.DataFrame({'feature': X_train_balanced.columns.to_list(),
                       'importance': rf.feature_importances_})
f_imp2 = f_imp2.sort_values('importance', ascending = False)
f_imp2.style.background_gradient(cmap = 'YlOrBr')

# XGB

In [None]:
# Fit on the SMOTE-balanced training data; score on the untouched hold-out split.
xgb = XGBClassifier(random_state = 2021, max_depth = 5, objective = 'binary:logistic', eval_metric = 'logloss')
xgb.fit(X_train_balanced, Y_train_balanced)
y_pred = xgb.predict(X_test)
y_prob = xgb.predict_proba(X_test)[:, 1]  # predicted probability of class 1

# Metrics: column 2 of `results` is XGB; rows are precision, recall, F1, ROC AUC.
# The hold-out ROC AUC is computed once and reused for the table and the printout.
test_roc_auc = round(roc_auc_score(y_test, y_prob), 3)
results.iloc[0, 2] = round(precision_score(y_test, y_pred), 2)
results.iloc[1, 2] = round(recall_score(y_test, y_pred), 2)
results.iloc[2, 2] = round(f1_score(y_test, y_pred), 2)
results.iloc[3, 2] = test_roc_auc
xgb_cm = confusion_matrix(y_test, y_pred)

print(classification_report(y_test, y_pred))
print(f'ROC AUC score: {test_roc_auc}')
print('\n-----------------------------------------------------\n')
print('Cross-validation scores with 5 folds:')
print('')
# NOTE(review): cross-validation runs on the already oversampled training set, so
# synthetic samples derived from a fold's validation rows can end up in its
# training rows; these scores are likely optimistic compared with the hold-out ones.
for label, scoring, digits in [('ROC AUC', 'roc_auc', 3), ('precision', 'precision', 2),
                               ('recall', 'recall', 2), ('f1', 'f1', 2)]:
    cv_mean = cross_val_score(xgb, X_train_balanced, Y_train_balanced, cv = 5, scoring = scoring).mean()
    print(f'{label}: {round(cv_mean, digits)}')

# Visualize confusion matrix
plt.figure(figsize = (8, 5))
sns.heatmap(xgb_cm, cmap = 'YlOrBr', annot = True, fmt = 'd', linewidths = 5, cbar = False, annot_kws = {'fontsize': 15},
           yticklabels = ['Not looking for job change', 'Looking for job change'], xticklabels = ['Predicted not looking for job change', 'Predicted looking for job change'])
plt.yticks(rotation = 0)
plt.show()

# Roc curve
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(false_positive_rate, true_positive_rate)

sns.set_theme(style = 'white')
plt.figure(figsize = (8, 8))
plt.plot(false_positive_rate, true_positive_rate, color = '#b01717', label = 'AUC = %0.3f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1], linestyle = '--', color = '#174ab0')  # chance-level diagonal
plt.axis('tight')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# Feature importance: the model's feature_importances_ per feature, largest first.
# Built from the actual column list instead of a hard-coded count of 11 features.
f_imp3 = pd.DataFrame({'feature': X_train_balanced.columns.to_list(),
                       'importance': xgb.feature_importances_})
f_imp3 = f_imp3.sort_values('importance', ascending = False)
f_imp3.style.background_gradient(cmap = 'YlOrBr')

# LGBM

In [None]:
# Fit on the SMOTE-balanced training data; score on the untouched hold-out split.
lgbm = LGBMClassifier(random_state = 2021, max_depth = 5, num_leaves = 50)
lgbm.fit(X_train_balanced, Y_train_balanced)
y_pred = lgbm.predict(X_test)
y_prob = lgbm.predict_proba(X_test)[:, 1]  # predicted probability of class 1

# Metrics: column 3 of `results` is LGBM; rows are precision, recall, F1, ROC AUC.
# The hold-out ROC AUC is computed once and reused for the table and the printout.
test_roc_auc = round(roc_auc_score(y_test, y_prob), 3)
results.iloc[0, 3] = round(precision_score(y_test, y_pred), 2)
results.iloc[1, 3] = round(recall_score(y_test, y_pred), 2)
results.iloc[2, 3] = round(f1_score(y_test, y_pred), 2)
results.iloc[3, 3] = test_roc_auc
lgbm_cm = confusion_matrix(y_test, y_pred)

print(classification_report(y_test, y_pred))
print(f'ROC AUC score: {test_roc_auc}')
print('\n-----------------------------------------------------\n')
print('Cross-validation scores with 5 folds:')
print('')
# NOTE(review): cross-validation runs on the already oversampled training set, so
# synthetic samples derived from a fold's validation rows can end up in its
# training rows; these scores are likely optimistic compared with the hold-out ones.
for label, scoring, digits in [('ROC AUC', 'roc_auc', 3), ('precision', 'precision', 2),
                               ('recall', 'recall', 2), ('f1', 'f1', 2)]:
    cv_mean = cross_val_score(lgbm, X_train_balanced, Y_train_balanced, cv = 5, scoring = scoring).mean()
    print(f'{label}: {round(cv_mean, digits)}')

# Visualize confusion matrix
plt.figure(figsize = (8, 5))
sns.heatmap(lgbm_cm, cmap = 'YlOrBr', annot = True, fmt = 'd', linewidths = 5, cbar = False, annot_kws = {'fontsize': 15},
           yticklabels = ['Not looking for job change', 'Looking for job change'], xticklabels = ['Predicted not looking for job change', 'Predicted looking for job change'])
plt.yticks(rotation = 0)
plt.show()

# Roc curve
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(false_positive_rate, true_positive_rate)

sns.set_theme(style = 'white')
plt.figure(figsize = (8, 8))
plt.plot(false_positive_rate, true_positive_rate, color = '#b01717', label = 'AUC = %0.3f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1], linestyle = '--', color = '#174ab0')  # chance-level diagonal
plt.axis('tight')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# Feature importance: the model's feature_importances_ per feature, largest first.
# Built from the actual column list instead of a hard-coded count of 11 features.
f_imp4 = pd.DataFrame({'feature': X_train_balanced.columns.to_list(),
                       'importance': lgbm.feature_importances_})
f_imp4 = f_imp4.sort_values('importance', ascending = False)
f_imp4.style.background_gradient(cmap = 'YlOrBr')

# CatBoost

In [None]:
# Fit on the SMOTE-balanced training data; score on the untouched hold-out split.
cb = CatBoostClassifier(random_state = 2021, depth = 5, iterations = 500, verbose = False)
cb.fit(X_train_balanced, Y_train_balanced)
# Predict with the CatBoost model itself. The cell previously reused `lgbm` here,
# so every CatBoost hold-out score and plot actually described the LightGBM model.
y_pred = cb.predict(X_test)
y_prob = cb.predict_proba(X_test)[:, 1]  # predicted probability of class 1

# Metrics: column 4 of `results` is CB; rows are precision, recall, F1, ROC AUC.
# The hold-out ROC AUC is computed once and reused for the table and the printout.
test_roc_auc = round(roc_auc_score(y_test, y_prob), 3)
results.iloc[0, 4] = round(precision_score(y_test, y_pred), 2)
results.iloc[1, 4] = round(recall_score(y_test, y_pred), 2)
results.iloc[2, 4] = round(f1_score(y_test, y_pred), 2)
results.iloc[3, 4] = test_roc_auc
# Own variable for this model's confusion matrix, so `lgbm_cm` is not overwritten.
cb_cm = confusion_matrix(y_test, y_pred)

print(classification_report(y_test, y_pred))
print(f'ROC AUC score: {test_roc_auc}')
print('\n-----------------------------------------------------\n')
print('Cross-validation scores with 5 folds:')
print('')
# NOTE(review): cross-validation runs on the already oversampled training set, so
# synthetic samples derived from a fold's validation rows can end up in its
# training rows; these scores are likely optimistic compared with the hold-out ones.
for label, scoring, digits in [('ROC AUC', 'roc_auc', 3), ('precision', 'precision', 2),
                               ('recall', 'recall', 2), ('f1', 'f1', 2)]:
    cv_mean = cross_val_score(cb, X_train_balanced, Y_train_balanced, cv = 5, scoring = scoring).mean()
    print(f'{label}: {round(cv_mean, digits)}')

# Visualize confusion matrix
plt.figure(figsize = (8, 5))
sns.heatmap(cb_cm, cmap = 'YlOrBr', annot = True, fmt = 'd', linewidths = 5, cbar = False, annot_kws = {'fontsize': 15},
           yticklabels = ['Not looking for job change', 'Looking for job change'], xticklabels = ['Predicted not looking for job change', 'Predicted looking for job change'])
plt.yticks(rotation = 0)
plt.show()

# Roc curve
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(false_positive_rate, true_positive_rate)

sns.set_theme(style = 'white')
plt.figure(figsize = (8, 8))
plt.plot(false_positive_rate, true_positive_rate, color = '#b01717', label = 'AUC = %0.3f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1], linestyle = '--', color = '#174ab0')  # chance-level diagonal
plt.axis('tight')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# Feature importance: the model's feature_importances_ per feature, largest first.
# Built from the actual column list instead of a hard-coded count of 11 features.
f_imp5 = pd.DataFrame({'feature': X_train_balanced.columns.to_list(),
                       'importance': cb.feature_importances_})
f_imp5 = f_imp5.sort_values('importance', ascending = False)
f_imp5.style.background_gradient(cmap = 'YlOrBr')

# Results

In [None]:
# Side-by-side comparison of the hold-out scores: one column per model,
# one row per metric (row order matches how the model cells fill `results`).
metric_labels = ['Precision', 'Recall', 'F1', 'ROC AUC']

plt.figure(figsize = (10, 7))
sns.heatmap(
    results.astype(float),
    cmap = 'YlOrBr',
    annot = True,
    annot_kws = {'fontsize': 12},
    linewidths = 1,
    cbar = False,
    yticklabels = metric_labels,
)
# NOTE(review): the font scale is set after the heatmap call, as in the original;
# it presumably mainly affects figures created later — confirm the intent.
sns.set(font_scale = 1.5)
plt.yticks(rotation = 0)
plt.show()

# 🚧🚧🚧WORK IN PROGRESS🚧🚧🚧