In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [None]:
train_data.head()

In [None]:
train_data.info()

In [None]:
train_data.isnull().sum()

In [None]:
def fill_missing(df, column, value):
    """Return a copy of ``df`` whose missing entries in ``column`` are replaced by ``value``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the column to fill.
    value : scalar
        Replacement for missing entries in ``column``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``column`` filled.
    """
    df_copy = df.copy()
    # Assign the filled Series back instead of calling fillna(inplace=True) on
    # the selected column: the in-place form acts on an intermediate object and
    # does not update df_copy when pandas Copy-on-Write is active.
    df_copy[column] = df_copy[column].fillna(value)
    return df_copy

# Missing education values are replaced by the literal label "others" in both frames.
train_data = fill_missing(train_data, "education", "others")
test_data = fill_missing(test_data, "education", "others")

# Missing previous-year ratings are replaced by 0.0
# (presumably employees without a prior-year review — TODO confirm).
train_data = fill_missing(train_data, "previous_year_rating", 0.0)
test_data = fill_missing(test_data, "previous_year_rating", 0.0)

# Cell output: number of nulls still present in the training education column.
train_data.education.isnull().sum()

In [None]:
train_data.department.value_counts()

In [None]:
train_data.education.value_counts()

In [None]:
train_data.gender.value_counts()

In [None]:
train_data.recruitment_channel.value_counts()

In [None]:
def convert_to_category(df, column_list):
    """Return a new frame in which every column in ``column_list`` has dtype ``category``.

    Columns not listed keep their dtype, and the input frame is left untouched.
    """
    # astype with a per-column mapping converts each listed column on its own
    # and hands back a new DataFrame.
    return df.astype({name: 'category' for name in column_list})

train_data = convert_to_category(train_data, ['department','region','education',
                                              'gender','recruitment_channel'])
test_data = convert_to_category(test_data, ['department','region','education',
                                            'gender','recruitment_channel'])

In [None]:
# True when at least one employee_id value appears more than once in the training data.
condition = bool(train_data['employee_id'].duplicated().any())

print('There are duplicate employee IDs' if condition else 'No duplicate employee IDs')

In [None]:
train_data.is_promoted.value_counts(normalize=True)

In [None]:
train_data.describe()

In [None]:
# Remove the employee_id column from both frames before plotting and modelling.
# Reassigning with errors='ignore' keeps this cell re-runnable: the previous
# in-place drop raised KeyError on a second execution because the column was
# already gone.
train_data = train_data.drop(columns='employee_id', errors='ignore')
test_data = test_data.drop(columns='employee_id', errors='ignore')

# One histogram per numeric column of the training data.
train_data.hist(bins=20, figsize=(15,8))
plt.show()

In [None]:
def create_stacked_bar(column_name):
    """Plot, for each value of ``column_name``, the stacked shares of the two ``is_promoted`` classes.

    Reads the notebook-level ``train_data`` frame. The shares are column-normalised
    (each bar sums to roughly 1 after rounding to two decimals) and each segment is
    annotated with its percentage. The first crosstab row is drawn as "Not promoted"
    and the second as "Promoted" (assumes ``is_promoted`` takes exactly two values
    that sort in that order — TODO confirm).

    Parameters
    ----------
    column_name : str
        Column of ``train_data`` to group by.
    """
    shares = pd.crosstab(
        index=train_data['is_promoted'],
        columns=train_data[column_name],
        normalize='columns',
    ).round(2)

    categories = shares.columns
    lower_values = shares.iloc[0].to_list()
    upper_values = shares.iloc[1].to_list()

    fig, ax = plt.subplots(figsize=(10, 5))
    width = 0.5

    # The second series is stacked on top of the first via `bottom`.
    lower_bars = ax.bar(categories, lower_values, width, label="Not promoted")
    upper_bars = ax.bar(categories, upper_values, width, bottom=lower_values, label="Promoted")

    ax.set_title(f"Promotion by {column_name}", fontweight="bold")
    ax.set_xlabel(column_name, fontweight="bold")
    ax.set_ylabel("Promotion percentage", fontweight="bold")
    ax.legend(loc="best")

    plt.xticks(list(range(len(categories))), categories, rotation=90)
    plt.yticks(fontsize=9)

    # Write each segment's share at the vertical centre of that segment.
    text_style = dict(ha="center", va="center", color="white", fontsize=9, fontweight="bold")
    for lower, upper in zip(lower_bars, upper_bars):
        lower_height = lower.get_height()
        upper_height = upper.get_height()
        plt.text(lower.get_x() + lower.get_width() / 2., lower_height / 2.,
                 f"{lower_height:.0%}", **text_style)
        plt.text(upper.get_x() + upper.get_width() / 2., lower_height + upper_height / 2.,
                 f"{upper_height:.0%}", **text_style)

    plt.show()

In [None]:
create_stacked_bar('department')

In [None]:
create_stacked_bar('education')

In [None]:
create_stacked_bar('region')

In [None]:
create_stacked_bar('gender')

In [None]:
def convert_age_to_group(df):
    """Return a new frame where ``age`` is replaced by a categorical ``age_group`` column.

    Ages are bucketed into 5-year intervals with edges 20, 25, ..., 60. Intervals are
    closed on the right, and the first one also includes 20. Groups are labelled
    0..7; ages outside [20, 60] become NaN. The input frame is not modified.
    """
    edges = range(20, 61, 5)    # 5-year bin edges
    codes = list(range(len(edges) - 1))
    age_group = pd.cut(df['age'], bins=edges, labels=codes, right=True, include_lowest=True)
    return df.drop(columns=["age"]).assign(age_group=age_group)

train_data = convert_age_to_group(train_data)
test_data = convert_age_to_group(test_data)

In [None]:
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

def convert_to_numerical(df_train, df_test):
    """Label-encode the categorical feature columns of both frames.

    For each of ``department``, ``region``, ``education``, ``gender``,
    ``recruitment_channel`` and ``age_group`` a ``LabelEncoder`` is fitted on the
    training column only and then applied to the training and test columns.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames containing the six columns above; neither is modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_encoded, test_encoded)`` copies with integer-coded columns.

    Raises
    ------
    ValueError
        Raised by ``LabelEncoder.transform`` when the test column contains a
        label that was not present in the training column.
    """
    df_train_copy = df_train.copy()
    df_test_copy = df_test.copy()

    for col in ["department", "region", "education", "gender", "recruitment_channel", "age_group"]:
        le = LabelEncoder()
        le.fit(df_train_copy[col])
        # Plain column assignment replaces the column together with its dtype.
        # `.loc[:, col] = codes` instead tries to write the integer codes into
        # the existing categorical column on recent pandas, which either keeps
        # the category dtype or triggers an incompatible-dtype warning/error.
        df_train_copy[col] = le.transform(df_train_copy[col])
        df_test_copy[col] = le.transform(df_test_copy[col])

    return df_train_copy, df_test_copy

train_data, test_data = convert_to_numerical(train_data, test_data)

In [None]:
# Feature matrix: every column of the training frame except the target.
X = train_data.drop(columns=['is_promoted'])
# Target vector.
y = train_data['is_promoted']

# Separate copy of the test features, used later for the final predictions.
X_test = test_data.copy()

# Names of all feature columns, in frame order.
feature_cols = X.columns.tolist() 

# Columns passed to standard_scaler as `numerical_cols`.
num_cols = ['no_of_trainings', 'previous_year_rating', 'length_of_service','KPIs_met >80%',
            'awards_won?', 'avg_training_score']

3. Split Training and Validation Data

In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

def data_split(X, y, imbalance = False):
    """Split ``X``/``y`` into stratified training (70%) and validation (30%) parts.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Target labels, also used for stratification.
    imbalance : bool, default False
        When True, the training part (and only the training part) is oversampled
        with SMOTE; the validation part is returned as split.

    Returns
    -------
    tuple
        ``(X_train, X_validation, y_train, y_validation)``.
    """
    X_train, X_validation, y_train, y_validation = train_test_split(
        X, y, test_size=0.3, shuffle=True, stratify=y, random_state=42)

    if imbalance:
        sm = SMOTE(random_state = 42)
        # fit_resample is the supported API; the fit_sample alias used here
        # before has been removed from imbalanced-learn. y_train is passed
        # as-is, so the deprecated Series.ravel() call is no longer needed.
        X_train, y_train = sm.fit_resample(X_train, y_train)

    return X_train, X_validation, y_train, y_validation

4. Rescale Features

In [None]:
from sklearn.preprocessing import StandardScaler

def standard_scaler(X_train, X_validation, X_test,  numerical_cols):
    """Standardize the listed columns of all three frames using training statistics.

    One ``StandardScaler`` per column is fitted on the training column and then
    applied to the training, validation and test columns. The inputs are copied,
    so the caller's frames are not modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train_std, X_validation_std, X_test_std)``.
    """
    scaled_frames = [X_train.copy(), X_validation.copy(), X_test.copy()]
    train_scaled = scaled_frames[0]

    for col in numerical_cols:
        # Fit on the (still unscaled) training column only, then apply everywhere.
        scaler = StandardScaler().fit(train_scaled[[col]])
        for frame in scaled_frames:
            frame[col] = scaler.transform(frame[[col]])

    train_scaled, validation_scaled, test_scaled = scaled_frames
    return train_scaled, validation_scaled, test_scaled

In [None]:
conda install -c conda-forge xgboost

Predictive Modeling

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
import joblib

def run_models(X, y, X_test, num_cols, models):
    """Fit every model with and without SMOTE and tabulate cross-validated ROC AUC.

    For each setting (SMOTE on, then off) the data is split with ``data_split``,
    scaled with ``standard_scaler``, and every model is fitted, pickled, and
    scored with 5-fold ``cross_val_score`` on the scaled training split.

    Parameters
    ----------
    X, y : feature matrix and target passed to ``data_split``.
    X_test : test features, scaled alongside the splits.
    num_cols : list of str
        Columns to standardize.
    models : dict
        Maps a display name to an estimator; the estimators are fitted in place.

    Returns
    -------
    pandas.DataFrame
        Columns ``Model``, ``SMOTE`` and ``ROC_AUC Score``; the same table is
        written to ``model_initial.csv``.
    """
    model_result = []

    for imbalance in [True, False]:
        X_train, X_validation, y_train, y_validation = data_split(X, y, imbalance = imbalance)
        X_train_std, X_validation_std, X_test_std = standard_scaler(X_train, X_validation, X_test
                                                                    , numerical_cols = num_cols)
        # Encode the SMOTE setting in the file name; with a name based on the
        # model alone, the second pass overwrote the models saved by the first.
        suffix = "smote" if imbalance else "no_smote"

        for model_name, model in models.items():
            model.fit(X_train_std, y_train)
            joblib.dump(model, f"{model_name}_{suffix}.pkl")
            # NOTE(review): with SMOTE on, these folds are cut from already
            # oversampled data, so the score is likely optimistic; the held-out
            # validation split is not used here.
            scores = cross_val_score(model, X_train_std, y_train, scoring ="roc_auc", cv = 5)
            roc_auc = np.mean(scores)

            model_result.append([model_name, imbalance,  roc_auc])

    df = pd.DataFrame(model_result, columns = ["Model", "SMOTE" , "ROC_AUC Score"])
    df.to_csv("model_initial.csv", index=False)

    return df

In [None]:
# Baseline models with default hyper-parameters; keys are the display names
# that run_models uses in its result table and pickle file names.
model_dict = {"Logistic Regression":LogisticRegression(random_state=42), 
              "Random Forest":RandomForestClassifier(random_state=42), 
              "XGBoost":  XGBClassifier(random_state=42)}

# Cell output: the comparison table returned by run_models.
run_models(X, y, X_test, num_cols, model_dict)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# Tune logistic regression on the SMOTE-balanced, standardized training split.
X_train, X_validation, y_train, y_validation = data_split(X, y, imbalance = True)

X_train_std, X_validation_std, X_test_std = standard_scaler(X_train, X_validation, X_test, numerical_cols = num_cols)

logistic = LogisticRegression(solver='saga', tol=1e-2, max_iter=200,random_state=42)
distributions = dict(C=uniform(loc=0, scale=4), penalty=['l2', 'l1'])
# Select by ROC AUC, the metric this notebook reports everywhere else; without
# `scoring` the search falls back to the estimator's default score (accuracy).
lr_best = RandomizedSearchCV(logistic, distributions, scoring='roc_auc', random_state=42)

lr_best = lr_best.fit(X_train_std, y_train)

print(lr_best.best_params_)

In [None]:
joblib.dump(lr_best,"logreg_tuned.pkl")

In [None]:
# Score the tuned model (already fitted on the training split) on the held-out
# validation split. cross_val_score would clone lr_best and re-run the entire
# search on folds of the validation data, so the fitted model was never scored.
validation_proba = lr_best.predict_proba(X_validation_std)[:, 1]
roc_auc_lr_best = roc_auc_score(y_validation, validation_proba)

joblib.dump(roc_auc_lr_best,"logreg_ROC_AUC_tuned.pkl")

print(f'ROC_AUC score after tuning parameters:{roc_auc_lr_best:.3f}')

In [None]:
from sklearn.model_selection import GridSearchCV

# First (coarse) random-forest grid search on the SMOTE-balanced training split.
X_train, X_validation, y_train, y_validation = data_split(X, y, imbalance = True)
param_grid = {
    'max_depth': [60, 90, 110],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300]
}

clf_rf = RandomForestClassifier(random_state=42)

# Select by ROC AUC, the metric used to compare models in this notebook;
# without `scoring` the search optimises accuracy.
rf_best = GridSearchCV(estimator = clf_rf, param_grid = param_grid, scoring = 'roc_auc',
                       cv = 3, n_jobs = -1, verbose = 1)

rf_best.fit(X_train, y_train)
rf_best.best_params_

In [None]:
joblib.dump(rf_best,"clf_rf_tuned.pkl")

In [None]:
from sklearn.model_selection import GridSearchCV

# Second random-forest grid search over a different parameter grid,
# again on the SMOTE-balanced training split.
X_train, X_validation, y_train, y_validation = data_split(X, y, imbalance = True)

param_grid = {
    'max_depth': [50,60,70],
    'min_samples_leaf': [2,3],
    'min_samples_split': [6,7,8],
    'n_estimators': [200,300,400]
}

clf_rf = RandomForestClassifier(random_state=42)

# Select by ROC AUC, the metric used to compare models in this notebook;
# without `scoring` the search optimises accuracy.
rf_best1 = GridSearchCV(estimator = clf_rf, param_grid = param_grid, scoring = 'roc_auc',
                        cv = 3, n_jobs = -1, verbose = 1)

rf_best1.fit(X_train, y_train)
rf_best1.best_params_

In [None]:
# Score the tuned forest (already fitted on the training split) on the held-out
# validation split. cross_val_score would clone rf_best1 and repeat the whole
# grid search on folds of the validation data instead.
validation_proba = rf_best1.predict_proba(X_validation)[:, 1]
roc_auc_rf_best = roc_auc_score(y_validation, validation_proba)

joblib.dump(roc_auc_rf_best,"rf_ROC_AUC_tuned.pkl")

print(f'ROC_AUC score after tuning parameters:{roc_auc_rf_best:.3f}')

In [None]:
from pprint import pprint

# Candidate values for the XGBoost randomized search. Explicit lists are used
# instead of np.arange with a float step, which produces inexact values such as
# 0.30000000000000004 and numpy scalar types in the reported best parameters.
n_estimators = list(range(200, 1000, 200))    # 200, 400, 600, 800

gamma = [0.1, 0.2, 0.3, 0.4, 0.5]
learning_rate = [0.1, 0.2, 0.3, 0.4, 0.5]

max_depth = list(range(3,8,1))

subsample = [0.5, 0.6, 0.7, 0.8]
colsample_bytree = [0.5, 0.6, 0.7, 0.8]

scale_pos_weight = [1,3.5]
random_grid_xgb = {'n_estimators': n_estimators,
                   'gamma': gamma,
                   'learning_rate':learning_rate,
                   'max_depth': max_depth,
                   'subsample':subsample,
                   'colsample_bytree':colsample_bytree,
                   'scale_pos_weight':scale_pos_weight
                  }
pprint(random_grid_xgb)


In [None]:
# Randomized hyper-parameter search for XGBoost on the SMOTE-balanced training
# split: 100 sampled configurations, 3-fold CV, selected by ROC AUC.
X_train, X_validation, y_train, y_validation = data_split(X, y, imbalance = True)

xgboost = XGBClassifier()
xgb_random = RandomizedSearchCV(
    estimator=xgboost,
    param_distributions=random_grid_xgb,
    scoring='roc_auc',
    n_iter=100,
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)

xgb_random.fit(X_train, y_train)

print(xgb_random.best_params_,xgb_random.best_score_)


In [None]:
joblib.dump(xgb_random,"xgb_tuned.pkl")

In [None]:
# Score the tuned XGBoost model (already fitted on the training split) on the
# held-out validation split. cross_val_score would clone xgb_random and repeat
# the full 100-iteration search on each of 5 validation folds instead.
validation_proba = xgb_random.predict_proba(X_validation)[:, 1]
roc_auc_xgb_best = roc_auc_score(y_validation, validation_proba)

joblib.dump(roc_auc_xgb_best,"xgb_ROC_AUC_tuned.pkl")

print(f'ROC_AUC score after tuning parameters:{roc_auc_xgb_best:.3f}')

In [None]:
from sklearn.metrics import confusion_matrix

def get_pre_rec_f1(model_name, model,X_validation,y_validation):
    """Compute precision, recall and F1 of a fitted binary classifier.

    Parameters
    ----------
    model_name : str
        Label placed first in the returned row.
    model : fitted estimator exposing ``predict``.
    X_validation, y_validation : features and true labels to evaluate on.

    Returns
    -------
    list
        ``[model_name, precision, recall, F1]``. A metric whose denominator is
        zero (e.g. the model predicts no positives) is reported as 0.0 instead
        of NaN.
    """
    y_pred = model.predict(X_validation)
    # For a 2x2 confusion matrix, ravel() yields tn, fp, fn, tp in that order.
    tn, fp, fn, tp = confusion_matrix(y_validation, y_pred).ravel()

    predicted_positive = tp + fp
    actual_positive = tp + fn
    precision = tp / predicted_positive if predicted_positive else 0.0
    recall = tp / actual_positive if actual_positive else 0.0

    denominator = precision + recall
    F1 = 2 * (precision * recall) / denominator if denominator else 0.0

    return [model_name, precision, recall, F1]

In [None]:
# One [name, precision, recall, F1] row per tuned model. Logistic regression is
# evaluated on the standardized validation features, the tree models on the
# unscaled ones — matching the data each search was fitted on.
col_1 = get_pre_rec_f1("Logistic", lr_best, X_validation_std, y_validation)
col_2 = get_pre_rec_f1("Random Forest",  rf_best1, X_validation, y_validation)
col_3 = get_pre_rec_f1("XGBoost", xgb_random, X_validation, y_validation)

result = [col_1, col_2, col_3]

pd.DataFrame(result, columns = ["Model", "Precision", "Recall", "F1"])

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import RocCurveDisplay

fig,ax=plt.subplots(figsize=(10,5))

# plot_roc_curve has been removed from scikit-learn; RocCurveDisplay.from_estimator
# is its replacement. `name` sets the legend entry, to which the AUC is appended.
RocCurveDisplay.from_estimator(lr_best, X_validation_std, y_validation, ax=ax, name='Logistic Regression', color="blue")
RocCurveDisplay.from_estimator(rf_best1, X_validation, y_validation, ax=ax, name='Random Forest', color="green")
RocCurveDisplay.from_estimator(xgb_random, X_validation, y_validation, ax=ax, name='XGBoost', color="red")

plt.title('ROC/AUC of 3 models')
plt.grid()

In [None]:
# employee_id was dropped from test_data before modelling, so read just that
# column again from the same relative 'test.csv' the notebook loaded at the
# start (the previous absolute F:/ paths only worked on one machine). A
# separate name is used so the preprocessed test_data frame is not overwritten.
submission_ids = pd.read_csv('test.csv', usecols=['employee_id'])

# Predict with the tuned XGBoost search on the (unscaled) encoded test features,
# matching the data it was fitted on.
y_prediction = xgb_random.predict(X_test)
result_submission = pd.DataFrame({"employee_id" : submission_ids.employee_id, "is_promoted" : y_prediction})
result_submission.to_csv("submission.csv", index=False)