In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Table of Contents

* [Introduction and objectives](#introduction)
* [Get familiar with the data](#familiar)
* [Data preprocessing](#preprocessing) 
    - [Deal with null values](#preprocessing-one)
    - [Deal with feature dtype casting](#preprocesing-two)
    - [Deal with feature dropping](#preprocessing-three)
    - [Deal with categorical features](#preprocessing-four)
    - [Deal with standardization](#preprocessing-five)
* [Build machine learning classifiers](#classifiers)
    - [Logistic regression](#logisticregression)
    - [Neural network](#neuralnetwork)
    - [K Nearest Neighbors](#knn)
* [Conclusion](#conclusion)

***Written by:*** *Fakhrul Hasbi*

<a id="introduction"></a>
## Introduction

As brief context, a company typically holds training sessions for newly signed-up employees. However, not every participant in these sessions actually proceeds to employment with the company; some go on to look for a job at other companies instead. Knowing which newly signed-up employees genuinely want to work for the company therefore helps it use its capital (especially in human resources) more effectively and efficiently: it reduces cost and time, and it improves the planning of the training courses and the categorization of the candidates.

## Objectives

1. Identify the main factors that distinguish newly signed-up employees who will work for the company from those who won't.
2. Predict the probability that a newly signed-up employee will work for the company.

<a id="familiar"></a>
## Get familiar with the data 

*-> **Importing necessary** libraries*

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Plot appearance: seaborn "notebook" context and the "fivethirtyeight" matplotlib style sheet.
sns.set_context("notebook")
plt.style.use('fivethirtyeight')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

*-> **Read** the csv file*

In [None]:
# Location of the training CSV inside the Kaggle input directory.
TRAIN_CSV_PATH = "/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_train.csv"
# Load the training data into the working dataframe used by the rest of the notebook.
df = pd.read_csv(TRAIN_CSV_PATH)

*-> **Quick checking** the dataframe*

In [None]:
# Preview the first five rows to get a feel for the columns and their values.
df.head(5)

In [None]:
# Print the dataframe summary (columns, non-null counts and dtypes).
df.info()

<a id="preprocessing"></a>
## Data preprocessing 

<a id="preprocessing-one"></a>
### Deal with null values 

*-> Check for **null values**. If any exist, they need to be handled (the feature is removed or its missing values are imputed). Refer to the comments for further detail.*

In [None]:
# Visualize where values are missing: each cell of the boolean mask is True when the entry is null.
missing_mask = df.isna()
sns.heatmap(data=missing_mask, cbar=False, yticklabels=False)

In [None]:
# Keep only the "golden" features: columns whose share of null/NaN values is strictly
# below the threshold (i.e. columns with 30% or more missing values are removed).
MAX_NULL_RATIO = 0.3

# isnull().mean() is the per-column fraction of missing entries (null count / row count)
null_ratio = df.isnull().mean()
golden_features = [col for col in df.columns if null_ratio[col] < MAX_NULL_RATIO]
removed_features = [col for col in df.columns if col not in golden_features]

# removed features (listed in original column order so the output is deterministic)
print("Removed features: {}".format(removed_features))

# selected features
print("selected features: {}".format(golden_features))

# updating the current dataframe with golden features.
# .copy() makes df an independent frame instead of a slice of the original, so that
# later column assignments do not trigger pandas' SettingWithCopyWarning.
df = df[golden_features].copy()

In [None]:
# Draw one countplot per object-typed feature; the value distributions are used to
# decide on an imputation strategy.
object_columns = df.select_dtypes(include="object").columns
for column_name in object_columns:
    fig, ax = plt.subplots()
    sns.countplot(data=df, x=column_name, ax=ax)
    ax.set_xlabel(column_name)
    ax.set_title("Countplot of " + column_name)

In [None]:
# The countplots of the object features show that, for every one of them, a single value
# clearly dominates the others. Imputing the missing values of the object features with
# the most frequent value is therefore a reasonable strategy.
from sklearn.impute import SimpleImputer

object_cols = df.select_dtypes(include="object").columns

si = SimpleImputer(strategy="most_frequent")
# Build the imputed frame on df's own index so the values are written back to the
# matching rows even if df does not carry a default 0..n-1 index.
df_objects = pd.DataFrame(
    data=si.fit_transform(df[object_cols]),
    columns=object_cols,
    index=df.index,
)

# updating the current dataframe with the imputed object features stored in df_objects.
# Work on a copy so the assignment never writes into a slice of another frame
# (avoids pandas' SettingWithCopyWarning).
df = df.copy()
df[object_cols] = df_objects

In [None]:
# Final sanity check: per-column count of remaining missing values.
df.isna().sum()

<a id="preprocesing-two"></a>
### Deal with feature dtype casting

*-> **Casting data types** for necessary features*

In [None]:
# Re-inspect the column dtypes to see which features need casting.
df.info()

In [None]:
# Store the target as int64 rather than float64.
target_as_int = df["target"].astype("int64")
df = df.assign(target=target_as_int)

<a id="preprocessing-three"></a>
### Deal with feature dropping

*-> **Drop the enrollee_id column** since it is merely an identifier for the rows*

In [None]:
# Remove the row identifier. Reassigning (instead of inplace=True) together with
# errors="ignore" keeps this cell idempotent: re-running it after the column is
# already gone no longer raises a KeyError.
df = df.drop(columns="enrollee_id", errors="ignore")

<a id="preprocessing-four"></a>
### Deal with categorical features

*-> **Encode all of the object or categorical features** to numeric. Refer to the comment for further detail.*

In [None]:
# Report the number of unique values of each object (categorical) feature and compute
# how many columns the dataframe should have after one-hot encoding with drop_first:
# every object feature contributes (unique values - 1) columns, every other column 1.
# The total is used later as a final check of the encoded dataframe.
total_columns_after_dropfirst = 0
for col in df.columns:
    if df[col].dtype == 'O':
        n_unique = df[col].nunique()  # computed once per column and reused below
        print("{} has {} unique values.".format(col, n_unique))
        total_columns_after_dropfirst += n_unique - 1
    else:
        total_columns_after_dropfirst += 1
print("*"*100)
print("Total number of columns expected after one-hot encoding: {}".format(total_columns_after_dropfirst))

In [None]:
# Use one-hot encoding (get_dummies) rather than a label encoder: the category labels are
# independent of each other, so no artificial ordering should be imposed on them.
# drop_first=True drops one dummy per feature to avoid multicollinearity.
#
# All object features are encoded in a single call. When given a DataFrame, get_dummies
# prefixes every dummy with its source column name ("<feature>_<value>"), so a value that
# occurs in more than one feature can no longer produce duplicate, ambiguous column names.
# It also avoids growing a frame with repeated concat calls.
object_features = df.select_dtypes(include="object")
encoded_object_features = pd.get_dummies(object_features, drop_first=True)

In [None]:
# Put the one-hot encoded columns side by side with the numeric (int64/float64) columns of df.
numeric_features = df.select_dtypes(include=["int64", "float64"])
df_encoded = pd.concat(
    [encoded_object_features, numeric_features],
    axis=1,
)

In [None]:
# Number of columns in df_encoded; it should equal total_columns_after_dropfirst computed earlier.
df_encoded.shape[1]

<a id="preprocessing-five"></a> 
### Deal with standardization

*-> **Rescale the data** so that all of the features share a common scale*

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Separate the predictors from the label; only the predictors are rescaled afterwards.
target = df_encoded.loc[:, "target"]
features = df_encoded.drop(columns=["target"])

In [None]:
# Standardize every feature column with StandardScaler.
# NOTE(review): the scaler is fitted on all rows, before the train/test split further down,
# so statistics of the test rows influence the scaling. Consider fitting on the training
# split only.
sc = StandardScaler()
# Carry over the row index of `features` so the scaled frame stays aligned with `target`
# by label, not merely by position.
features_scaled = pd.DataFrame(
    data=sc.fit_transform(features),
    columns=features.columns,
    index=features.index,
)

<a id="classifiers"></a>
## Build machine learning classifiers

*-> We may now proceed to **build the models** using machine learning classifiers*

In [None]:
# Hold out part of the scaled features and the target as a test set.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

TEST_FRACTION = 0.3  # share of rows reserved for testing
SPLIT_SEED = 101     # fixed seed so the split is reproducible

X_train, X_test, y_train, y_test = train_test_split(
    features_scaled,
    target,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

<a id="logisticregression"></a>
## Logistic regression

In [None]:
# Classifier 1: logistic regression with scikit-learn's default settings.
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# Predict the class of every row in the held-out test set.
predictions = model.predict(X=X_test)

In [None]:
# Report test accuracy as a percentage with two decimals. The score is scaled to percent
# first and rounded only by the format spec: rounding to two decimals before multiplying
# by 100 discards precision and can print float artefacts such as 56.99999999999999.
print("Model accuracy estimation: {:.2f}%".format(accuracy_score(y_test, predictions) * 100))

<a id="neuralnetwork"></a>
## Neural network

In [None]:
# Classifier 2: multi-layer perceptron, seeded for reproducibility and allowed up to 1000 iterations.
from sklearn.neural_network import MLPClassifier

model = MLPClassifier(max_iter=1000, random_state=101)
model.fit(X=X_train, y=y_train)

In [None]:
# Predict the class of every row in the held-out test set.
predictions = model.predict(X=X_test)

In [None]:
# Report test accuracy as a percentage with two decimals. The score is scaled to percent
# first and rounded only by the format spec: rounding to two decimals before multiplying
# by 100 discards precision and can print float artefacts such as 56.99999999999999.
print("Model accuracy estimation: {:.2f}%".format(accuracy_score(y_test, predictions) * 100))

<a id="knn"></a>
## K Nearest Neighbors

In [None]:
# Classifier 3: k-nearest neighbors, starting with k = 2.
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=2)
model.fit(X=X_train, y=y_train)

In [None]:
# Predict the class of every row in the held-out test set.
predictions = model.predict(X=X_test)

In [None]:
# Report test accuracy as a percentage with two decimals. The score is scaled to percent
# first and rounded only by the format spec: rounding to two decimals before multiplying
# by 100 discards precision and can print float artefacts such as 56.99999999999999.
print("Model accuracy estimation: {:.2f}%".format(accuracy_score(y_test, predictions) * 100))

In [None]:
# Fit one KNN model per k in 1..29 and record its test accuracy, rounded to two decimals.
knn_scores = []
for k in range(1, 30):
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    score = accuracy_score(y_test, prediction)
    knn_scores.append(round(score, 2))
accuracies = np.array(knn_scores)

In [None]:
# Plot accuracy against k and report the best k.
# The k values are derived from the number of recorded accuracies (entry i belongs to
# k = i + 1), so the range is not hard-coded a second time.
k_values = np.arange(1, len(accuracies) + 1)
# argmax returns the first position of the maximum, i.e. the smallest k on ties.
best_idx = int(np.argmax(accuracies))

plt.plot(k_values, accuracies)
plt.xlabel("k (number of neighbors)")
plt.ylabel("Test accuracy")
plt.title("KNN accuracy by k")
print("Top accuracy is {} when k = {}".format(accuracies[best_idx], k_values[best_idx]))

<a id="conclusion"></a>
## Conclusion

*-> **End of the notebook assignment:** the performance of the models could be further improved by adding more preprocessing steps or by trying different classifiers.*