#### Problem statements
The client wants a system that will predict the employees that are likely to leave  or stay in the company

#### Their expectations
1. They want REST API to train the model using training batch dataset
2. They also want REST API ti preduct the result using predictio training batch dataset

#### Solution Design
1. The client is expected to provide their data which may possibly be in multiple files in one directory. This data is called raw data. Then we will build a schema file which describes the number of columns and the name of each columns with their data type that are available.
2. Data validation and transformation using python class to read the data. 
3. Once successfully validated, it will be stored into a database with python class
4. EDA with python class
5. Model Selection
6. Clustering and model building: Kmeans clustering is used for the clustering. The idea behind the clustering is to implement an algorith to train the data in different clusters, helping to find the best model for each cluster.
7. Using grid-search and hyper-parameter tuning to get the best model. The best model will saved for each cluster.
8. Prediction: The client will provide another data for prediction which will be validated and stored like the training data in a new database.
9. Model Call
10. Deployment

### Import important libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')


from sklearn.model_selection import train_test_split, StratifiedKFold, KFold,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBRFClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
data = pd.read_csv('train_data.csv')
data.head(5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,Target_variable,dept_RandD,dept_accounting,dept_hr,dept_management,dept_marketing,dept_product_mng,dept_sales,dept_support,dept_technical,salary_low,salary_medium
0,0.52,0.38,2,103,3,0,0,0,0,0,0,0,0,0,0,1,0,0,1
1,1.0,0.61,3,188,4,0,0,0,0,0,0,0,0,1,0,0,0,1,0
2,0.75,0.82,4,252,3,0,0,0,0,0,0,0,0,0,0,0,1,1,0
3,0.42,0.53,2,132,3,1,0,1,0,0,0,0,0,0,1,0,0,1,0
4,0.93,0.59,3,202,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [3]:
data.shape

(10499, 19)

In [4]:
data.describe(include = 'all')

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,Target_variable,dept_RandD,dept_accounting,dept_hr,dept_management,dept_marketing,dept_product_mng,dept_sales,dept_support,dept_technical,salary_low,salary_medium
count,10499.0,10499.0,10499.0,10499.0,10499.0,10499.0,10499.0,10499.0,10499.0,10499.0,10499.0,10499.0,10499.0,10499.0,10499.0,10499.0,10499.0,10499.0,10499.0
mean,0.613943,0.717405,3.800552,201.143156,3.493857,0.145442,0.02124,0.237832,0.052957,0.051338,0.048862,0.041052,0.056577,0.059053,0.275455,0.150586,0.183922,0.485761,0.432327
std,0.248494,0.17145,1.232628,49.786792,1.458917,0.352563,0.144191,0.425776,0.223959,0.220697,0.215589,0.198419,0.231043,0.235736,0.446764,0.357662,0.387439,0.499821,0.495423
min,0.09,0.36,2.0,96.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.44,0.56,3.0,156.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.65,0.72,4.0,200.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.82,0.87,5.0,245.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
max,1.0,1.0,7.0,310.0,10.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
X = data.drop('Target_variable', axis=1)
y= data['Target_variable']

In [6]:
# To check the balancing of the dataset
y.value_counts(normalize=True)

0    0.762168
1    0.237832
Name: Target_variable, dtype: float64

In [7]:
# oversample the underclass
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state = 1)
X_resampled, y_resampled = sm.fit_resample(X,y)

In [8]:
# Split the dataset into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [9]:
## Standardize the dataset
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

In [10]:
X_train=scaler.fit_transform(x_train)
X_test=scaler.transform(x_test)

In [11]:
import pickle
pickle.dump(scaler,open('scaling.pkl','wb'))

In [14]:
models = [] # create empty list for models

models.append(('LR', LogisticRegression()))
models.append(('RF', RandomForestClassifier()))
models.append(('SVM', SVC()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('XGB', XGBRFClassifier()))
models.append(('DT', DecisionTreeClassifier()))
models.append(('GB', GaussianNB()))

#results of performance
results = []

#evaluation metric
scoring = 'roc_auc'

# name of models
names = []

for name, model in models:
    kfold = StratifiedKFold(n_splits=10, random_state=7, shuffle=True) # cross validation
    cv_results = cross_val_score(model, X_resampled, y_resampled, cv=kfold, scoring=scoring)
    #results.append(cv_results)
    names.append(name)
    
    
    print(f"{name}: roc- {cv_results.mean()} std-error:{cv_results.std()}")

LR: roc- 0.8451281103698502 std-error:0.012505047892961632
RF: roc- 0.9974733905859863 std-error:0.0011702536812726663
SVM: roc- 0.7626182404806492 std-error:0.008528687327694198
KNN: roc- 0.9771216117548376 std-error:0.0029870636724466056
XGB: roc- 0.9879363378784332 std-error:0.0029297857495318054
DT: roc- 0.9660717072409488 std-error:0.005159034585540766
GB: roc- 0.8312507291666666 std-error:0.011581175756394436


In [None]:
# RF, KNN, and XGB will be selected for the modelling

#### RF

In [12]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

import time

param_dict = {
    "n_estimators":[50, 70, 90, 100],
    "criterion":["gini", "entropy", "log_loss"],
    "max_depth":[10, 15, 20],
    "max_features":['auto', 'sqrt'],
    "min_samples_split":[4, 6],
    "bootstrap":[True, False],
    "random_state":[1,2]
}

# we'll use a 5-fold cross-validation

grid_search = GridSearchCV(RandomForestClassifier(n_jobs=-1, oob_score=False), param_grid=param_dict, cv=3, verbose=True)
# grid_search = GridSearchCV(rf, param_dict, cv=3, scoring='f1', n_jobs=-1, return_train_score=True, verbose=True)

start = time.time()

grid_search_model=grid_search.fit(x_train, y_train)

duration  = (time.time() - start)

print(f"{grid_search.best_score_} took {duration} seconds")

print(f" The best param is: {grid_search.best_params_}")

print(f" The best estimator is: \n{grid_search_model.best_estimator_}")

Fitting 3 folds for each of 576 candidates, totalling 1728 fits


KeyboardInterrupt: 

In [13]:
# Applying the best parameters
new_rf = RandomForestClassifier(n_estimators = 50, oob_score = False, n_jobs = -1, criterion = 'gini',
                              max_features = "auto", min_samples_split = 4, max_depth=20, random_state=1)
new_rf.fit(x_train, y_train)
print(f"Train score: {new_rf.score(x_train, y_train)} --- Test score: {new_rf.score(x_test, y_test)}")

Train score: 0.9964851987815356 --- Test score: 0.979381443298969


In [15]:
pickle.dump(new_rf,open('regmodel.pkl','wb'))

In [20]:
pickled_model=pickle.load(open('regmodel.pkl','rb'))

ValueError: node array from the pickle has an incompatible dtype:
- expected: [('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]
- got     : {'names':['left_child','right_child','feature','threshold','impurity','n_node_samples','weighted_n_node_samples','missing_go_to_left'], 'formats':['<i8','<i8','<i8','<f8','<f8','<i8','<f8','u1'], 'offsets':[0,8,16,24,32,40,48,56], 'itemsize':64}

In [16]:
data.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'promotion_last_5years', 'Target_variable', 'dept_RandD',
       'dept_accounting', 'dept_hr', 'dept_management', 'dept_marketing',
       'dept_product_mng', 'dept_sales', 'dept_support', 'dept_technical',
       'salary_low', 'salary_medium'],
      dtype='object')

In [18]:
datat= pd.read_csv('test_data.csv')
datat.head(10)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,Target_variable,dept_RandD,dept_accounting,dept_hr,dept_management,dept_marketing,dept_product_mng,dept_sales,dept_support,dept_technical,salary_low,salary_medium
0,0.11,0.8,6,285,4,0,0,1,0,1,0,0,0,0,0,0,0,0,1
1,0.61,0.62,4,269,4,0,0,0,0,1,0,0,0,0,0,0,0,0,1
2,0.78,0.65,3,139,3,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,0.52,0.77,4,134,4,0,0,0,0,0,0,0,0,0,0,0,1,1,0
4,0.28,0.55,4,208,4,0,0,0,0,0,0,0,1,0,0,0,0,0,1
5,0.54,0.73,3,157,3,0,0,0,1,0,0,0,0,0,0,0,0,1,0
6,0.53,0.56,5,236,4,1,0,0,0,0,0,0,0,0,0,1,0,0,0
7,0.6,0.61,5,191,2,1,0,0,0,0,0,0,0,0,0,0,1,0,0
8,0.4,0.51,2,136,3,0,0,1,0,0,0,0,0,1,0,0,0,1,0
9,0.77,0.92,5,255,5,0,0,1,0,0,0,0,1,0,0,0,0,1,0


In [19]:
pickled_model.predict(scaler.transform(datat[0].reshape(1,-1)))
pickled_model

NameError: name 'pickled_model' is not defined