In [9]:
import pandas as pd

# Load the employee churn dataset; the `left` column (yes/no) is used as the target below.
df = pd.read_csv("employee_churn_data.csv")
df

Unnamed: 0,department,promoted,review,projects,salary,tenure,satisfaction,bonus,avg_hrs_month,left
0,operations,0,0.577569,3,low,5.0,0.626759,0,180.866070,no
1,operations,0,0.751900,3,medium,6.0,0.443679,0,182.708149,no
2,support,0,0.722548,3,medium,6.0,0.446823,0,184.416084,no
3,logistics,0,0.675158,4,high,8.0,0.440139,0,188.707545,no
4,sales,0,0.676203,3,high,5.0,0.577607,1,179.821083,no
...,...,...,...,...,...,...,...,...,...,...
9535,operations,0,0.610988,4,medium,8.0,0.543641,0,188.155738,yes
9536,logistics,0,0.746887,3,medium,8.0,0.549048,0,188.176164,yes
9537,operations,0,0.557980,3,low,7.0,0.705425,0,186.531008,yes
9538,IT,0,0.584446,4,medium,8.0,0.607287,1,187.641370,yes


In [10]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import KBinsDiscretizer, StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer as TfidfVec
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from imblearn.under_sampling import TomekLinks,EditedNearestNeighbours
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingClassifier
import warnings
warnings.filterwarnings('ignore')

def create_pipeline():
    """Build the (unfitted) preprocessing + stacked-classifier pipeline.

    Preprocessing (column transformer):
      * ``department``: one-hot encoded.
      * ``avg_hrs_month``: standard-scaled, and additionally binned (next item).
      * ``review``, ``satisfaction``, ``avg_hrs_month``: discretised into 5 bins.
      * ``salary``: ordinal-encoded with the order low < medium < high.
      * every remaining column is passed through unchanged.

    Model: a ``StackingClassifier`` whose base estimators are grid-searched
    k-NN and SVC models; their hard ``predict`` outputs are the inputs of a
    gradient-boosting final estimator.

    Returns:
        The pipeline object produced by ``make_pipeline``. It must be fitted on
        a DataFrame that contains the columns named above.
    """
    # Search space for the k-NN base estimator.
    knn_grid = {
        "n_neighbors": [1, 5, 10, 20],
        "leaf_size": [10, 30],
        # NOTE(review): with p < 1 the Minkowski "distance" is not a true
        # metric; confirm 0.1 is intended (1 and 2 are the usual choices).
        "p": [0.1, 1],
        "weights": ["uniform", "distance"],
    }
    gcv_knb = GridSearchCV(KNeighborsClassifier(), knn_grid, n_jobs=-1)

    # Search space for the SVC base estimator.
    svc_grid = {
        "gamma": ["scale", "auto"],
        "kernel": ["linear", "rbf"],
    }
    gcv_svc = GridSearchCV(SVC(random_state=0), svc_grid, n_jobs=-1)

    preprocessor = make_column_transformer(
        (OneHotEncoder(), ["department"]),
        (StandardScaler(), ["avg_hrs_month"]),
        (KBinsDiscretizer(n_bins=5), ["review", "satisfaction", "avg_hrs_month"]),
        (OrdinalEncoder(categories=[["low", "medium", "high"]]), ["salary"]),
        remainder="passthrough",
    )

    stacker = StackingClassifier(
        estimators=[("gcv_knb", gcv_knb), ("gcv_svc", gcv_svc)],
        stack_method="predict",
        n_jobs=-1,
        # Seed the final estimator as well (the SVC is already seeded) so that
        # repeated fits of the pipeline give the same model.
        final_estimator=GradientBoostingClassifier(random_state=0),
    )

    return make_pipeline(
        preprocessor,
        # Optional resampling steps, currently disabled:
        # TomekLinks(n_jobs=-1),
        # EditedNearestNeighbours(),
        # SMOTE(n_jobs=-1,random_state=0),
        stacker,
    )

# Build the unfitted pipeline; the bare name on the last line displays it.
pipeline = create_pipeline()
pipeline

In [11]:
from sklearn.model_selection import train_test_split

# Stratified 70/15/15 split on `left`: hold out 30% first, then halve that
# hold-out into validation and test sets. The fixed random_state keeps the
# partitions reproducible.
train_data, test_data = train_test_split(df, test_size=0.3, random_state=0, stratify=df['left'])
valid_data, test_data = train_test_split(test_data, test_size=0.5, random_state=0, stratify=test_data['left'])
# Row counts of the train / validation / test partitions.
train_data.shape[0], valid_data.shape[0], test_data.shape[0]

(6678, 1431, 1431)

## Check Class Imbalance
> จำนวนตัวอย่างของแต่ละ class ต่างกันมากกว่าเท่าตัว (no ≈ 71%, yes ≈ 29%) จึงคิดว่าเราควรทำ undersampling กับ class ที่มีจำนวนมากกว่า

In [12]:
# Absolute and relative class frequencies of the target in the training split.
train_data.left.value_counts(), train_data.left.value_counts(normalize= True)

(left
 no     4729
 yes    1949
 Name: count, dtype: int64,
 left
 no     0.708146
 yes    0.291854
 Name: proportion, dtype: float64)

In [13]:
# Separate the feature columns from the target column of the training split.
X_train = train_data.drop('left', axis = 'columns')
y_train = train_data.left
X_train

Unnamed: 0,department,promoted,review,projects,salary,tenure,satisfaction,bonus,avg_hrs_month
6623,engineering,0,0.500319,3,high,9.0,0.362436,1,190.679977
3367,sales,0,0.516208,4,medium,5.0,0.700604,0,180.017964
2251,retail,0,0.684981,2,medium,4.0,0.642808,0,176.981070
3966,operations,0,0.833750,3,medium,6.0,0.356837,1,182.414555
539,engineering,1,0.624454,3,medium,7.0,0.475971,0,184.614990
...,...,...,...,...,...,...,...,...,...
8327,operations,0,0.560597,3,medium,8.0,0.522305,0,187.792580
8569,retail,0,0.624207,5,medium,8.0,0.500062,0,187.828921
3031,finance,0,0.620011,4,medium,6.0,0.634181,0,183.640521
9133,support,0,0.746516,3,low,8.0,0.469786,1,189.460255


In [14]:
# from imblearn.under_sampling import RandomUnderSampler
# r = RandomUnderSampler(random_state=42)
# X_train, y_train = r.fit_resample(train_data, train_data.left)
# X_train.drop('left', inplace=True, axis= 1)

In [15]:
# Inspect the training labels.
y_train

6623     no
3367     no
2251     no
3966    yes
539      no
       ... 
8327    yes
8569    yes
3031     no
9133    yes
3048     no
Name: left, Length: 6678, dtype: object

In [16]:
# Fit the preprocessing steps and the stacked classifier (including the
# grid searches of its base estimators) on the training split.
pipeline.fit(X_train, y_train)

In [17]:
# Score on the training data itself — an optimistic figure, since the model
# was fitted on these rows; compare with the held-out test score below.
train_score = pipeline.score(X_train, y_train)
print(f"{train_score:,.3f}")

0.859


In [18]:
# Number of feature columns the final pipeline step (the stacking classifier)
# received after preprocessing.
pipeline.steps[-1][1].n_features_in_

31

In [19]:
# Evaluate on the held-out test split, which was not used for fitting.
X_test = test_data.drop(['left'], axis = 1)
y_test = test_data['left']
pipeline.score(X_test, y_test)

0.8483577917540182

In [20]:
import joblib

# Persist the fitted pipeline together with the distinct salary and department
# values; app.py loads these files to populate its dropdowns and to predict.
joblib.dump(pipeline, "pipeline.joblib")
joblib.dump(list(df['salary'].unique()), 'salary.joblib')
joblib.dump(list(df['department'].unique()), "departments.joblib")

['departments.joblib']

In [21]:
# Reload the artifacts written in the previous cell to confirm they round-trip
# from disk. Only files this notebook actually writes are loaded, so the cell
# also works on a fresh kernel / clean checkout.
# NOTE: joblib files are pickles — only load artifacts you created or trust.
pipeline = joblib.load("pipeline.joblib")
departments = joblib.load("departments.joblib")
salary = joblib.load("salary.joblib")
pipeline

In [22]:
%%writefile app.py
# !pip install gradio ipywidgets
import pandas as pd
import gradio as gr
import joblib

# "Artifacts"
# Fitted model and the category values offered by the dropdowns. Only the
# files written by the training notebook are loaded here.
# NOTE: joblib files are pickles — only load artifacts you created or trust.
pipeline = joblib.load("pipeline.joblib")
departments_list = joblib.load("departments.joblib")
salary_list = joblib.load("salary.joblib")
# Maps the Yes/No radio choices to the 0/1 encoding used in the training data.
boolean_dict = {'Yes':1,'No':0}


def predict(department, promoted, review, projects, salary, tenure, satisfaction, bonus, avg_hrs_month):
    """Predict the `left` label for one employee described by the form values.

    Args:
        department, salary: category strings as offered by the dropdowns.
        promoted, bonus: ``'Yes'`` or ``'No'``; mapped to 1/0 via ``boolean_dict``.
        review, projects, tenure, satisfaction, avg_hrs_month: numeric inputs.

    Returns:
        The predicted label of the single sample as a plain string (rather
        than the one-element array returned by ``pipeline.predict``), so the
        output textbox shows just the label.

    Raises:
        gr.Error: if a field was left empty, or a Yes/No field holds any other
            value. Gradio shows the message to the user instead of failing
            with an opaque KeyError / estimator error.
    """
    # Keys are in the same order as the feature columns used for training.
    raw = {
        'department': department,
        'promoted': promoted,
        'review': review,
        'projects': projects,
        'salary': salary,
        'tenure': tenure,
        'satisfaction': satisfaction,
        'bonus': bonus,
        'avg_hrs_month': avg_hrs_month,
    }

    # Components without a value deliver None; reject those up front.
    missing = [field for field, value in raw.items() if value is None]
    if missing:
        raise gr.Error(f"Please provide a value for: {', '.join(missing)}")

    for field in ('promoted', 'bonus'):
        if raw[field] not in boolean_dict:
            raise gr.Error(f"'{field}' must be one of: {', '.join(boolean_dict)}")

    # Replacing existing keys keeps their position, so column order is unchanged.
    sample = dict(raw, promoted=boolean_dict[promoted], bonus=boolean_dict[bonus])

    prediction = pipeline.predict(pd.DataFrame([sample]))
    return str(prediction[0])

# https://www.gradio.app/guides
with gr.Blocks() as blocks:
    # Input widgets, one per feature expected by `predict`. Every choice
    # widget starts with a selected value so an untouched form never submits
    # None for it.
    department = gr.Dropdown(departments_list, value=departments_list[0], label="Department")
    promoted = gr.Radio(['Yes','No'], value='No', label = "promoted",info = "is they promoted?")
    review = gr.Number(label="Review", value=0.0, minimum=0.0, maximum= 1.0,step=0.1,info="range(0-1)")
    projects = gr.Number(label="Projects", value=1, minimum=0, step=1)
    salary = gr.Dropdown(salary_list, value=salary_list[0], label="salary")
    tenure = gr.Number(label="Tenure", value=1, minimum=0)
    satisfaction = gr.Number(label="Satisfaction", value=0.0, minimum=0.0, maximum= 1.0,step=0.1,info="range(0-1)")
    bonus = gr.Radio(['Yes','No'], value='No', label = "bonus",info = "is they get bonus?")
    avg_hrs_month = gr.Number(label="Average hours/month", minimum=0)
    predict_btn = gr.Button("Predict")
    left = gr.Text(label="left")

    # The order of `inputs` must match the positional parameters of `predict`.
    inputs = [department, promoted, review, projects, salary, tenure, satisfaction, bonus, avg_hrs_month]
    outputs = left

    predict_btn.click(predict, inputs=inputs, outputs=outputs)
    # Debug aid: echoes the output component ("textbox") while the UI is built.
    print(outputs)


if __name__ == "__main__":
    blocks.launch() # Local machine only
    # blocks.launch(server_name="0.0.0.0") # LAN access to local machine
    # blocks.launch(share=True) # Public access to local machine

Overwriting app.py


In [23]:
%run app.py

textbox
Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.


# **Performance**

In [24]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation over the full dataset. cross_val_score fits clones
# of the estimator, so the already-fitted `pipeline` object is left untouched.
# NOTE(review): `df` also contains the rows used as the test split above —
# confirm that evaluating on the whole frame is intended.
score_list = cross_val_score(pipeline,df.drop(columns=['left']),df['left'],cv=5,n_jobs=-1)
score_list

array([0.80083857, 0.79402516, 0.83595388, 0.85953878, 0.87368973])

In [25]:
# Summarise the cross-validation scores as mean +- standard deviation.
print(f"mean score : {score_list.mean():.3f} +- {score_list.std():.3f}")

mean score : 0.833 +- 0.031


In [26]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(y_test,pipeline.predict(X_test)))


              precision    recall  f1-score   support

          no       0.86      0.93      0.90      1014
         yes       0.80      0.65      0.71       417

    accuracy                           0.85      1431
   macro avg       0.83      0.79      0.80      1431
weighted avg       0.84      0.85      0.84      1431

