In [1]:
import pandas as pd

df = pd.read_csv("employee_churn_data.csv")
df

Unnamed: 0,department,promoted,review,projects,salary,tenure,satisfaction,bonus,avg_hrs_month,left
0,operations,0,0.577569,3,low,5.0,0.626759,0,180.866070,no
1,operations,0,0.751900,3,medium,6.0,0.443679,0,182.708149,no
2,support,0,0.722548,3,medium,6.0,0.446823,0,184.416084,no
3,logistics,0,0.675158,4,high,8.0,0.440139,0,188.707545,no
4,sales,0,0.676203,3,high,5.0,0.577607,1,179.821083,no
...,...,...,...,...,...,...,...,...,...,...
9535,operations,0,0.610988,4,medium,8.0,0.543641,0,188.155738,yes
9536,logistics,0,0.746887,3,medium,8.0,0.549048,0,188.176164,yes
9537,operations,0,0.557980,3,low,7.0,0.705425,0,186.531008,yes
9538,IT,0,0.584446,4,medium,8.0,0.607287,1,187.641370,yes


In [2]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(df, test_size=0.3, random_state=0, stratify=df['left'])
valid_data, test_data = train_test_split(test_data, test_size=0.5, random_state=0, stratify=test_data['left'])
train_data.shape[0], valid_data.shape[0], test_data.shape[0]

(6678, 1431, 1431)

In [3]:
X_train = train_data.drop(columns= 'left', axis = 'columns')
y_train = train_data['left']


In [4]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import KBinsDiscretizer, StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer as TfidfVec
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

def create_pipeline():
    
    grid = {
        "n_neighbors": [1, 5, 10, 20],
        "algorithm": ['auto'],
        "leaf_size": [10],
        "p": [0.01, 0.1],
    }
    gcv_knb = GridSearchCV(KNeighborsClassifier(), grid, n_jobs=-1)

    grid = {
        "gamma": ["scale", "auto"],
    }
    gcv_svc = GridSearchCV(SVC(), grid, n_jobs=-1)

    # estimators = [('gcv_svc', gcv_svc)]
    estimators = [('gcv_knb', gcv_knb), ('gcv_svc', gcv_svc)]
    return make_pipeline(
        make_column_transformer(
            (OneHotEncoder(), ["department"]),
            (StandardScaler(), ['avg_hrs_month']),
            (KBinsDiscretizer(n_bins= 5), ["review", "satisfaction", "avg_hrs_month"]),
            (OrdinalEncoder(), ['salary']),
            remainder="passthrough",
        ),
        StackingClassifier(estimators=estimators, stack_method='predict')
    )

pipeline = create_pipeline()
pipeline

In [5]:
pipeline.fit(X_train, y_train)

In [6]:
train_score = pipeline.score(X_train, y_train)
print(f"{train_score:,.3f}")

0.856


In [7]:
pipeline.steps[-1][1].n_features_in_

31

In [8]:
X_test = test_data.drop('left', axis = 1)
y_test = test_data['left']
pipeline.score(X_test, y_test)

0.8490566037735849

In [9]:
import joblib

joblib.dump(pipeline, "pipeline.joblib")
joblib.dump(list(df['salary'].unique()), 'salary.joblib')
joblib.dump(list(df['department'].unique()), "departments.joblib")

['departments.joblib']

In [10]:
pipeline = joblib.load("pipeline.joblib")
label_pipeline = joblib.load("label_pipeline.joblib")
departments = joblib.load("departments.joblib")
salary = joblib.load("salary.joblib")
pipeline

In [39]:
%%writefile app.py
# !pip install gradio ipywidgets
import pandas as pd
import gradio as gr
import joblib

# "Artifacts"
pipeline = joblib.load("pipeline.joblib")
# label_pipeline = joblib.load("label_pipeline.joblib")
departments_list = joblib.load("departments.joblib")
salary_list = joblib.load("salary.joblib")
boolean_dict = {'Yes':1,'No':0}


def predict(department, promoted, review, projects, salary, tenure, satisfaction, bonus, avg_hrs_month):
    sample = dict()
    sample['department'] = department
    sample['promoted'] = boolean_dict[promoted]
    sample['review'] = review
    sample['projects'] = projects
    sample['salary'] = salary
    sample['tenure'] = tenure
    sample['satisfaction'] = satisfaction
    sample['bonus'] = boolean_dict[bonus]
    sample['avg_hrs_month'] = avg_hrs_month

    left = pipeline.predict(pd.DataFrame([sample]))
    return left

# https://www.gradio.app/guides
with gr.Blocks() as blocks:
    department = gr.Dropdown(departments_list, value=departments_list[0], label="Department")
    promoted = gr.Radio(['Yes','No'],label = "promoted",info = "is they promoted?")
    review = gr.Number(label="Review", value=0.0, minimum=0.0, maximum= 1.0,step=0.1,info="range(0-1)")
    projects = gr.Number(label="Projects", value=1, minimum=0, step=1)
    salary = gr.Dropdown(salary_list, value=salary_list[0], label="salary")
    tenure = gr.Number(label="Tenure", value=1, minimum=0)
    satisfaction = gr.Number(label="Satisfaciton", value=0.0, minimum=0.0, maximum= 1.0,step=0.1,info="range(0-1)")
    bonus = gr.Radio(['Yes','No'],label = "bonus",info = "is they get bonus?")
    avg_hrs_month = gr.Number(label="Average hours/month", minimum=0)
    left = gr.Text(label="left")

    inputs = [department, promoted, review, projects, salary, tenure, satisfaction, bonus, avg_hrs_month]
    outputs = left

    predict_btn = gr.Button("Predict")
    predict_btn.click(predict, inputs=inputs, outputs=outputs)

    

if __name__ == "__main__":
    blocks.launch() # Local machine only
    # blocks.launch(server_name="0.0.0.0") # LAN access toci local machine
    # blocks.launch(share=True) # Public access to local machine

Overwriting app.py


In [40]:
%run app.py

Running on local URL:  http://127.0.0.1:7870

To create a public link, set `share=True` in `launch()`.
