### **Importing Data**

In [32]:
from google.colab import files
uploades = files.upload()

Saving Employee.csv to Employee (1).csv


In [1]:
import pandas as pd

data = pd.read_csv('Employee.csv')
print(data.columns)
print(data.head())


Index(['Education', 'JoiningYear', 'City', 'PaymentTier', 'Age', 'Gender',
       'EverBenched', 'ExperienceInCurrentDomain', 'LeaveOrNot'],
      dtype='object')
   Education  JoiningYear       City  PaymentTier  Age  Gender EverBenched  \
0  Bachelors         2017  Bangalore            3   34    Male          No   
1  Bachelors         2013       Pune            1   28  Female          No   
2  Bachelors         2014  New Delhi            3   38  Female          No   
3    Masters         2016  Bangalore            3   27    Male          No   
4    Masters         2017       Pune            3   24    Male         Yes   

   ExperienceInCurrentDomain  LeaveOrNot  
0                          0           0  
1                          3           1  
2                          2           0  
3                          5           1  
4                          2           1  


In [2]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

### **Data Cleaning**

In [3]:
data.describe()

Unnamed: 0,JoiningYear,PaymentTier,Age,ExperienceInCurrentDomain,LeaveOrNot
count,4653.0,4653.0,4653.0,4653.0,4653.0
mean,2015.06297,2.698259,29.393295,2.905652,0.343864
std,1.863377,0.561435,4.826087,1.55824,0.475047
min,2012.0,1.0,22.0,0.0,0.0
25%,2013.0,3.0,26.0,2.0,0.0
50%,2015.0,3.0,28.0,3.0,0.0
75%,2017.0,3.0,32.0,4.0,1.0
max,2018.0,3.0,41.0,7.0,1.0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4653 entries, 0 to 4652
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Education                  4653 non-null   object
 1   JoiningYear                4653 non-null   int64 
 2   City                       4653 non-null   object
 3   PaymentTier                4653 non-null   int64 
 4   Age                        4653 non-null   int64 
 5   Gender                     4653 non-null   object
 6   EverBenched                4653 non-null   object
 7   ExperienceInCurrentDomain  4653 non-null   int64 
 8   LeaveOrNot                 4653 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 327.3+ KB


In [5]:
data['LeaveOrNot'] = data['LeaveOrNot'].map({1: 'Yes', 0: 'No'})

In [6]:
# Splitting features and target variable
X = data.drop(columns=['LeaveOrNot'])
y = data['LeaveOrNot']

categorical_features = X.select_dtypes(include=['object']).columns
categorical_features = list(categorical_features.difference(['LeaveOrNot']))
print('\n', 'Categorical Features', '\n', categorical_features, '\n')

numerical_features = list(X.select_dtypes(include=[np.float64, np.int64]))
print('\n', 'Numerical Features', '\n', numerical_features, '\n')


 Categorical Features 
 ['City', 'Education', 'EverBenched', 'Gender'] 


 Numerical Features 
 ['JoiningYear', 'PaymentTier', 'Age', 'ExperienceInCurrentDomain'] 



In [7]:
from sklearn.model_selection import train_test_split

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print('Train Data', '\n', y_train.value_counts(normalize=True), '\n', '\n', 'Test Data', '\n', y_test.value_counts(normalize=True))


Train Data 
 LeaveOrNot
No     0.656368
Yes    0.343632
Name: proportion, dtype: float64 
 
 Test Data 
 LeaveOrNot
No     0.655209
Yes    0.344791
Name: proportion, dtype: float64


In [8]:
def summarize_cat(data, categorical_features):
    results = []

    for column in data[categorical_features]:
        # Get the unique members of the column
        members = data[column].unique().tolist()

        # Append the column name and its unique members to the results list
        results.append([column, members])

    return pd.DataFrame(results, columns=['Column Name', 'Members'])

# Create a DataFrame from the results list
summarize_cat(X_train, categorical_features)

Unnamed: 0,Column Name,Members
0,City,"[New Delhi, Bangalore, Pune]"
1,Education,"[Masters, Bachelors, PHD]"
2,EverBenched,"[No, Yes]"
3,Gender,"[Male, Female]"


In [9]:
# EXPORTING FOR DE
my_feature_dict = {'CATEGORICAL' : summarize_cat(data, categorical_features).to_dict(),
                  'NUMERICAL' : {'Column Name' : numerical_features}}
my_feature_dict

{'CATEGORICAL': {'Column Name': {0: 'City',
   1: 'Education',
   2: 'EverBenched',
   3: 'Gender'},
  'Members': {0: ['Bangalore', 'Pune', 'New Delhi'],
   1: ['Bachelors', 'Masters', 'PHD'],
   2: ['No', 'Yes'],
   3: ['Male', 'Female']}},
 'NUMERICAL': {'Column Name': ['JoiningYear',
   'PaymentTier',
   'Age',
   'ExperienceInCurrentDomain']}}

In [10]:
import pickle

# save dictionary to person_data.pkl file
with open('my_feature_dict.pkl', 'wb') as fp:
    pickle.dump(my_feature_dict, fp)
    print('dictionary saved successfully to file')

# dictionary saved successfully to file

dictionary saved successfully to file


In [11]:
from sklearn.pipeline import Pipeline

# PREPROCESSING TRANSFORMATIONS ARE DONE ON EXAMPLE BASIS
# REAL WORLD SELECTION OF PREPROCSSING TRANSFORMATIONS MUST BE LOGICAL

# transform_senior_citizen = lambda x: x.assign(SENIORCITIZEN=x['SENIORCITIZEN'].map({1: 'Yes', 0: 'No'}))

from sklearn.preprocessing import FunctionTransformer

# preprocessor_stage_1 = Pipeline(steps=[
#     ('transform_sc', FunctionTransformer(transform_senior_citizen)),
# ])

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

pipeline_num = Pipeline(steps=[
    ('scale_data', StandardScaler()),
    ('simple_imputer1', SimpleImputer(strategy='constant',fill_value=0)),
])

from sklearn.preprocessing import OneHotEncoder

pipeline_cat = Pipeline(steps=[
    ('OneHotEncode', OneHotEncoder(handle_unknown="ignore"))
])

from sklearn.compose import ColumnTransformer

preprocessor_stage_2 = ColumnTransformer(
    transformers=[
        ('cat', pipeline_cat, categorical_features),  # Categorical columns
        ('num', pipeline_num, numerical_features),     # Numerical columns
    ],remainder='drop')

preprocessor_stack = Pipeline(steps=[
    # ('preprocessor_stage_1', preprocessor_stage_1),
    ('preprocessor_stage_2', preprocessor_stage_2)
])

preprocessor_stack

# BECAUSE WE DIDN'T SPECIFY CUSTOMERID IN ANY OF CATEGORICAL OR NUMERICAL FEATURES (REMAINDER='drop') REMOVE IT OUT OF PIPELINE

In [12]:
preprocessor_stack.fit(X_train)


In [13]:

pd.DataFrame(preprocessor_stack.transform(X_train),columns=preprocessor_stack[-1].get_feature_names_out())

Unnamed: 0,cat__City_Bangalore,cat__City_New Delhi,cat__City_Pune,cat__Education_Bachelors,cat__Education_Masters,cat__Education_PHD,cat__EverBenched_No,cat__EverBenched_Yes,cat__Gender_Female,cat__Gender_Male,num__JoiningYear,num__PaymentTier,num__Age,num__ExperienceInCurrentDomain
0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,-1.109740,0.539424,0.117526,-1.861550
1,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,-1.645675,0.539424,-0.914641,0.061305
2,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.034001,-1.226396,-0.088907,-0.579646
3,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,-1.645675,0.539424,-1.121074,-0.579646
4,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.034001,-1.226396,1.149693,-0.579646
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3717,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,-0.573805,0.539424,0.530393,-1.220598
3718,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,-1.109740,0.539424,-0.708208,0.702257
3719,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.498065,0.539424,1.975427,-1.220598
3720,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,-0.037870,0.539424,2.388294,-1.220598


In [14]:
from sklearn.ensemble import RandomForestClassifier

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_stack),
    ('classifier', RandomForestClassifier())
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

In [15]:
# Checking Training Accuracy
y_train_pred = pipeline.predict(X_train)

# Evaluate the model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Accuracy:",  accuracy_score(y_train, y_train_pred))
print("\nClassification Report:\n", classification_report(y_train, y_train_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_train, y_train_pred))

Accuracy: 0.9277270284793122

Classification Report:
               precision    recall  f1-score   support

          No       0.92      0.97      0.95      2443
         Yes       0.94      0.84      0.89      1279

    accuracy                           0.93      3722
   macro avg       0.93      0.91      0.92      3722
weighted avg       0.93      0.93      0.93      3722


Confusion Matrix:
 [[2378   65]
 [ 204 1075]]


In [16]:
# CREATING A TEST
my_pred_array=X_test.iloc[15:16:]
my_pred_array

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain
1128,Bachelors,2015,Pune,2,26,Female,No,4


In [17]:
pd.DataFrame(preprocessor_stack.transform(my_pred_array),columns=preprocessor_stack[0].get_feature_names_out())


Unnamed: 0,cat__City_Bangalore,cat__City_New Delhi,cat__City_Pune,cat__Education_Bachelors,cat__Education_Masters,cat__Education_PHD,cat__EverBenched_No,cat__EverBenched_Yes,cat__Gender_Female,cat__Gender_Male,num__JoiningYear,num__PaymentTier,num__Age,num__ExperienceInCurrentDomain
0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,-0.03787,-1.226396,-0.708208,0.702257


In [18]:
# USING PIPELINE TO DO ALL TOGETHER (PREPROCESSING FOLLOWED BY MODEL PREDICT)
# SINGLE PREDICTION
y_pred = pipeline.predict(my_pred_array)
y_pred

array(['Yes'], dtype=object)

In [19]:
!pip install dill



In [20]:
import dill

# save trained pipeline file
with open('pipeline.pkl', 'wb') as file:
    dill.dump(pipeline, file)

print('pipeline saved successfully to file')

pipeline saved successfully to file


In [21]:

!pip install -q streamlit

In [24]:
%%writefile app.py
import streamlit as st
import pandas as pd
from joblib import load
import dill

# Load the pretrained model
with open('pipeline.pkl', 'rb') as file:
    model = dill.load(file)

my_feature_dict = load('my_feature_dict.pkl')

# Function to predict churn
def predict_churn(data):
    prediction = model.predict(data)
    return prediction

st.title('Employee Churn Prediction App')
st.subheader('Based on Employee Dataset')

# Display categorical features
st.subheader('Categorical Features')
categorical_input = my_feature_dict.get('CATEGORICAL')
categorical_input_vals = {}
for i, col in enumerate(categorical_input.get('Column Name').values()):
    categorical_input_vals[col] = st.selectbox(col, categorical_input.get('Members')[i])

# Load numerical features
numerical_input = my_feature_dict.get('NUMERICAL')

# Display numerical features
st.subheader('Numerical Features')
numerical_input = my_feature_dict.get('NUMERICAL')
numerical_input_vals = {}
for col in numerical_input.get('Column Name'):
    numerical_input_vals[col] = st.number_input(col)

# Combine numerical and categorical input dicts
input_data = dict(list(categorical_input_vals.items()) + list(numerical_input_vals.items()))

input_data= pd.DataFrame.from_dict(input_data, orient='index').T

# Churn Prediction
if st.button('Predict'):
    prediction = predict_churn(input_data)[0]
    translation_dict = {'Yes':'Expected','No':'Not Expected'}
    prediction_translate = translation_dict.get(prediction)
    st.write(f'The Prediction is **{prediction}**, Hence Employee is **{prediction_translate}** to churn.')

st.subheader('Created by Muhammad Talal Saeed')




Overwriting app.py


In [26]:
!pip install streamlit



In [None]:
!streamlit run app.py 

[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://192.168.195.218:8501[0m
[0m


In [None]:
!wget https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64
!chmod +x cloudflared-linux-amd64

# import subprocess
# subprocess.Popen(["./cloudflared-linux-amd64", "tunnel", "--url", "http://localhost:8501"])

!nohup /content/cloudflared-linux-amd64 tunnel --url http://localhost:8501 &

In [None]:
!grep -o 'https://.*.trycloudflare.com' nohup.out | head -n 1 | xargs -I {} echo "Your tunnel url {}"


In [None]:

!streamlit run /content/app.py &> /content/logs.txt &

In [None]:
import os
import streamlit as st

st.write(os.getcwd())

In [None]:
!pip freeze