### **Importing Data**

In [37]:
## Importing Required Libraries
import pandas as pd

## Reading the dataset
df = pd.read_csv('Employee.csv')
print(df.head())

   Education  JoiningYear       City  PaymentTier  Age  Gender EverBenched  \
0  Bachelors         2017  Bangalore            3   34    Male          No   
1  Bachelors         2013       Pune            1   28  Female          No   
2  Bachelors         2014  New Delhi            3   38  Female          No   
3    Masters         2016  Bangalore            3   27    Male          No   
4    Masters         2017       Pune            3   24    Male         Yes   

   ExperienceInCurrentDomain  LeaveOrNot  
0                          0           0  
1                          3           1  
2                          2           0  
3                          5           1  
4                          2           1  


In [38]:
# ## CHECK AND DROP DUPLICATES AND RECALL MORE DATA CLEANING STEPS WHILE REQUIRED (DONT DROP NA)
# df2 = df.duplicated()
# print("before duplicate drop:",df.duplicated().count())

# # print(df[df2].count()) # 1889 duplicate found
# # df[df['default']=='unknown']
# df.drop_duplicates(inplace=True)
# print("after duplicate drop:",df.duplicated().count())

In [39]:
df.info() # checking null , total , dtypes, col_name,data shape
# df.isnull().sum() # no null value found!

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4653 entries, 0 to 4652
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Education                  4653 non-null   object
 1   JoiningYear                4653 non-null   int64 
 2   City                       4653 non-null   object
 3   PaymentTier                4653 non-null   int64 
 4   Age                        4653 non-null   int64 
 5   Gender                     4653 non-null   object
 6   EverBenched                4653 non-null   object
 7   ExperienceInCurrentDomain  4653 non-null   int64 
 8   LeaveOrNot                 4653 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 327.3+ KB


In [40]:
df['LeaveOrNot'] = df['LeaveOrNot'].map({1: 'Leave', 0: 'Stay'})

In [42]:
# split data into feature and target

X = df.drop(columns='LeaveOrNot')
target_feature = pd.DataFrame(df['LeaveOrNot'])
target_feature


Unnamed: 0,LeaveOrNot
0,Stay
1,Leave
2,Stay
3,Leave
4,Leave
...,...
4648,Stay
4649,Leave
4650,Leave
4651,Stay


In [44]:
cat_feat = X.select_dtypes(include=[object]).columns
cat_feat = list(cat_feat.difference(['LeaveOrNot'])) 
print(f'{cat_feat}')

['City', 'Education', 'EverBenched', 'Gender']


In [45]:
num_feat = X.select_dtypes(exclude=[object]).columns
num_feat = list(num_feat.difference(['LeaveOrNot']))
print(f'{num_feat}')

['Age', 'ExperienceInCurrentDomain', 'JoiningYear', 'PaymentTier']


In [46]:
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets 80-20
X_train, X_test, y_train, y_test = train_test_split(X, target_feature, test_size=0.2, random_state=42)

In [47]:
print('\n','Data Shape',X_train.shape,X_test.shape,'\n','Train_data', y_train.value_counts(normalize=True),'\n','Test_data', y_test.value_counts(normalize=True))


 Data Shape (3722, 8) (931, 8) 
 Train_data LeaveOrNot
Stay          0.656368
Leave         0.343632
Name: proportion, dtype: float64 
 Test_data LeaveOrNot
Stay          0.655209
Leave         0.344791
Name: proportion, dtype: float64


EXPORTING FEATURE INPUT METADATA

In [49]:
def summarize_cat(data,categorical_features):
  results=[]

  for column in data[categorical_features]:
      # Get the unique members of the column
      members = data[column].unique().tolist()
      # Append the column name and its unique members to the results list
      results.append([column, members])

  return pd.DataFrame(results, columns=['Column Name', 'Members'])

# Create a DataFrame from the results list
summarize_cat(X_train,cat_feat)

Unnamed: 0,Column Name,Members
0,City,"[New Delhi, Bangalore, Pune]"
1,Education,"[Masters, Bachelors, PHD]"
2,EverBenched,"[No, Yes]"
3,Gender,"[Male, Female]"


In [50]:
summarize_cat(X_train,cat_feat).to_dict()# EXPORTING FOR DE

my_feature_dict = {'CATEGORICAL' : summarize_cat(df,cat_feat).to_dict(), 'NUMERICAL' : {'Column Name': num_feat}}

my_feature_dict

{'CATEGORICAL': {'Column Name': {0: 'City',
   1: 'Education',
   2: 'EverBenched',
   3: 'Gender'},
  'Members': {0: ['Bangalore', 'Pune', 'New Delhi'],
   1: ['Bachelors', 'Masters', 'PHD'],
   2: ['No', 'Yes'],
   3: ['Male', 'Female']}},
 'NUMERICAL': {'Column Name': ['Age',
   'ExperienceInCurrentDomain',
   'JoiningYear',
   'PaymentTier']}}

In [51]:
import pickle

# save dictionary to person_data.pkl file
with open('my_feature_dict.pkl', 'wb') as fp:
    pickle.dump(my_feature_dict, fp)
    print('dictionary saved successfully to file')

dictionary saved successfully to file


CREATING THE PIPELINE


In [58]:
from sklearn.pipeline import Pipeline

# PREPROCESSING TRANSFORMATIONS ARE DONE ON EXAMPLE BASIS
# REAL WORLD SELECTION OF PREPROCSSING TRANSFORMATIONS MUST BE LOGICAL

# transform_leaveOrNot = lambda x: x.assign( LeaveOrNot=x['LeaveOrNot'].map({1: 'Yes', 0: 'No'}))

# from sklearn.preprocessing import FunctionTransformer

# preprocessor_stage_1 = Pipeline(steps=[
#     ('transform_sc', FunctionTransformer(transform_leaveOrNot)),
# ])

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

pipeline_num = Pipeline(steps=[
    ('scale_data', StandardScaler()),
    ('simple_imputer1', SimpleImputer(strategy='constant',fill_value=0)),
])

from sklearn.preprocessing import OneHotEncoder

pipeline_cat = Pipeline(steps=[
    ('OneHotEncode', OneHotEncoder(handle_unknown="ignore"))
])

from sklearn.compose import ColumnTransformer

preprocessor_stage_1 = ColumnTransformer(
    transformers=[
        ('cat', pipeline_cat, cat_feat),  # Categorical columns
        ('num', pipeline_num, num_feat),     # Numerical columns
    ],remainder='drop') # BECAUSE WE DIDN'T SPECIFY CUSTOMERID IN ANY OF CATEGORICAL OR NUMERICAL FEATURES (REMAINDER='drop') REMOVE IT OUT OF PIPELINE

preprocessor_stack = Pipeline(steps=[
    
    ('preprocessor_stage_1', preprocessor_stage_1)
])



In [59]:
preprocessor_stack

In [60]:
preprocessor_stack.fit(X_train)

In [61]:
pd.DataFrame(preprocessor_stack.transform(X_train),columns=preprocessor_stack[-1].get_feature_names_out())

Unnamed: 0,cat__City_Bangalore,cat__City_New Delhi,cat__City_Pune,cat__Education_Bachelors,cat__Education_Masters,cat__Education_PHD,cat__EverBenched_No,cat__EverBenched_Yes,cat__Gender_Female,cat__Gender_Male,num__Age,num__ExperienceInCurrentDomain,num__JoiningYear,num__PaymentTier
0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.117526,-1.861550,-1.109740,0.539424
1,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,-0.914641,0.061305,-1.645675,0.539424
2,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,-0.088907,-0.579646,1.034001,-1.226396
3,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,-1.121074,-0.579646,-1.645675,0.539424
4,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.149693,-0.579646,1.034001,-1.226396
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3717,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.530393,-1.220598,-0.573805,0.539424
3718,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,-0.708208,0.702257,-1.109740,0.539424
3719,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.975427,-1.220598,0.498065,0.539424
3720,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,2.388294,-1.220598,-0.037870,0.539424


In [62]:
from sklearn.ensemble import RandomForestClassifier

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_stack),
    ('classifier', RandomForestClassifier())
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [63]:
# Checking Training Accuracy
y_train_pred = pipeline.predict(X_train)

# Evaluate the model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Accuracy:", accuracy_score(y_train,y_train_pred))
print("\nClassification Report:\n", classification_report(y_train,y_train_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_train,y_train_pred))

Accuracy: 0.9279957012358947

Classification Report:
               precision    recall  f1-score   support

       Leave       0.95      0.84      0.89      1279
        Stay       0.92      0.98      0.95      2443

    accuracy                           0.93      3722
   macro avg       0.93      0.91      0.92      3722
weighted avg       0.93      0.93      0.93      3722


Confusion Matrix:
 [[1069  210]
 [  58 2385]]


In [73]:
# CREATING A TEST

my_pred_array=X_test.iloc[16:17:]

my_pred_array

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain
230,Bachelors,2014,Pune,3,24,Male,No,2


In [75]:
pd.DataFrame(preprocessor_stack.transform(my_pred_array),columns=preprocessor_stack[0].get_feature_names_out())

Unnamed: 0,cat__City_Bangalore,cat__City_New Delhi,cat__City_Pune,cat__Education_Bachelors,cat__Education_Masters,cat__Education_PHD,cat__EverBenched_No,cat__EverBenched_Yes,cat__Gender_Female,cat__Gender_Male,num__Age,num__ExperienceInCurrentDomain,num__JoiningYear,num__PaymentTier
0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,-1.121074,-0.579646,-0.573805,0.539424


In [76]:
# USING PIPELINE TO DO ALL TOGHETHER (PREPROCESSING FOLLOWED BY MODEL PREDICT)

# SINGLE PREDICTION

y_pred = pipeline.predict(my_pred_array)

y_pred

array(['Stay'], dtype=object)

In [None]:
# !pip install dill

Collecting dill
  Downloading dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Downloading dill-0.4.0-py3-none-any.whl (119 kB)
   ---------------------------------------- 0.0/119.7 kB ? eta -:--:--
   ---------- ----------------------------- 30.7/119.7 kB 1.4 MB/s eta 0:00:01
   ------------- ------------------------- 41.0/119.7 kB 393.8 kB/s eta 0:00:01
   ----------------------------------- -- 112.6/119.7 kB 939.4 kB/s eta 0:00:01
   -------------------------------------- 119.7/119.7 kB 781.6 kB/s eta 0:00:00
Installing collected packages: dill
Successfully installed dill-0.4.0



[notice] A new release of pip is available: 24.1.1 -> 25.1.1
[notice] To update, run: C:\Users\sarfaraz\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [72]:
import dill

# save trained pipeline file

with open('pipeline.pkl', 'wb') as file:
    dill.dump(pipeline, file)

print('pipeline saved successfully to file')

pipeline saved successfully to file
