In [None]:
!pip install --q imbalanced-learn
!pip install fosforml 
!pip install fosforio
!pip install seaborn

In [1]:
# Statistics and basic data support
from scipy.stats.mstats import winsorize
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Model training and tuning
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,precision_score, recall_score, f1_score,log_loss, roc_auc_score,roc_curve, precision_recall_curve
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Imbalance Handling
from imblearn.over_sampling import SMOTE

# Additional
from collections import Counter
import joblib

# Fosforml
from fosforml import register_model
from fosforml.constants import MLModelFlavours
from fosforml.model_manager.snowflakesession import get_session

# Warnings
import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

In [2]:
from fosforml.model_manager.snowflakesession import get_session
# Obtain the session object from fosforml's snowflakesession helper;
# it is used below to run the SQL query that loads the dataset.
my_session = get_session()

In [3]:
# Name of the source table to query.
# NOTE(review): `df` is rebound to a DataFrame a few cells below, so re-running
# the query cell after that point would format the DataFrame into the SQL text.
df = 'EMPLOYEE_DATASET'

In [4]:
# Select every column of the table whose name is currently held in `df`
sf_df = my_session.sql(f"select * from {df}")

In [5]:
# Convert the query result via to_pandas(); this rebinds `df`, which held the
# table name until now.
df = sf_df.to_pandas()

In [6]:
# Count missing values per column
print(df.isna().sum())

BIRTH_YEAR                             0
AGE                                    0
QUALIFICATION                          0
ETHNICITY                              0
MARITAL_STATUS                         0
GENDER                                 0
CONTINENT                              0
COUNTRY                                0
STATE                                  0
CITY                                   0
LATITUDE                               0
LONGITUDE                              0
EMPLOYEE_ID                            0
DISTANCE                               0
ROLE                                   0
LINE_OF_BUSINESS                       0
DELIVERY_UNIT                          0
PRACTICE_UNIT                          0
EMPLOYMENT_TYPE                        0
TURNOVER_REASONS                       0
SHIFT                                  0
SALARY_INR                             0
SALARY_RANGE                           0
SALARY_LEVELS                          0
JOB_SATISFACTION

In [7]:
# Keep the loaded data reachable under a descriptive name.
# Note: this binds a second name to the same object; it is not a copy.
Original_df = df

In [8]:
# Display the loaded frame as the cell output (the rendering below is truncated).
Original_df

Unnamed: 0,BIRTH_YEAR,AGE,QUALIFICATION,ETHNICITY,MARITAL_STATUS,GENDER,CONTINENT,COUNTRY,STATE,CITY,...,SALARY_LEVELS,JOB_SATISFACTION,AVERAGE_PERCENTAGE_SALARY_HIKE,AVERAGE_PERFORMANCE_RATING,OVER_TIME,OVERTIME_HOURS,JOB_STARTDATE,JOB_ENDDATE,CHURN,TENURE_MONTHS
0,1985,39,Masters Degree,Asian,Single,Female,Asia-Pacific,Australia,Victoria,Melbourne,...,Lead/Managerial Level,High,13,1,No,4,2005-02-15,2012-02-03,1,84
1,1963,61,Associates Degree,Caucasian,Single,Male,North America,Canada,Ontario,Mississauga,...,Lead/Managerial Level,Low,10,1,No,1,2009-04-02,2024-04-16,1,180
2,1993,31,Professional Degree,African American,Single,Male,North America,United States,Georgia,Alpharetta,...,Executive Level,High,15,1,No,7,2018-06-18,,0,75
3,1969,55,Associates Degree,Caucasian,Married,Female,Asia-Pacific,Singapore,Singapore,Singapore City,...,Senior Level,Medium,11,1,No,16,1997-09-04,,0,324
4,1997,27,Associates Degree,African,Married,Female,Middle East and Africa,South Africa,Johannesburg,Johannesburg City,...,Executive Level,Low,11,3,Yes,4,2016-07-30,,0,98
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,1970,54,Associates Degree,Asian,Married,Male,Europe,Europe,France,Puteaux,...,Senior Level,High,12,1,No,14,2004-12-05,2009-02-08,1,50
299996,1982,42,Bachelors Degree,Asian,Married,Female,Asia-Pacific,India,Karnataka,Bangalore,...,Senior Level,High,9,4,Yes,14,2014-08-01,2018-05-12,1,45
299997,1998,26,Doctoral Degree,African American,Married,Female,North America,Mexico,Jalisco,Guadalajara,...,Senior Level,Very High,7,3,No,14,2023-02-17,,0,19
299998,1964,60,Professional Degree,African,Married,Female,Middle East and Africa,South Africa,Johannesburg,Johannesburg City,...,Executive Level,Medium,15,3,No,14,2003-04-04,2005-01-29,1,21


In [9]:
# Modeling frame: the loaded data minus the columns listed here
Employee_df = Original_df.drop(
    columns=[
        "EMPLOYEE_ID",
        "JOB_STARTDATE",
        "JOB_ENDDATE",
        "AVERAGE_PERCENTAGE_SALARY_HIKE",
        "AVERAGE_PERFORMANCE_RATING",
        "SALARY_RANGE",
        "LATITUDE",
        "LONGITUDE",
    ]
)

In [19]:
# Column roles used by the preprocessing and modeling cells.

# Features routed to the categorical transformer
CATEGORICAL_COLUMNS = [
    "QUALIFICATION",
    "ETHNICITY",
    "MARITAL_STATUS",
    "GENDER",
    "CONTINENT",
    "COUNTRY",
    "STATE",
    "CITY",
    "ROLE",
    "LINE_OF_BUSINESS",
    "DELIVERY_UNIT",
    "PRACTICE_UNIT",
    "EMPLOYMENT_TYPE",
    "TURNOVER_REASONS",
    "SHIFT",
    "SALARY_LEVELS",
    "JOB_SATISFACTION",
    "OVER_TIME",
    "DISTANCE",
]

# Features routed to the numerical transformer
NUMERICAL_COLUMNS = [
    "BIRTH_YEAR",
    "AGE",
    "SALARY_INR",
    "OVERTIME_HOURS",
    "TENURE_MONTHS",
]

# Target column(s)
LABEL_COLUMNS = ["CHURN"]

# Columns excluded from the feature set
DROPPED_COLUMNS = [
    "EMPLOYEE_ID",
    "JOB_STARTDATE",
    "JOB_ENDDATE",
    "AVERAGE_PERCENTAGE_SALARY_HIKE",
    "AVERAGE_PERFORMANCE_RATING",
    "SALARY_RANGE",
    "LATITUDE",
    "LONGITUDE",
]

# Name of the column that holds model output
OUTPUT_COLUMNS = ["PREDICTION"]

In [20]:
# Bundle preprocessing for numerical and categorical data.
# The two transformers are created in this cell because the ColumnTransformer
# needs them immediately; cells further down also assign these names, which
# would be too late on a fresh top-to-bottom run (NameError here).
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, NUMERICAL_COLUMNS),
        ('cat', categorical_transformer, CATEGORICAL_COLUMNS),
    ]
)

In [21]:
# Step 4: Train-Test Split
# Features are the categorical columns followed by the numerical ones;
# the label frame is flattened to a 1-D array.
X = Employee_df[[*CATEGORICAL_COLUMNS, *NUMERICAL_COLUMNS]]
y = Employee_df[LABEL_COLUMNS].to_numpy().ravel()

# 80% train / 20% test with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [22]:
# Pipeline: shared preprocessor followed by an XGBClassifier with default settings.
# (Only constructed here; fitting happens in a later cell.)
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier()),
    ]
)

In [23]:
# Sanity check: shape of the training features and of the test labels
X_train.shape,y_test.shape

((240000, 24), (60000,))

In [24]:
# Preprocessing for numerical data: StandardScaler with its default settings
numerical_transformer = StandardScaler()

In [25]:
# Preprocessing for categorical data: one-hot encoding.
# handle_unknown='ignore' keeps transform from raising on categories that were
# not present when the encoder was fitted.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

In [11]:
# Keep only the feature / label names that exist in the loaded frame
feature_columns = [
    name
    for name in CATEGORICAL_COLUMNS + NUMERICAL_COLUMNS
    if name in Original_df.columns
]
LABEL_COLUMNS = [name for name in LABEL_COLUMNS if name in Original_df.columns]

# X carries the feature columns plus everything in DROPPED_COLUMNS;
# y is the label frame flattened to a 1-D array.
X = Original_df[feature_columns + DROPPED_COLUMNS]
y = Original_df[LABEL_COLUMNS].values.ravel()

In [16]:
from sklearn.model_selection import train_test_split
# Split the current X / y into 75% train and 25% test with a fixed seed;
# this rebinds X_train, X_test, y_train and y_test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [17]:
# Sanity check: shape of the training features and of the test labels
X_train.shape,y_test.shape

((225000, 32), (75000,))

In [18]:
# Fit the pipeline on the training split and predict the held-out split
y_pred = model.fit(X_train, y_train).predict(X_test)

# Fraction of test rows whose predicted label equals the true label
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.83192


In [None]:
# Print a summary of the test feature frame
X_test.info()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
 
# Categorical branch: fill gaps with the constant 'missing', then encode each
# category as an integer (categories unseen at fit time map to -1)
categorical_transformer = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='missing'),
    OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
)

# Numerical branch: mean-impute, then min-max scale with clipping enabled
numerical_transformer = make_pipeline(
    SimpleImputer(strategy='mean'),
    MinMaxScaler(clip=True),
)

# Route each column group to its branch
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, CATEGORICAL_COLUMNS),
        ('num', numerical_transformer, NUMERICAL_COLUMNS),
    ],
)

# Preprocessing followed by an XGBClassifier with default settings
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier()),
    ],
)

# Fit on the training split and predict the test split
pipeline.fit(X_train, y_train)
result = pipeline.predict(X_test)

In [None]:
# Show the predictions produced by pipeline.predict(X_test)
result

In [None]:
# Predicted labels and predicted probabilities for the test split.
# This rebinds y_pred, which previously held the predictions of `model`.
y_pred = pipeline.predict(X_test)
y_prob = pipeline.predict_proba(X_test)

In [None]:
from sklearn.metrics import classification_report, log_loss, roc_auc_score
 
# Sanity check: labels and predictions should have the same length
print("Length of y_test:", len(y_test))
print("Length of y_pred:", len(y_pred))

# Per-class precision / recall / F1 summary
print(classification_report(y_test, y_pred))

# Accuracy via sklearn's helper (as in the earlier evaluation cell) rather than
# a Python-level sum over a NumPy array
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Probability-based metrics
log_loss_value = log_loss(y_test, y_prob)
roc_auc = roc_auc_score(y_test, y_prob[:, 1])  # Assuming class 1 is the positive class

print("Log Loss:", log_loss_value)
print("ROC AUC Score:", roc_auc)

In [None]:
# Predict labels and probabilities for the test split
result = pipeline.predict(X_test)
result_prob = pipeline.predict_proba(X_test)

# Attach the predicted label and the second predict_proba column
# (class 1, assumed to be the positive class) to a copy of the test features
pred_df = X_test.copy()
pred_df["PREDICTION"] = result
pred_df["PROB"] = result_prob[:, 1]

In [None]:
from sklearn.metrics import roc_curve, precision_recall_curve, confusion_matrix
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot as plt
import seaborn as sns
# "No skill" baseline: the same constant score for every test sample
ns_probs = [0] * len(y_test)
ns_auc = roc_auc_score(y_test, ns_probs)
lr_auc = roc_auc_score(y_test, pred_df["PROB"])
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, pred_df["PROB"])
# plot the roc curve for the model
plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
# the probabilities come from the XGBClassifier pipeline, so label the curve accordingly
plt.plot(lr_fpr, lr_tpr, marker='.', label='XGBoost')
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# show the legend
plt.legend()
# show the plot
plt.show()

In [None]:
# Precision-recall curve for the predicted class-1 probabilities
lr_precision, lr_recall, _ = precision_recall_curve(y_test, pred_df["PROB"])
# the probabilities come from the XGBClassifier pipeline, so label the curve accordingly
plt.plot(lr_recall, lr_precision, marker='.', label='XGBoost')
# axis labels
plt.xlabel('Recall')
plt.ylabel('Precision')
# show the legend
plt.legend()
# show the plot
plt.show()

In [None]:
cm = confusion_matrix(y_test,y_pred)

# Plot the confusion matrix.
# sklearn's confusion_matrix puts the true labels on the rows and the predicted
# labels on the columns; the heatmap draws rows along y and columns along x,
# so y is "Actual" and x is "Prediction".
sns.heatmap(cm,
            annot=True,
            fmt='g',
            xticklabels=['Not Churn','Churn'],
            yticklabels=['Not Churn','Churn'])
plt.ylabel('Actual',fontsize=13)
plt.xlabel('Prediction',fontsize=13)
plt.title('Confusion Matrix',fontsize=17)
plt.show()