In [None]:
!pip install --q imbalanced-learn
!pip install fosforml 
!pip install fosforio
!pip install seaborn

In [None]:
# Statistics and basic data support
from scipy.stats.mstats import winsorize
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Model training and tuning
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,precision_score, recall_score, f1_score,log_loss, roc_auc_score,roc_curve, precision_recall_curve
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Imbalance Handling
from imblearn.over_sampling import SMOTE

# Additional
from collections import Counter
import joblib

# Fosforml
from fosforml import register_model
from fosforml.constants import MLModelFlavours
from fosforml.model_manager.snowflakesession import get_session

# Warnings
import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

In [None]:
# Obtain the session used for all queries below.
# `get_session` is already imported in the imports cell at the top of the notebook,
# so it is not re-imported here.
my_session = get_session()

In [None]:
df = 'EMPLOYEE_DATASET'

In [None]:
sf_df = my_session.sql("select * from {}".format(df))

In [None]:
df = sf_df.to_pandas()

In [None]:
# Number of missing values in each column of the loaded data.
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
Original_df = df

In [None]:
Original_df

In [None]:
Employee_df = Original_df.drop(["EMPLOYEE_ID", "JOB_STARTDATE", "JOB_ENDDATE","AVERAGE_PERCENTAGE_SALARY_HIKE","AVERAGE_PERFORMANCE_RATING","SALARY_RANGE","LATITUDE","LONGITUDE"], axis = 1)

In [None]:
CATEGORICAL_COLUMNS = ["QUALIFICATION","ETHNICITY","MARITAL_STATUS","GENDER","CONTINENT","COUNTRY","STATE","CITY","ROLE","LINE_OF_BUSINESS","DELIVERY_UNIT",
                      "PRACTICE_UNIT","EMPLOYMENT_TYPE","TURNOVER_REASONS","SHIFT","SALARY_LEVELS","JOB_SATISFACTION","OVER_TIME","DISTANCE"]
NUMERICAL_COLUMNS = ["BIRTH_YEAR","AGE","SALARY_INR","OVERTIME_HOURS","TENURE_MONTHS"]
LABEL_COLUMNS = ["CHURN"]
DROPPED_COLUMNS = ["EMPLOYEE_ID", "JOB_STARTDATE", "JOB_ENDDATE","AVERAGE_PERCENTAGE_SALARY_HIKE","AVERAGE_PERFORMANCE_RATING","SALARY_RANGE","LATITUDE","LONGITUDE"]
OUTPUT_COLUMNS = ["PREDICTION"]

In [None]:
# Preprocessing for numerical data
numerical_transformer = StandardScaler()

In [None]:
# Preprocessing for categorical data
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

In [None]:
# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, NUMERICAL_COLUMNS),
        ('cat', categorical_transformer, CATEGORICAL_COLUMNS)
    ])

In [None]:
# Create and train the XGBClassifier model within a pipeline
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', XGBClassifier())])

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
 
# Define transformers
categorical_transformer = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='missing'),
    OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
)
 
numerical_transformer = make_pipeline(
    SimpleImputer(strategy='mean'),
    MinMaxScaler(clip=True)
)
 
# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, CATEGORICAL_COLUMNS),
        ('num', numerical_transformer, NUMERICAL_COLUMNS)
    ]
)
 
# Create pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier())
])

pipeline.fit(X_train, y_train)
result = pipeline.predict(X_test)

In [None]:
# Step 4: Train-Test Split
X = Employee_df[CATEGORICAL_COLUMNS + NUMERICAL_COLUMNS]
y = Employee_df[LABEL_COLUMNS].values.ravel()  # Ensure y is a 1D array

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
# Shape of the training features and of the held-out labels.
(X_train.shape, y_test.shape)

In [None]:
# Fit the baseline pipeline on the training split.
model.fit(X_train, y_train)

# Step 6: Model Evaluation
# Hard predictions and the predicted probability of the second class (column 1).
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Metrics on the held-out split.
conf_matrix = confusion_matrix(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
log_loss_value = log_loss(y_test, y_pred_proba)
roc_curve_data = roc_curve(y_test, y_pred_proba)

# Report, in the same order as before.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:")
print(conf_matrix)
for metric_label, metric_fn in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred))
print("ROC AUC Score:", roc_auc)
print("Log Loss:", log_loss_value)

In [None]:
from joblib import dump, load

# Persist the fitted pipeline to disk next to the notebook.
filename = "HR_Attrition.joblib"
dump(value=pipeline, filename=filename)

In [None]:
# Hold-out predictions of the imputing pipeline: labels and per-class probabilities.
y_pred, y_prob = pipeline.predict(X_test), pipeline.predict_proba(X_test)

In [None]:
# Score the held-out split and attach the outputs to a copy of its features:
# PREDICTION is the predicted label, PROB the probability of the second class.
result = pipeline.predict(X_test)
result_prob = pipeline.predict_proba(X_test)
pred_df = X_test.assign(PREDICTION=result, PROB=result_prob[:, 1])

In [None]:
# Evaluate the imputing pipeline on the held-out split, using the `y_pred` / `y_prob`
# produced by the prediction cell above. All metric functions used here come from
# the imports cell at the top of the notebook.

# Check lengths
print("Length of y_test:", len(y_test))
print("Length of y_pred:", len(y_pred))

# Print classification report
print(classification_report(y_test, y_pred))

# Calculate accuracy with the same helper the baseline evaluation uses, instead of
# a Python-level sum over the comparison array.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculate additional metrics
log_loss_value = log_loss(y_test, y_prob)
roc_auc = roc_auc_score(y_test, y_prob[:, 1])  # Assuming class 1 is the positive class

print("Log Loss:", log_loss_value)
print("ROC AUC Score:", roc_auc)

In [None]:
# ROC curve of the imputing XGBoost pipeline against a constant "no skill" scorer.
# roc_curve, roc_auc_score, plt and sns are provided by the imports cell at the top.

# A scorer that gives every sample the same score serves as the reference line.
ns_probs = [0] * len(y_test)
ns_auc = roc_auc_score(y_test, ns_probs)
lr_auc = roc_auc_score(y_test, pred_df["PROB"])
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, pred_df["PROB"])
# plot the roc curve for the model; the legend names the model that actually
# produced pred_df["PROB"] (the XGBClassifier pipeline) and shows each AUC
plt.plot(ns_fpr, ns_tpr, linestyle='--', label=f'No Skill (AUC = {ns_auc:.3f})')
plt.plot(lr_fpr, lr_tpr, marker='.', label=f'XGBoost (AUC = {lr_auc:.3f})')
# axis labels and title
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
# show the legend
plt.legend()
# show the plot
plt.show()

In [None]:
# Precision-recall curve for the probabilities stored in pred_df["PROB"], which
# come from the XGBClassifier pipeline (hence the legend label).
lr_precision, lr_recall, _ = precision_recall_curve(y_test, pred_df["PROB"])
plt.plot(lr_recall, lr_precision, marker='.', label='XGBoost')
# axis labels and title
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
# show the legend
plt.legend()
# show the plot
plt.show()

In [None]:
# Confusion matrix of the hold-out predictions.
# sklearn's confusion_matrix puts the actual classes on the rows and the predicted
# classes on the columns; seaborn draws rows along the y-axis and columns along the
# x-axis, so the y-axis is "Actual" and the x-axis is "Prediction".
cm = confusion_matrix(y_test,y_pred)

#Plot the confusion matrix.
# NOTE(review): the tick labels assume the sorted class order is
# (not churn, churn), e.g. 0/1 - confirm against the CHURN encoding.
sns.heatmap(cm,
            annot=True,
            fmt='g',
            xticklabels=['Not Churn','Churn'],
            yticklabels=['Not Churn','Churn'])
plt.ylabel('Actual',fontsize=13)
plt.xlabel('Prediction',fontsize=13)
plt.title('Confusion Matrix',fontsize=17)
plt.show()

In [None]:
# Scoring helper built on the notebook-level `pipeline`
def model_prediction_score_func(dataframe):
    """Return the predictions of the notebook-level ``pipeline`` for ``dataframe``.

    ``dataframe`` is handed to ``pipeline.predict`` unchanged, so it must already
    contain whatever input the pipeline expects.
    """
    predictions = pipeline.predict(dataframe)
    return predictions
 
# Score every row of Original_df and store the predictions in a new column.
model_output = model_prediction_score_func(Original_df)
Original_df['Model_Output'] = model_output