In [26]:
import joblib
import pandas as pd

In [39]:
# In the second notebook

from sklearn.metrics import classification_report

# Numeric encoding for the bucketed commute-distance labels in the raw data.
_DISTANCE_MILES = {
    '~10miles': 10,
    '~15miles': 15,
    '~20miles': 20,
    '~25miles': 25,
    '>30miles': 35,
    '<5mile': 5,
}


def _clean_raw_columns(df_X):
    """Return a copy of the raw frame with Distance/salary columns made numeric and gaps median-filled."""
    df_X = df_X.copy()

    # Only translate labels while the column is still text: mapping an
    # already-numeric column through the label dict would turn every value
    # into NaN. Labels missing from the dict become NaN and are imputed below.
    if not pd.api.types.is_numeric_dtype(df_X['Distance']):
        df_X['Distance'] = df_X['Distance'].map(_DISTANCE_MILES)

    # Text salaries have their "K" marker stripped before parsing as floats.
    # Testing for "not numeric" (instead of dtype == 'object') also covers
    # pandas' dedicated string dtypes, which would otherwise skip cleaning.
    for col in ('PreviousSalary', 'Salary'):
        if not pd.api.types.is_numeric_dtype(df_X[col]):
            df_X[col] = df_X[col].str.replace("K", "", regex=False).astype(float)

    # NOTE(review): the medians are computed from the batch being scored, not
    # from the training data -- confirm this matches how the model was trained.
    return df_X.fillna(df_X.median(numeric_only=True))


def _add_engineered_features(df_X):
    """Return the frame extended with the derived ratio/score columns."""
    projects_plus_one = df_X["NumOfProjects"] + 1  # +1 keeps the denominator non-zero at 0 projects
    years_plus_one = df_X["YearsWorked"] + 1       # same guard for employees with 0 years

    # "ReviewDifference" and "HighPayLowSatisfaction" are intentionally not
    # built: the previous version computed them and dropped them again before
    # prediction, so they could never reach the model.
    #
    # NOTE(review): SalaryGrowth is +/-inf (or NaN) when PreviousSalary is 0;
    # many estimators reject non-finite input -- confirm whether that can occur.
    return df_X.assign(
        SalaryGrowth=(df_X["Salary"] - df_X["PreviousSalary"]) / df_X["PreviousSalary"],
        WorkLifeProjectRatio=df_X["WorkLifeBalance"] / projects_plus_one,
        TrainingPerYear=df_X["TrainingHours"] / years_plus_one,
        EngagementPerProject=df_X["JobEngagementScore"] / projects_plus_one,
        HealthImpactScore=(df_X["PhysicalActivityScore"] + df_X["MentalWellbeingScore"]) / 2,
        JobSecurityScore=df_X["YearsWorked"] * df_X["NumOfProjects"],
        YearsWorkedZero=(df_X["YearsWorked"] == 0).astype(int),
    )


def production(X_path, y_path, model_path="final_model.pkl",
               features_path="final_model_features.pkl"):
    """Score the saved model on a production dataset and print a classification report.

    The raw features are cleaned, extended with the engineered columns, and
    reduced to the stored feature list (in that order) before prediction.

    Parameters
    ----------
    X_path : str
        Location of the raw feature CSV; handed to ``pd.read_csv``.
    y_path : str
        Location of the label CSV; its ``Left`` column is used as ground truth.
    model_path : str, optional
        joblib file holding the fitted model (anything exposing ``predict``).
    features_path : str, optional
        joblib file holding the ordered feature names the model expects.

    Raises
    ------
    KeyError
        If the prepared data lacks one or more of the stored feature names.
    """
    # joblib.load unpickles arbitrary Python objects -- only point these
    # parameters at trusted artifacts.
    feature_names = list(joblib.load(features_path))
    model = joblib.load(model_path)

    df_X = _add_engineered_features(_clean_raw_columns(pd.read_csv(X_path)))

    # Fail with an explicit list instead of pandas' generic "not in index".
    missing = [name for name in feature_names if name not in df_X.columns]
    if missing:
        raise KeyError(f"Production data is missing model features: {missing}")

    # Selecting by the stored list also enforces the column order.
    pred = model.predict(df_X[feature_names])

    df_y = pd.read_csv(y_path)['Left']
    print(classification_report(df_y, pred))



In [41]:
# Evaluate the saved model on the hosted production split (features + labels).
production(
    X_path='https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/final/employee_departure_dataset_X_prod.csv',
    y_path='https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/final/employee_departure_dataset_y_prod.csv',
)

              precision    recall  f1-score   support

           0       0.68      0.79      0.73     64044
           1       0.48      0.35      0.40     35956

    accuracy                           0.63    100000
   macro avg       0.58      0.57      0.57    100000
weighted avg       0.61      0.63      0.61    100000

