# Production Notebook

In [2]:
import pandas as pd
import joblib

# Load the saved pipeline
model = joblib.load(r"/Users/senakshikrishnamurthy/Desktop/Sena/Project/Employee Departure Prediction/final_model_pipeline.pkl")

# Inspect the pipeline structure
print(model.named_steps)

{'classifier': RandomForestClassifier(max_depth=20, n_estimators=200, random_state=42)}


In [3]:
# In the second notebook
from sklearn.metrics import classification_report, precision_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import joblib
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
import numpy as np



def production(X_path, y_path):
    # load model
    model = joblib.load(r"/Users/senakshikrishnamurthy/Desktop/Sena/Project/Employee Departure Prediction/final_model_pipeline.pkl")
    
    # load data
    df_X = pd.read_csv(X_path)
    df_y = pd.read_csv(y_path)['Left']
    # Preprocessing - match training preprocessing
    df_X['PreviousSalary'] = df_X['PreviousSalary'].str.replace('K', '').astype(float) * 1000
    df_X['Salary'] = df_X['Salary'].str.replace('K', '').astype(float) * 1000
    df_X['SalaryIncreasePercentage'] = ((df_X['Salary'] - df_X['PreviousSalary']) / df_X['PreviousSalary']) * 100
    df_X['SalaryIncreasePercentage'] = df_X['SalaryIncreasePercentage'].round(2)
    df_X['Distance'].fillna('Unknown', inplace=True)
    
    # Apply preprocessing
    # Define preprocessing (same as training)
    cat_cols = ['Distance']
    num_cols = ['Gender', 'YearsWorked', 'TrainingHours', 'WorkLifeBalance', 
            'NumOfProjects', 'JobInvolvement', 'TeamSize', 'MentorshipReceived', 
            'TechSkillLevel', 'AttendanceRate', 'StressLevel', 'PeerFeedbackScore', 
            'AnnualLeaveDays', 'Certifications', 'SkillDevelopmentCourses', 
            'ProjectComplexity', 'WorkSatisfactionScore', 'JobEngagementScore', 
            'PhysicalActivityScore', 'MentalWellbeingScore', 'DepartmentCode', 
            'PreviousSalary', 'Salary', 'SelfReview', 'SupervisorReview', 
            'SalaryIncreasePercentage']

    num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
    ])

    cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
     ])
    X_transformed = preprocessor.fit_transform(df_X)

    # Align columns with the training data
    expected_columns = model.named_steps['classifier'].n_features_in_
    if X_transformed.shape[1] > expected_columns:
        X_transformed = X_transformed[:, :expected_columns]  # Drop extra columns
    elif X_transformed.shape[1] < expected_columns:
        missing_cols = expected_columns - X_transformed.shape[1]
        X_transformed = np.hstack((X_transformed, np.zeros((X_transformed.shape[0], missing_cols))))  # Add missing columns

    
    # -------------------------
    # Split the Data into Train and Test Sets
    X_train, X_test, y_train, y_test = train_test_split(
        X_transformed, df_y, test_size=0.2, random_state=42, stratify=df_y
    )

    # Apply SMOTE to the Training Set Only
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    # -------------------------
    # Hyperparameter Tuning
    from sklearn.linear_model import LogisticRegression

    # Define hyperparameter distributions
    param_distributions = {
        'C': np.logspace(-3, 3, 7),  # Regularization strength
        'class_weight': [{0: 1, 1: w} for w in np.linspace(1, 5, 10)]
    }

    # RandomizedSearchCV for hyperparameter tuning
    random_search = RandomizedSearchCV(
        estimator=LogisticRegression(max_iter=500, random_state=42),
        param_distributions=param_distributions,
        n_iter=50,
        scoring='precision',
        random_state=42,
        n_jobs=-1
    )
    random_search.fit(X_train_resampled, y_train_resampled)

    # Retrieve the best model
    best_model = random_search.best_estimator_

    # -------------------------
    # Predictions and Evaluation
    y_pred = best_model.predict(X_test)
    y_pred_prob = best_model.predict_proba(X_test)[:, 1]

    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    
    # Compute and print weighted precision
    weighted_precision = precision_score(y_test, y_pred, average='weighted')
    print(f"Weighted Precision: {weighted_precision:.4f}")


# calling the function to test
production(
    X_path='https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/final/employee_departure_dataset_X_prod.csv',
    y_path='https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/final/employee_departure_dataset_y_prod.csv'
)
    


    

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_X['Distance'].fillna('Unknown', inplace=True)


Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.75      0.80     12809
           1       0.63      0.76      0.69      7191

    accuracy                           0.75     20000
   macro avg       0.74      0.76      0.74     20000
weighted avg       0.77      0.75      0.76     20000

Weighted Precision: 0.7707
