# Production Notebook
## Connor Bruce and Sujan Neupane

In [1]:
import joblib
import pandas as pd
import numpy as np
    
# Deserialize the saved estimator from a path relative to the working directory.
# The file name suggests a random forest (presumably already fitted — confirm).
# production() reads this module-level `model`, so this cell must run first.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code while
# loading — only open model files that come from a trusted source.
model = joblib.load('random_forest_model.pkl')

# Bare expression: shows the loaded object's repr as the cell output.
model

In [2]:
from sklearn.metrics import classification_report

def production(X_path, y_path):
    """Score a feature CSV with the notebook-level ``model`` and print a report.

    Parameters
    ----------
    X_path : str or path-like
        Location of the feature CSV, passed straight to ``pd.read_csv``.
        The file must contain ``Salary``, ``PreviousSalary``, ``SelfReview``,
        ``SupervisorReview``, ``YearsWorked`` and ``NumOfProjects``.
    y_path : str or path-like
        Location of the label CSV; only its ``Left`` column is read.

    Returns
    -------
    None
        The ``classification_report`` text is printed rather than returned.

    Notes
    -----
    ``model`` is resolved at module level, so it has to be loaded before this
    function is called.
    """
    features = pd.read_csv(X_path)

    # Narrow 64-bit numeric columns to 32-bit: integers first, then floats.
    for wide, narrow in (('int64', 'int32'), ('float64', 'float32')):
        wide_cols = features.select_dtypes(wide).columns
        features = features.astype({col: narrow for col in wide_cols})

    # Salary columns are text carrying a "K" marker: strip it, parse the
    # remainder as float and scale by 1000. This runs after the downcast
    # above, so these two columns keep the default float width.
    for money_col in ('Salary', 'PreviousSalary'):
        stripped = features[money_col].str.replace("K", '')
        features[money_col] = stripped.astype(float) * 1000

    # Engineered columns are appended one by one, in this order.
    self_score = features['SelfReview']
    supervisor_score = features['SupervisorReview']
    features['rating_diff'] = self_score - supervisor_score

    current_pay = features['Salary']
    features['salary_increase'] = current_pay / features['PreviousSalary']

    # A tenure of 0 is replaced by 1 so the per-year ratios never divide by zero.
    tenure = features['YearsWorked']
    features['YearsWorkedAdj'] = np.where(tenure == 0, 1, tenure)
    adjusted_tenure = features['YearsWorkedAdj']
    features['salary_per_year'] = current_pay / adjusted_tenure
    features['proj_per_year'] = features['NumOfProjects'] / adjusted_tenure

    # Predict before fetching the labels, then print the per-class metrics.
    predicted = model.predict(features)
    actual = pd.read_csv(y_path)['Left']
    print(classification_report(actual, predicted))

In [3]:
# Evaluate the loaded model on the hosted employee-departure dataset: X_path points
# at the feature CSV, y_path at the label CSV whose `Left` column is scored.
# The call fetches both files over the network and prints a classification report.
production( 
    X_path='https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/final/employee_departure_dataset_X.csv',
    y_path='https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/final/employee_departure_dataset_y.csv'
)

              precision    recall  f1-score   support

           0       0.81      0.79      0.80    319539
           1       0.64      0.68      0.66    180461

    accuracy                           0.75    500000
   macro avg       0.73      0.73      0.73    500000
weighted avg       0.75      0.75      0.75    500000

