## Model Deployment

In [1]:
# Author information
__author__ = "Troy Reynolds"
__email__ = "Troy.Lloyd.Reynolds@gmail.com"

In [2]:
# libraries
import pandas as pd
import numpy as np
import sys
import os
import joblib
import inspect

# Extend the directory to get created functions
sys.path.insert(0, "./function_scripts")
sys.path.insert(0, "./model")
sys.path.insert(0, "./results")

# import helper functions
from data_import_functions import get_data
from Preprocessing import cv_mse_stats
from results import display_search_results
from Deployment_helper import deployment_pipeline

In [3]:
# Read the training and test sets through the shared import helper
train_data = get_data("train", key="jobId", target_variable="salary", remove_zeros=True)
test_data = get_data("test", key="jobId", target_variable="salary", remove_zeros=True)

# Split the training set into the salary target and the model inputs;
# the identifier columns (jobId, companyId) and the target itself are
# removed from the inputs
target = train_data["salary"]
features = train_data.drop(columns=["jobId", "companyId", "salary"])

# Preview of test
test_data.head()

Unnamed: 0,jobId,companyId,jobType,degree,major,industry,yearsExperience,milesFromMetropolis
0,JOB1362685407687,COMP33,MANAGER,HIGH_SCHOOL,NONE,HEALTH,22,73
1,JOB1362685407688,COMP13,JUNIOR,NONE,NONE,AUTO,20,47
2,JOB1362685407689,COMP10,CTO,MASTERS,BIOLOGY,HEALTH,17,9
3,JOB1362685407690,COMP21,MANAGER,HIGH_SCHOOL,NONE,OIL,14,96
4,JOB1362685407691,COMP36,JUNIOR,DOCTORAL,BIOLOGY,OIL,10,44


## Production Pipeline
For the model to be useful, it needs to be immediately ready to be put into production. A pipeline function was created that takes raw inputs in the same format as the training data, processes them using the best preprocessing pipeline, and generates predictions using the best model found in the previous notebook. Some of the predictions are presented below. Furthermore, the function stores the predictions as a CSV file in the results folder with a timestamp in the filename.

In [4]:
# Print the source code of deployment_pipeline so its steps can be read inline
print(inspect.getsource(deployment_pipeline))

def deployment_pipeline(data):
    """
    Pipeline that takes raw data, processes it, and generates predictions
    using the previously found best model. The best model can be updated
    by replacing the best model file.
    
    Parameters: 
    data: DataFrame
        raw data
        
    Returns:
    results: DataFrame [jobId, predicted_salary]
    """
    # copy features to preserve data integrety
    features = data.copy()
    
    # save Id
    jobId = features["jobId"]
    
    # drop unnecessary features
    selected_features = features.drop(["jobId", "companyId"], axis = 1)
    
    # get pretrained pipeline
    processed_features = preprocess_pipeline(selected_features)
    
    # get predictions
    predictions = model_predict(processed_features)

    # combine jobId with predicted salary    
    predictions_with_id = pd.DataFrame({"jobId": jobId, 
                                        "predicted_salary": predictions})
    
    # save predictions as a csv file
    save

In [5]:
# Generate predictions for the test data and display the result.
# As described above, the call also writes the predictions to a
# timestamped CSV file in the results folder.
predictions = deployment_pipeline(test_data)
predictions

Saved to .\results\Salary_Predictions_2021-09-18--13-13-51.csv


Unnamed: 0,jobId,predicted_salary
0,JOB1362685407687,111.398280
1,JOB1362685407688,92.827118
2,JOB1362685407689,183.198284
3,JOB1362685407690,103.900301
4,JOB1362685407691,116.060492
...,...,...
999995,JOB1362686407682,167.595241
999996,JOB1362686407683,106.741677
999997,JOB1362686407684,55.195601
999998,JOB1362686407685,161.261267
