## Model Scoring

Write a function that loads the artifacts saved above, transforms a new dataset, and scores it.
Your function should return a Python list of labels, for example: [0,1,0,1,1,0,0]

Don't copy the code as-is; it is provided as an example only.
- Function `train_model` - you need to focus on model and encoder saving:
    ```
    pickle.dump(obj=clf, file=log_reg_file)
    pickle.dump(obj=cat_encoders, file=encoders_file)
    ```
- Function `project_1_scoring` - you should have a similar function with the name `project_1_scoring`. The function will:
    - Take a Pandas DataFrame as its parameter
    - Load the model and all needed encoders
    - Perform the needed manipulations on the input DataFrame, which is in exactly the same format as the project's input file, minus the `MIS_Status` feature
    - Return probabilities as a numpy array or a Python list

In [1]:

def project_1_scoring(data):
    """
    Score a loan dataset with the saved model and preprocessing artifacts.

    Input: dataset in Pandas DataFrame format, laid out like the project's
        input file. The 'MIS_Status' label column is optional and is ignored
        when present. The caller's DataFrame is not modified.
    Output: whatever the unpickled model's ``predict`` returns for the
        transformed rows, in the same order as the input records.

    Flow:
        - Clean a private copy of the dataset (missing values, '$' amounts)
        - Add the engineered loan features
        - Load artifacts
        - Transform dataset (categorical encoders, numeric scaler)
        - Score dataset
        - Return labels

    Raises:
        KeyError: a column required by the pipeline (or by the model's
            recorded feature names) is not present in ``data``.
        ValueError: a currency cell cannot be parsed as a number.
        OSError: an artifact file cannot be opened.
    """
    # The encoder/scaler/model classes are imported by pickle itself while
    # loading, so no explicit third-party imports are needed here.
    import pickle

    import numpy as np
    import pandas as pd

    label_col = 'MIS_Status'
    currency_cols = ['DisbursementGross', 'BalanceGross', 'GrAppv', 'SBA_Appv']
    scaler_cols = ['Zip', 'NAICS', 'Term', 'NoEmp', 'NewExist', 'CreateJob', 'RetainedJob',
                   'FranchiseCode', 'UrbanRural', 'DisbursementGross', 'BalanceGross', 'GrAppv',
                   'SBA_Appv', 'LoanDisbursedPerCity', 'LoanPaid']
    target_cols = ['City', 'State', 'Bank', 'BankState', 'RevLineCr', 'LowDoc']

    model_filename = './artifacts/finalized_model.pkl'
    encoder_filename = './artifacts/cat_Encoders.pkl'
    scaler_filename = './artifacts/scaler_Encoders.pkl'

    # Work on a private copy so the caller's frame is left untouched, and
    # tolerate input that (as specified) arrives without the label column.
    X = data.copy()
    X = X.drop(columns=[label_col], errors='ignore')

    # Replace Na/Null values: "Missing" for text columns, 0 for the rest.
    # Currency columns are handled separately below because a "Missing"
    # placeholder could never be converted to float.
    fill_values = {}
    for col in X.columns:
        if col in currency_cols:
            continue
        is_text = (pd.api.types.is_object_dtype(X[col])
                   or pd.api.types.is_string_dtype(X[col]))
        fill_values[col] = "Missing" if is_text else 0
    X = X.fillna(value=fill_values)

    # Convert strings styled as '$X,XXX.XX ' to float values.
    # NOTE(review): missing amounts are scored as 0, mirroring the numeric
    # fill above - confirm this matches how the training data was prepared.
    for col in currency_cols:
        amounts = X[col]
        if pd.api.types.is_numeric_dtype(amounts):
            X[col] = amounts.astype(float).fillna(0)
            continue
        absent = amounts.isna()
        cleaned = amounts.astype(str).str.replace(r'[ ,$]', '', regex=True)
        X[col] = cleaned.where(~absent, '0').astype(float)

    # Engineered features.
    X['LoanDisbursedPerCity'] = X.groupby('City')['DisbursementGross'].transform('sum')
    X['LoanPaid'] = X['DisbursementGross'] - X['BalanceGross']

    # Load model and encoders. Context managers close the files even when
    # unpickling fails.
    # SECURITY: pickle executes arbitrary code on load - only point these
    # paths at artifacts you produced yourself.
    with open(model_filename, "rb") as model_file:
        dtc = pickle.load(file=model_file)
    with open(encoder_filename, "rb") as encoders_file:
        enc_dict = pickle.load(file=encoders_file)
    with open(scaler_filename, "rb") as scaler_file:
        scaler_dtc = pickle.load(file=scaler_file)

    # Encode categorical columns; each dict entry holds its fitted encoder
    # at position 0.
    for col in target_cols:
        enc = enc_dict[col][0]
        X[col + '_trg'] = enc.transform(X[[col]])
    X = X.drop(columns=target_cols)

    # Scale numerical columns and swap them for their '<name>_sc' versions.
    scaled_names = [col + "_sc" for col in scaler_cols]
    scaled = pd.DataFrame(np.asarray(scaler_dtc.transform(X[scaler_cols])),
                          index=X.index, columns=scaled_names)
    X = pd.concat([X.drop(columns=scaler_cols), scaled], axis=1)

    # Feed the model a deterministic column order: the order recorded at fit
    # time when the model exposes it, otherwise the frame's own order. (A set
    # of column names has no stable order and is rejected by current pandas.)
    feature_order = getattr(dtc, 'feature_names_in_', None)
    features = X if feature_order is None else X[list(feature_order)]

    return dtc.predict(features)

In [2]:
import pandas as pd

# Hold-out set used to try the scoring function; the path is specific to the
# author's machine.
HOLD_OUT_CSV = r'C:\Users\General\Documents\UTD\Semester 3\Applied Machine Learning\Projects\SBA_loans_project_1\X_test_utkarsha.csv'
data_hold_out = pd.read_csv(HOLD_OUT_CSV)

In [3]:
import warnings

# Silence every warning for the rest of the session, then score the
# hold-out frame loaded above.
warnings.simplefilter('ignore')
ans = project_1_scoring(data_hold_out)

In [4]:
ans

array([0, 0, 0, ..., 0, 0, 0])