# ***Importing the required packages and splitting the dataset into test and train to demonstrate the train_model and project_1_scoring functions***

In [92]:
#importing pandas package
import pandas as pd
#reading the data
df = pd.read_csv("/content/SBA_loans_project_1.csv")

df_v1 = df.copy() #maintaining a copy of the dataframe

#splitting the dataset into test and train to demonstrate the below functions
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.2)


# ***train_model function***

In [87]:
#A Function which trains the model and returns the trained model path
import pandas as pd
import numpy as np
def train_model(data):
    """
    Train sample model and save artifacts
    """
    #importing and installing the necessary packages
    !pip install category_encoders
    !pip install h2o
    !pip install dill
    from copy import deepcopy
    from sklearn.linear_model import LogisticRegression
    import pickle
    import dill as pickle
    import category_encoders as ce
    from sklearn.preprocessing import OneHotEncoder
    
    # Handling the missing values for string type columns
    Handling_string_cols_missing = ['City','State','Bank','BankState','RevLineCr','LowDoc']
    for col in Handling_string_cols_missing:
      data[col].fillna("Missing",inplace=True)
    
    #Handling Missing values in numerical columns-dropping those values
    data.dropna(how='any',inplace=True)

    # some data transformations
    def df_manipulations(df):
      #encoding the target column as a binary variable
      MIS_Status_encoding = {"MIS_Status":{'P I F':0, 'CHGOFF': 1}}
      df.replace(MIS_Status_encoding,inplace=True)
      df['MIS_Status'] = df['MIS_Status'].astype(int)
      #changing the datatype of zip
      df['Zip'] = df['Zip'].astype(str)
      #df transformations
      #Converting the currency columns to float
      df['DisbursementGross'] = df['DisbursementGross'].str.replace("'", '').str.replace('$', '').str.replace(",", '').astype(float)
      df['BalanceGross'] = df['BalanceGross'].str.replace("'", '').str.replace('$', '').str.replace(",", '').astype(float)
      df['GrAppv'] = df['GrAppv'].str.replace("'", '').str.replace('$', '').str.replace(",", '').astype(float)
      df['SBA_Appv'] = df['SBA_Appv'].str.replace("'", '').str.replace('$', '').str.replace(",", '').astype(float)
      #dropping the index column
      df.drop('index',inplace=True,axis=1)
      return df
    data = df_manipulations(data)

    #one hot encoding the categorical columns
    ohe_columns = ['LowDoc','NewExist','UrbanRural']
    #one hot encoding the categorical columns
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False) # New in version 1.2: sparse was renamed to sparse_output
    data_ohe = ohe.fit_transform(data[ohe_columns])
    cols_ohe = ohe.get_feature_names_out()
    data_ohe = pd.DataFrame(data_ohe, columns=cols_ohe,index=data.index)
    data.drop(columns=ohe_columns,inplace=True)
    data = pd.concat([data,data_ohe],axis=1)

    # target column
    target_col = 'MIS_Status'
    y = data[target_col]
    
    #woe encoding the categorical variables
    woe_encoder = ce.woe.WOEEncoder()
    #fitting the encoding to the train dataset
    to_be_woe_transformed_cols = ['City','State','Zip','Bank','BankState','RevLineCr']
    woe_cols = ['City_woe','State_woe','Zip_woe','Bank_woe','BankState_woe','RevLineCr_woe']
    woe_encoder.fit(data[to_be_woe_transformed_cols], y)
    #transforming the training dataset
    data[woe_cols] =woe_encoder.transform(data[to_be_woe_transformed_cols])

    #preparing final feature dataframe
    final_features_cols = ['City_woe', 'State_woe', 'Zip_woe', 'Bank_woe', 'BankState_woe', 'NAICS', 'Term', 'NoEmp',
       'NewExist_0.0','NewExist_1.0','NewExist_2.0','UrbanRural_0','UrbanRural_1','UrbanRural_2','CreateJob', 'RetainedJob', 'FranchiseCode',
       'RevLineCr_woe','DisbursementGross', 'BalanceGross', 'GrAppv',
       'SBA_Appv','LowDoc_0','LowDoc_1', 'LowDoc_A', 'LowDoc_C', 'LowDoc_Missing', 'LowDoc_N',
       'LowDoc_R', 'LowDoc_S', 'LowDoc_Y','MIS_Status']
    data_final = data[final_features_cols]
    
    #H2O part--------------------------------------------------------------
    #intializing the h2o cluster
    import h2o
    from h2o.estimators.glm import H2OGeneralizedLinearEstimator
    h2o.init(max_mem_size = "14G")             #specify max number of bytes. uses all cores by default.
    h2o.remove_all()  
    
    #saving the column names into a variable
    col_list = list(data_final.columns)
    #converting the pandas dataframe to H2o frame
    train_h2o = h2o.H2OFrame(data_final,column_names = col_list)

    predictors = train_h2o.col_names[:-1]     #last column is whethere loan is approved or not
    response = target_col

    ## For binary classification, response should be a factor
    train_h2o[response] = train_h2o[response].asfactor()

    ##adding engineered features
    def cut_column(train_df, train,col,n_bins):
    
      only_col= train_df[col]                            #Isolate the column in question from the training frame
      counts, breaks = np.histogram(only_col, bins=n_bins)   #Generate counts and breaks for our histogram
      min_val = min(only_col)-1                          #Establish min and max values
      max_val = max(only_col)+1
      new_b = [min_val]                                  #Redefine breaks such that each bucket has enough support
      for i in range(n_bins-1):
          if counts[i] > 1000 and counts[i+1] > 1000:
              new_b.append(breaks[i+1])
      new_b.append(max_val)
      names = [col + '_' + str(x) for x in range(len(new_b)-1)]  #Generate names for buckets, these will be categorical names
      train[col+"_cut"] = train[col].cut(breaks=new_b, labels=names)

    def add_features(train):
      #pull train dataset into Python
      train_df = train.as_data_frame(True)
      #getting the first two digits of NAICS code - which define the industries.
      train['NAICS_sector'] = train['NAICS'] // 10000
      #Make categoricals for several columns
      cut_column(train_df, train,"Term",50)
      cut_column(train_df, train,"NoEmp",10)
      cut_column(train_df, train,"CreateJob",10)
      cut_column(train_df, train,"RetainedJob",10)
      cut_column(train_df, train,"DisbursementGross",100)
      cut_column(train_df, train,"GrAppv",100)
      cut_column(train_df, train, "SBA_Appv",100)
      #Add interaction columns for a subset of columns
      interaction_cols1 = [
                          "Term_cut",
                          "NoEmp_cut",
                          "CreateJob_cut",
                          "RetainedJob_cut",
                          "DisbursementGross_cut"
                    ]
      train_cols = train.interaction(factors=interaction_cols1,    #Generate pairwise columns
                                    pairwise=True,
                                    max_factors=1000,
                                    min_occurrence=100,
                                    destination_frame="itrain")
      
      train = train.cbind(train_cols)                              #Append pairwise columns to H2OFrames
      
      return train
  
    train_h2o_v1  = add_features(train_h2o)
    ##train the model with the best parameters
    glm = H2OGeneralizedLinearEstimator(family='binomial',alpha=0.68,lambda_ = 4.557E-4,seed=123,standardize=True)
    glm.train(x = predictors, y = response, training_frame = train_h2o_v1)

    ##threshold for max F1
    threshold = 0.3001702 ##took the threshold where validation data had max f1

    
    # save the model
    model_path = h2o.save_model(model=glm, path="/content/sample_data/artifacts/mymodel", force=True)
    #shutting down the cluster
    h2o.cluster().shutdown()

    #opening a pickle file and saving the artifacts dict
    artifacts_dict_file = open("/content/sample_data/artifacts/artifacts_dict_file.pkl", "wb")
    artifacts_dict = {
        "target_col":target_col,
        "model_path": model_path,  
        "threshold": threshold,
        "ohe_cols":ohe_columns,
        "woe_encoder":woe_encoder,
        "Handling_missing_cols": Handling_string_cols_missing,
        "feature_cols":final_features_cols,
        "woe_columns_to_be_transformed":to_be_woe_transformed_cols,
        "woe_columns":woe_cols,
        "OneHotEncoder":ohe
        
       }
    #dumping the dictionary
    pickle.dump(obj=artifacts_dict, file=artifacts_dict_file)
    #closing the pickle file
    artifacts_dict_file.close() 

    #opening a new pickle file for functions as the previous pickle file can't serialize functions and as well as artifacts_dict in it
    functions_pkl_file =  open("/content/sample_data/artifacts/functions.pkl", "wb")
    pickle.dump(obj=df_manipulations,file=functions_pkl_file)
    pickle.dump(obj=add_features,file=functions_pkl_file)  
    functions_pkl_file.close()
 
    return model_path


# ***train_model function demonstration with training dataset***

In [88]:
#demonstrating the train_model function
model_path_v1 = train_model(train)
model_path_v1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


  df['DisbursementGross'] = df['DisbursementGross'].str.replace("'", '').str.replace('$', '').str.replace(",", '').astype(float)
  df['BalanceGross'] = df['BalanceGross'].str.replace("'", '').str.replace('$', '').str.replace(",", '').astype(float)
  df['GrAppv'] = df['GrAppv'].str.replace("'", '').str.replace('$', '').str.replace(",", '').astype(float)
  df['SBA_Appv'] = df['SBA_Appv'].str.replace("'", '').str.replace('$', '').str.replace(",", '').astype(float)


Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.18" 2023-01-17; OpenJDK Runtime Environment (build 11.0.18+10-post-Ubuntu-0ubuntu120.04.1); OpenJDK 64-Bit Server VM (build 11.0.18+10-post-Ubuntu-0ubuntu120.04.1, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.9/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmptqkjxsga
  JVM stdout: /tmp/tmptqkjxsga/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmptqkjxsga/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.40.0.2
H2O_cluster_version_age:,24 days
H2O_cluster_name:,H2O_from_python_unknownUser_m8v5p7
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,14 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Interactions progress: |█████████████████████████████████████████████████████████| (done) 100%
glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
H2O session _sid_90f0 closed.


'/content/sample_data/artifacts/mymodel/GLM_model_python_1680470792403_1'

# ***project_1_scoring function***

In [91]:
def project_1_scoring(data):
    """
    Function to score input dataset.
    
    Input: dataset in Pandas DataFrame format
    Output: Python list of labels in the same order as input records
    
    Flow:
        - Load artifacts
        - Transform dataset
        - Score dataset
        - Return labels
    
    """
    #installing the packages and importing them
    !pip install category_encoders
    !pip install h2o
    !pip install dill
    from copy import deepcopy
    from sklearn.linear_model import LogisticRegression
    import pickle
    import category_encoders as ce
    import dill as pickle
    
    '''Load Artifacts'''
    artifacts_dict_file = open("/content/sample_data/artifacts/artifacts_dict_file.pkl", "rb")
    artifacts_dict = pickle.load(file=artifacts_dict_file)
    artifacts_dict_file.close()

    
    #saving all the necessary variables from artifacts file
    target_col = artifacts_dict['target_col']
    model_path = artifacts_dict['model_path']
    threshold = artifacts_dict['threshold']
    ohe_columns = artifacts_dict['ohe_cols']
    woe_encoder = artifacts_dict['woe_encoder']
    Handling_string_cols_missing = artifacts_dict['Handling_missing_cols']
    final_features_cols = artifacts_dict['feature_cols']
    to_be_woe_transformed_cols = artifacts_dict['woe_columns_to_be_transformed']
    woe_cols = artifacts_dict['woe_columns']
    ohe = artifacts_dict['OneHotEncoder']
    
    # Handling the missing values for string type columns #Replacing the nulls in categorical columns with 'MISSING'
    for col in Handling_string_cols_missing:
      data[col].fillna("Missing",inplace=True)
  
    #Handling Missing values in numerical columns-dropping those values
    data.dropna(how='any',inplace=True)

    #copying original data into a pandas dataframe
    data_original = data.copy()
    original_data_columns = list(data_original.columns)

    '''Load functions pickle file'''
    #some data transformations using df_manipulations function
    functions_pkl_file =  open("/content/sample_data/artifacts/functions.pkl", "rb")
    data = pickle.load(file=functions_pkl_file)(data)
    
    #One hot encoding the categorical variables
    data_ohe = ohe.transform(data[ohe_columns])
    cols_ohe = ohe.get_feature_names_out()
    data_ohe = pd.DataFrame(data_ohe, columns=cols_ohe,index=data.index)
    data.drop(columns=ohe_columns,inplace=True)
    data = pd.concat([data,data_ohe],axis=1)

    # target column
    y = data[target_col]
    
    #woe encoding the categorical variables
    data[woe_cols] =woe_encoder.transform(data[to_be_woe_transformed_cols])

    #preparing the final dataframe
    data_final = data[final_features_cols]
    
    #H2O part--------------------------------------------------------------
    #initializing the H2o cluster
    import h2o
    from h2o.estimators.glm import H2OGeneralizedLinearEstimator
    h2o.init(max_mem_size = "14G")             #specify max number of bytes. uses all cores by default.
    h2o.remove_all()  
    
    #saving the column names in a variable
    col_list = list(data_final.columns)
    #converting the pandas dataframe to h2o frame
    test_h2o = h2o.H2OFrame(data_final,column_names = col_list)

    #adding the additional features

    #last column is whethere loan is approved or not
    predictors = test_h2o.col_names[:-1]     
    response = target_col

    ## For binary classification, response should be a factor
    test_h2o[response] = test_h2o[response].asfactor()

    ##adding the features
    test_h2o_v1 = pickle.load(file=functions_pkl_file)(test_h2o)
    functions_pkl_file.close()

    #loading the trained model
    glm = h2o.load_model(model_path)

    #converting the original dataframe to pandas dataframe to concat it with the predicted probabilities
    original_data_h2o = h2o.H2OFrame(data_original,column_names = original_data_columns) 

    #getting the predicted probabilities  
    y_pred_proba = glm.predict(test_h2o_v1)

    #concating the original frame with the predictions frame
    results = original_data_h2o.cbind(y_pred_proba)

    #coverting the resulting dataframe to pandas dataframe
    results1 = results.as_data_frame()

    #renaming the columns
    some_dict = {'predict': 'predicted_class',
                 'p0': 'probability_for_class_0_(PIF)',
                 'p1': 'probability_for_class_1_(CHGOFF)'}
    results1.rename(columns=some_dict, inplace=True)

    #selecting the relevant columns
    results2 = results1[['index','predicted_class','probability_for_class_0_(PIF)','probability_for_class_1_(CHGOFF)']]

    #setting the index variable as index
    results2.set_index('index', inplace=True)

    return results2

# ***project_1_scoring function demonstration with test data ---> this returns the prediction dataframe***

In [93]:
predicted_class =project_1_scoring(test)
predicted_class

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


  df['DisbursementGross'] = df['DisbursementGross'].str.replace("'", '').str.replace('$', '').str.replace(",", '').astype(float)
  df['BalanceGross'] = df['BalanceGross'].str.replace("'", '').str.replace('$', '').str.replace(",", '').astype(float)
  df['GrAppv'] = df['GrAppv'].str.replace("'", '').str.replace('$', '').str.replace(",", '').astype(float)
  df['SBA_Appv'] = df['SBA_Appv'].str.replace("'", '').str.replace('$', '').str.replace(",", '').astype(float)


Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,1 min 20 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.40.0.2
H2O_cluster_version_age:,24 days
H2O_cluster_name:,H2O_from_python_unknownUser_lgrq0f
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,13.99 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Interactions progress: |█████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


Unnamed: 0_level_0,predicted_class,probability_for_class_0_(PIF),probability_for_class_1_(CHGOFF)
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
597590,0,0.698183,0.301817
540332,0,0.915363,0.084637
587189,0,0.850192,0.149808
631715,0,0.852126,0.147874
666323,0,0.866587,0.133413
...,...,...,...
802165,0,0.915846,0.084154
709022,0,0.983376,0.016624
706947,0,0.999302,0.000698
653212,0,0.917970,0.082030
