In [5]:
def project_2_scoring(df):
    """Score loan records with the saved H2O model and return its predictions.

    The function loads the fitted preprocessing artifacts from
    ``../artifacts/artifacts_project_2.pkl``, cleans and featurizes a *copy* of
    ``df`` (the caller's frame is left untouched), applies the stored
    categorical encoders and bin edges, and scores the result with the H2O
    model saved under ``../artifacts/best_model_project_2``.

    Side effects: calls ``h2o.init``, removes all objects from the H2O cluster,
    shuts the cluster down when scoring finishes (also when scoring fails), and
    writes ``result_to_check_df.csv`` (columns ``ID`` and ``probability_1``) to
    the current working directory.

    Parameters
    ----------
    df : pandas.DataFrame
        Records to score. The code reads the columns ``index``, ``Zip``,
        ``City``, ``Bank``, ``BankState``, ``State``, ``FranchiseCode``,
        ``NewExist``, ``NAICS``, ``UrbanRural``, ``RevLineCr``,
        ``DisbursementGross``, ``GrAppv``, ``SBA_Appv``, ``NoEmp`` and
        ``RetainedJob``, plus whichever columns the loaded artifacts name.

    Returns
    -------
    pandas.DataFrame
        One row per input row with the columns ``index``, ``label``,
        ``probability_0`` and ``probability_1``.

    Raises
    ------
    KeyError
        If a required column or artifact key is missing.
    """
    import pickle
    import re

    import h2o
    import pandas as pd

    # Load the fitted preprocessing artifacts. The context manager closes the
    # file even if unpickling fails.
    # NOTE: pickle can execute arbitrary code while loading; only point this at
    # an artifact file you trust.
    with open("../artifacts/artifacts_project_2.pkl", "rb") as artifacts_dict_file:
        artifacts_dict = pickle.load(artifacts_dict_file)

    categorical_encoders = artifacts_dict["categorical_encoders"]
    to_categorical = artifacts_dict["to_categorical"]
    numerical_cols_to_cat = artifacts_dict["numerical_cols_to_cat"]
    columns_to_trg_encode = artifacts_dict["columns_to_trg_encode"]
    bin_edges_dict = artifacts_dict["bin_edges_dict"]

    # Columns converted to H2O factors before scoring.
    # NOTE(review): the artifacts also carry "numerical_to_categorical" and
    # "h2o_threshold". The original scoring path loaded both but used neither
    # (this hard-coded list shadowed the former; the latter was never applied,
    # so `label` is whatever the model's own predict() returns). Confirm that
    # this is intended.
    numerical_to_categorical = ['Zip', 'NAICS', 'NewExist', 'FranchiseCode', 'UrbanRural', 'RevLineCr', 'State_BankState_Same']

    # Work on a copy so the caller's DataFrame is not modified by the in-place
    # column assignments below.
    df = df.copy()

    def preprocess_data(df):
        """Normalise text columns, recode flags and fill missing values."""

        # Convert Zip to string and left-pad with zeros to five characters.
        df['Zip'] = df['Zip'].astype(str).str.zfill(5)

        # Convert City and Bank names to uppercase for uniformity.
        df['City'] = df['City'].str.upper().str.strip()
        df['Bank'] = df['Bank'].str.upper().str.strip()

        def remove_non_alphabetic(text):
            """Replace every non-letter character with a space; pass non-strings through."""
            if isinstance(text, str):
                return re.sub(r'[^a-zA-Z]', ' ', text)
            return text

        df['City'] = df['City'].apply(remove_non_alphabetic)

        def custom_encode(value):
            """Map FranchiseCode to 0, 1, or 2 (2 for every other value)."""
            if value == 1:
                return 1
            elif value == 0:
                return 0
            else:
                return 2

        df['FranchiseCode'] = df['FranchiseCode'].apply(custom_encode)

        # Recode 'NewExist': 0 and 1 become 0, 2 becomes 1. The order matters:
        # the second assignment would otherwise be re-mapped by the first.
        df.loc[(df['NewExist'] == 0) | (df['NewExist'] == 1), 'NewExist'] = 0
        df.loc[(df['NewExist'] == 2), 'NewExist'] = 1

        # Fill missing values in numerical columns with 0.
        numerical_cols = df.select_dtypes(include=['number']).columns
        df[numerical_cols] = df[numerical_cols].fillna(0)

        # Fill missing values in object (string) columns with 'Missing'.
        categorical_cols = df.select_dtypes(include=['object']).columns
        df[categorical_cols] = df[categorical_cols].fillna('Missing')

        return df

    df = preprocess_data(df)

    for cols in to_categorical:
        df[cols] = df[cols].astype('object')

    def featurize_data(df):
        """Add the combined-key, zip-prefix and difference features."""

        # Identify each state-wise entity of the bank.
        df['Bank_BankState'] = df['Bank']+'_'+df['BankState']

        # 1 if 'State' and 'BankState' are the same, otherwise 0.
        df['State_BankState_Same'] = (df['State'] == df['BankState']).astype(int)

        # Identify each city state-wise.
        df['City_State'] = df['City']+'_'+df['State']

        # First three and first four characters of the zero-padded Zip.
        df['Zip3'] = df['Zip'].str[:3]
        df['Zip4'] = df['Zip'].str[:4]

        # Amount disbursed beyond the approved amount.
        df['DisbursementGross_minus_GrAppv'] = df['DisbursementGross']-df['GrAppv']

        # Amount disbursed beyond the SBA-approved amount.
        df['DisbursementGross_minus_SBA_Appv'] = df['DisbursementGross']-df['SBA_Appv']

        # Difference between the total approved and the SBA-approved amount.
        df['GrAppv_minus_SBA_Appv'] = df['GrAppv']-df['SBA_Appv']

        # Difference between the employee count and the retained jobs.
        df['NoEmp_minus_RetainedJob'] = df['NoEmp']-df['RetainedJob']

        return df

    df = featurize_data(df)

    # Encode the categorical columns with the fitted encoders from the artifacts.
    for col in columns_to_trg_encode:
        woe_encoder = categorical_encoders[(col, "trg")]
        df[col + '_trg'] = woe_encoder.transform(df[col])

    def bin_numerical_columns(numerical_cols, df, bins):
        """Add a ``<col>_binned`` integer column for each column, using the stored edges."""
        for col in numerical_cols:
            bin_edges = bin_edges_dict[(col, "bin_edge")]

            # pd.cut yields NaN for values outside the stored edges.
            binned = pd.cut(df[col], bins=bin_edges, labels=False, include_lowest=True)

            # Out-of-range values (NaN after the cut) are assigned bin index `bins - 1`.
            df[f'{col}_binned'] = binned.fillna(bins - 1).astype(int)

    bin_numerical_columns(numerical_cols_to_cat, df, bins=50)

    h2o.init(max_mem_size="4G")
    try:
        h2o.remove_all()

        h2o_df = h2o.H2OFrame(df)
        for cols in numerical_to_categorical:
            h2o_df[cols] = h2o_df[cols].asfactor()

        model_path = "../artifacts/best_model_project_2/tuning_grid_model_51"
        h2o_best_model = h2o.load_model(model_path)

        predictions = h2o_best_model.predict(h2o_df).as_data_frame()
    finally:
        # Always release the cluster, even when loading or scoring fails.
        h2o.cluster().shutdown()

    # Join ids and predictions by position. Both sides get a fresh 0..n-1 index
    # because concat(axis=1) aligns on index labels, and the input frame may
    # carry a non-default index.
    output = pd.concat(
        [df["index"].reset_index(drop=True), predictions.reset_index(drop=True)],
        axis=1,
    )
    output.columns = ['index', 'label', 'probability_0', 'probability_1']

    # Save the id/probability pairs; rename() returns a new frame, so the
    # returned `output` keeps its original column names.
    result_df_to_save = output[['index', 'probability_1']].rename(columns={'index': 'ID'})
    result_df_to_save.to_csv('result_to_check_df.csv', index=False)

    return output



In [6]:
import pandas as pd

# Input CSV to score, relative to the notebook's working directory.
HOLDOUT_CSV = '../data/SBA_loans_project_2_holdout_students_valid.csv'

# Read the records and run them through the scoring function.
data = pd.read_csv(HOLDOUT_CSV)
result_df = project_2_scoring(data)

# Preview the first rows of the scored output.
result_df.head()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.24" 2024-07-16; OpenJDK Runtime Environment (build 11.0.24+8-post-Ubuntu-1ubuntu322.04); OpenJDK 64-Bit Server VM (build 11.0.24+8-post-Ubuntu-1ubuntu322.04, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.10/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpqzfkzws5
  JVM stdout: /tmp/tmpqzfkzws5/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpqzfkzws5/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.3
H2O_cluster_version_age:,7 months and 18 days
H2O_cluster_name:,H2O_from_python_unknownUser_zmijjz
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,4 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
Export File progress: |



██████████████████████████████████████████████████████████| (done) 100%
H2O session _sid_92e6 closed.


Unnamed: 0,index,label,probability_0,probability_1
0,0,0,0.947973,0.052027
1,1,0,0.834189,0.165811
2,2,0,0.838573,0.161427
3,3,0,0.979398,0.020602
4,4,0,0.822446,0.177554


In [7]:
# Display the scored frame produced by project_2_scoring in the previous cell.
result_df

Unnamed: 0,index,label,probability_0,probability_1
0,0,0,0.947973,0.052027
1,1,0,0.834189,0.165811
2,2,0,0.838573,0.161427
3,3,0,0.979398,0.020602
4,4,0,0.822446,0.177554
...,...,...,...,...
99803,99803,1,0.708789,0.291211
99804,99804,0,0.888892,0.111108
99805,99805,0,0.955747,0.044253
99806,99806,0,0.865310,0.134690
