In [2]:
import argparse
import pandas as pd
from scipy.stats.mstats import winsorize
import lightgbm as lgb
import numpy as np
import pickle
import joblib

In [2]:
# Model input columns. The order is kept exactly as in the original selection;
# presumably it matches the order used when the saved models were trained —
# confirm against the training notebook before reordering.
_FEATURE_COLUMNS = [
    'current_ratio_st',
    'roa',
    'cash_asst_ratio',
    'current_asst_ratio',
    'current_asst_liability_ratio',
    'debt_coverage_ratio',
    'intangible_asst_ratio',
    'capital_turnover_ratio',
    'equity_asst_ratio',
    'roe',
    'ROCE',
    'leverage',
    'log_total_asset',
    'cash_liability_ratio',
    'interest_coverage_ratio',
    'finance_rev_ratio',
    'capital_employed',
    'extraord_ratio',
]


def _add_ratio_features(df):
    """Return a copy of `df` with the derived financial-ratio columns added."""
    df = df.copy()

    # Shared denominators, computed once instead of being re-summed per ratio.
    total_liabilities = df['asst_tot'] - df['eqty_tot']
    short_term_liabilities = (
        df['debt_bank_st'] + df['debt_fin_st'] + df['AP_st'] + df['debt_st']
    )

    df['current_asst_liability_ratio'] = df['asst_current'] / total_liabilities
    df['current_ratio_st'] = df['asst_current'] / short_term_liabilities
    df['intangible_asst_ratio'] = df['asst_intang_fixed'] / df['asst_tot']
    df['current_asst_ratio'] = df['asst_current'] / df['asst_tot']
    df['cash_liability_ratio'] = df['cf_operations'] / short_term_liabilities
    df['capital_employed'] = df['asst_tot'] - short_term_liabilities
    df['capital_turnover_ratio'] = (df['rev_operating'] - df['COGS']) / df['capital_employed']
    df['ROCE'] = df['ebitda'] / df['capital_employed']
    # log of a zero/negative total yields -inf/NaN; both are dealt with by the
    # winsorize + mean-imputation step that follows.
    df['log_total_asset'] = np.log(df['asst_tot'])
    df['debt_coverage_ratio'] = df['ebitda'] / df['debt_st']
    df['leverage'] = total_liabilities / df['eqty_tot']
    df['interest_coverage_ratio'] = df['ebitda'] / df['exp_financing']
    df['extraord_ratio'] = df['inc_extraord'] / df['rev_operating']
    df['finance_rev_ratio'] = df['inc_financing'] / df['rev_operating']
    df['equity_asst_ratio'] = df['eqty_tot'] / df['asst_tot']
    df['cash_asst_ratio'] = df['cf_operations'] / df['asst_tot']
    return df


def _winsorize_and_impute(features, limits=(0.2, 0.2)):
    """Winsorize every column at `limits`, then fill NaNs with the column mean."""
    cleaned = features.copy()
    for col in cleaned.columns:
        clipped = winsorize(cleaned[col].to_numpy(), limits=list(limits), nan_policy='omit')
        # Re-attach the frame's own index so the assignment can never be
        # misaligned (a bare pd.Series would get a fresh RangeIndex).
        column = pd.Series(clipped, index=cleaned.index)
        cleaned[col] = column.fillna(column.mean())
    return cleaned


def harness(data_path, model='rf', output_path='output.csv'):
    """Score a holdout CSV with a saved model and write default probabilities.

    Parameters
    ----------
    data_path : str or path-like
        CSV file containing the raw financial-statement columns used below
        (e.g. 'asst_tot', 'eqty_tot', 'debt_st', 'ebitda', 'roa', 'roe', ...).
    model : str, default 'rf'
        'lgb' loads the LightGBM booster from 'model/lgb_classifier.txt';
        any other value loads the joblib model from 'model/rfs_model.pkl'.
    output_path : str or path-like, default 'output.csv'
        Destination CSV. It holds the row index and one 'default_prob' column.

    Returns
    -------
    None
        The predictions are written to `output_path`.
    """
    # Load dataset
    raw = pd.read_csv(data_path)

    # Derive the ratios, drop irrelevant columns, then winsorize and fill NaNs.
    # Selecting from a copy avoids assigning into a slice of the raw frame.
    features = _winsorize_and_impute(_add_ratio_features(raw)[_FEATURE_COLUMNS])

    if model == 'lgb':
        clf = lgb.Booster(model_file='model/lgb_classifier.txt')
        default_prob = np.asarray(clf.predict(features))
    else:
        # NOTE(review): joblib.load unpickles arbitrary objects — only load
        # model files from a trusted source.
        clf = joblib.load('model/rfs_model.pkl')
        proba = np.asarray(clf.predict_proba(features))
        # predict_proba returns one column per class, so wrapping the whole
        # array in a single-column frame raises ValueError. Keep the second
        # column only — assumes "default" is the positive (second) class;
        # TODO confirm against clf.classes_.
        if proba.ndim == 2 and proba.shape[1] > 1:
            default_prob = proba[:, 1]
        else:
            default_prob = proba.ravel()

    pd.DataFrame(default_prob, columns=['default_prob']).to_csv(output_path)



In [None]:
# Score the holdout sample: put the path to the holdout CSV inside the call,
# e.g. harness('path/to/holdout.csv'); add model='lgb' to use the LightGBM model
# instead of the default one.
# NOTE: as written, the call below omits the required `data_path` argument and
# raises TypeError until a path is filled in.
harness()