In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the processed training set from the project's parquet file (path is relative
# to the notebook's working directory).
train = pd.read_parquet("../data/processed/train_withlabels.parquet")

In [4]:
# Keep only the last row of each customer_ID group (in the frame's current row order)
# and make customer_ID the index. This rebinds `train`: after this cell customer_ID
# is the index, not a column.
# NOTE(review): "last row" is the most recent record only if the frame is already
# sorted chronologically within each customer — confirm against the upstream step.
train = train.groupby('customer_ID').tail(1).set_index('customer_ID')

In [5]:
# Columns handed to the IV analysis: everything except the first two columns.
# NOTE(review): the two skipped columns are chosen purely by position — confirm
# which columns they are before relying on this list.
features = train.columns.to_list()[2:]

In [7]:
# Class balance of the target column; the recorded output below shows
# 340,085 rows with target 0 and 118,828 rows with target 1.
train.target.value_counts()

0    340085
1    118828
Name: target, dtype: int64

In [8]:
def iv_woe(data, target, bins=10, show_woe=False, show_iv= False, split_max= False,
           iv_min=0.02, iv_max=0.5):
    """Compute Weight of Evidence (WoE) and Information Value (IV) for every feature.

    Each column of ``data`` other than ``target`` is grouped into cut-offs:
    numeric columns (dtype kind in ``'bifc'``) with more than 10 distinct values
    are quantile-binned with ``pd.qcut``; every other column is grouped on its raw
    values. Cut-off labels are converted to strings before grouping.

    Per cut-off, ``N`` is the number of non-null target values, ``Good`` is the sum
    of the target (the count of ``target == 1`` for a 0/1 target) and
    ``Bad = N - Good``. Counts of zero are replaced by 0.5 before taking shares so
    that the logarithm in ``WoE = ln(% of Good / % of Bad)`` stays finite.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the feature columns and the ``target`` column.
    target : str
        Name of the target column in ``data``.
    bins : int, default 10
        Number of quantile bins requested from ``pd.qcut`` (duplicate edges are dropped).
    show_woe : bool, default False
        Print the WoE table of each feature as it is computed.
    show_iv : bool, default False
        Print the information value of each feature as it is computed.
    split_max : bool, default False
        Add a ``var_max`` column to the WoE table holding the numeric upper bound of
        each cut-off: the right edge of an interval label such as ``"(-2.0, -1.0]"``,
        or the value itself for a plain numeric label such as ``"3"`` or ``"1.0"``.
        Non-numeric labels give NaN.
    iv_min, iv_max : float, default 0.02 and 0.5
        A feature is listed as "useful" when ``iv_min <= IV < iv_max``; anything
        else (including NaN or infinite IV) is listed as "not_useful".

    Returns
    -------
    newDF : pandas.DataFrame
        One row per feature with columns ``Variable`` and ``IV``.
    woeDF : pandas.DataFrame
        The per-cut-off WoE tables of all features stacked vertically. The index
        restarts at 0 for every feature, so it is not unique.
    iv_relevance_dict : dict
        ``{"not_useful": [...], "useful": [...]}`` with feature names in column
        order; ``target`` is appended to the "useful" list.
    """
    import re

    iv_relevance_dict = {"not_useful": [], "useful": []}

    # Per-feature results are collected in lists and concatenated once at the end,
    # instead of growing a DataFrame inside the loop.
    iv_frames, woe_frames = [], []

    cols = data.columns
    for ivars in cols[~cols.isin([target])]:
        column = data[ivars]

        # nunique(dropna=False) counts all missing values as a single category.
        if column.dtype.kind in 'bifc' and column.nunique(dropna=False) > 10:
            x_values = pd.qcut(column, bins, duplicates='drop')
        else:
            x_values = column
        d0 = pd.DataFrame({'x': x_values, 'y': data[target]}).astype({"x": str})

        d = d0.groupby("x", dropna=False)["y"].agg(["count", "sum"]).reset_index()
        d.columns = ['Cutoff', 'N', 'Good']
        d.insert(loc=0, column='Variable', value=ivars)

        # 0.5 stands in for empty cells so the log ratio below is always defined.
        d['% of Good'] = np.maximum(d['Good'], 0.5) / d['Good'].sum()
        d['Bad'] = d['N'] - d['Good']
        d['% of Bad'] = np.maximum(d['Bad'], 0.5) / d['Bad'].sum()
        d['WoE'] = np.log(d['% of Good'] / d['% of Bad'])
        d['IV'] = d['WoE'] * (d['% of Good'] - d['% of Bad'])
        iv_total = d['IV'].sum()

        iv_frames.append(
            pd.DataFrame({"Variable": [ivars], "IV": [iv_total]}, columns=["Variable", "IV"])
        )
        woe_frames.append(d)

        # Show IV values
        if show_iv:
            print("Information value of " + str(ivars) + " is " + str(round(iv_total, 6)))

        # Show WoE table
        if show_woe:
            print(d)

        # Sort the feature into the useful / not useful lists.
        if iv_min <= iv_total < iv_max:
            iv_relevance_dict["useful"].append(ivars)
        else:
            iv_relevance_dict["not_useful"].append(ivars)

    # The target column is kept alongside the useful features so the list can be
    # used directly to select columns from the training frame.
    iv_relevance_dict["useful"].append(target)

    newDF = pd.concat(iv_frames, axis=0) if iv_frames else pd.DataFrame()
    woeDF = pd.concat(woe_frames, axis=0) if woe_frames else pd.DataFrame()

    if split_max and "Cutoff" in woeDF.columns:
        number = r"[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?"
        # Right edge of an interval label, e.g. "(-2.0, -1.0]" -> "-1.0".
        interval_upper = re.compile(r",\s*(" + number + r")\s*[\])]\s*$")
        # A label that is just a number, e.g. "3", "-1" or "1.0".
        plain_number = re.compile(r"^\s*(" + number + r")\s*$")

        def upper_bound(cutoff):
            # Missing labels may survive the string cast as non-str values.
            if not isinstance(cutoff, str):
                return np.nan
            match = interval_upper.search(cutoff) or plain_number.match(cutoff)
            return float(match.group(1)) if match else np.nan

        # Assign from a plain list: woeDF's index is not unique, so label
        # alignment must be avoided.
        woeDF["var_max"] = [upper_bound(cutoff) for cutoff in woeDF["Cutoff"]]

    return newDF, woeDF, iv_relevance_dict
   

# Run the IV/WoE analysis on the selected columns with 10 quantile bins.
# train[features] must still contain the 'target' column, because iv_woe reads
# the target from the frame it is given.
iv_values, woeDF, iv_relevance_dict = iv_woe(train[features], 'target', bins=10, show_woe=False)


In [11]:
# Turn the relevance lists into a frame with one column per key; wrapping each list
# in a Series lets lists of different lengths share a frame (the shorter column is
# padded with NaN).
iv_dicts = pd.DataFrame(
    {key: pd.Series(names) for key, names in iv_relevance_dict.items()}
)

In [13]:
# Persist the useful / not_useful feature lists to the reports folder.
iv_dicts.to_csv("../reports/iv_features.csv")

In [14]:
# Persist the stacked per-cut-off WoE tables to the reports folder.
woeDF.to_csv("../reports/woeDF.csv")

In [15]:
# Persist the per-feature information values to the reports folder.
iv_values.to_csv("../reports/iv_values.csv")