## Feature WoE and IV


In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tabulate import tabulate
import gc
import warnings
from src.woe_cl import Woe_Iv
from time import sleep

warnings.filterwarnings("ignore")

In [56]:
# Load the processed training frame; every cell below reads (and some mutate) `train`.
# The path is relative, so it resolves against the notebook's working directory.
train = pd.read_parquet("../data/processed/train_w_labels.parquet")

In [57]:
class Woe_Iv:
    """Compute Weight of Evidence (WoE) and Information Value (IV) per feature.

    Everything is computed eagerly by the constructor:

    * ``woeDF``    - one row per (feature, bin) with counts, WoE and IV.
    * ``newDF``    - one row per feature with its total IV.
    * ``iv_dicts`` - two ragged columns, ``not_useful`` / ``useful``, listing
      feature names by IV band; the target name is appended to ``useful``.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns plus the target column. It is only read, not modified.
    target : str
        Name of the target column; its values are summed per bin to count
        events, so it is expected to be numeric (0/1 for a binary target).
    bins : int
        Number of quantile bins requested from ``pd.qcut`` for numeric
        features with more than 10 distinct values.
    """

    # A feature is "useful" when IV_USEFUL_MIN <= IV < IV_USEFUL_MAX.
    # Values at or above the upper bound are rejected as well.
    IV_USEFUL_MIN = 0.02
    IV_USEFUL_MAX = 0.5

    def __init__(self, data, target, bins):
        self.data = data
        self.target = target
        self.bins = bins
        self.newDF = pd.DataFrame()
        self.woeDF = pd.DataFrame()
        self.iv_dicts = pd.DataFrame()
        self.cols = self.data.columns
        self.lst = list()  # unused by this class; kept for backward compatibility
        self.woe_df_iv_df()
        self.create_iv_features()

    def woe_df_iv_df(self):
        """Build ``woeDF`` (per-bin table) and ``newDF`` (per-feature IV).

        Both attributes are rebuilt from scratch on every call, so calling the
        method again does not duplicate rows.
        """
        iv_rows = []
        woe_tables = []

        for ivars in self.cols[~self.cols.isin([self.target])]:
            column = self.data[ivars]

            # Numeric features with many distinct values are quantile-binned;
            # everything else is grouped on its raw values.
            if column.dtype.kind in "bifc" and len(np.unique(column)) > 10:
                x = pd.qcut(column, self.bins, duplicates="drop")
            else:
                x = column

            d0 = pd.DataFrame({"x": x, "y": self.data[self.target]})
            # Stringify the group key so missing values become the label "nan"
            # and intervals become labels such as "(0.0, 1.0]".
            d0 = d0.astype({"x": str})
            d = d0.groupby("x", as_index=False, dropna=False).agg(
                {"y": ["count", "sum"]}
            )
            d.columns = ["Cutoff", "N", "Events"]
            d.insert(loc=0, column="Variable", value=ivars)

            # Counts are floored at 0.5 so an empty class in a bin does not
            # produce log(0) or a division by zero in the WoE.
            d["% of Events"] = np.maximum(d["Events"], 0.5) / d["Events"].sum()
            d["Non-Events"] = d["N"] - d["Events"]
            d["% of Non-Events"] = (
                np.maximum(d["Non-Events"], 0.5) / d["Non-Events"].sum()
            )
            d["WoE"] = np.log(d["% of Non-Events"] / d["% of Events"])
            d["IV"] = d["WoE"] * (d["% of Non-Events"] - d["% of Events"])

            iv_rows.append(
                pd.DataFrame(
                    {"Variable": [ivars], "IV": [d["IV"].sum()]},
                    columns=["Variable", "IV"],
                )
            )
            woe_tables.append(d)

        if iv_rows:
            # One concat at the end instead of growing the frames inside the
            # loop. The per-feature indexes are kept (not reset), as before.
            self.newDF = pd.concat(iv_rows, axis=0)
            self.woeDF = pd.concat(woe_tables, axis=0)
        else:
            self.newDF = pd.DataFrame()
            self.woeDF = pd.DataFrame()

    def create_iv_features(self):
        """Split the features into ``useful`` / ``not_useful`` by total IV.

        A feature is useful when ``IV_USEFUL_MIN <= IV < IV_USEFUL_MAX``;
        anything else, including a NaN IV, is not useful. The target column
        name is appended to ``useful``. The result is stored in ``iv_dicts``.
        """
        iv_relevance_dict = {"not_useful": [], "useful": []}

        if not self.newDF.empty:
            for name, iv in zip(self.newDF["Variable"], self.newDF["IV"]):
                is_useful = self.IV_USEFUL_MIN <= iv < self.IV_USEFUL_MAX
                bucket = "useful" if is_useful else "not_useful"
                iv_relevance_dict[bucket].append(name)

        # Use the configured target name rather than a hard-coded "target".
        iv_relevance_dict["useful"].append(self.target)

        # The two lists have different lengths; wrapping each in a Series lets
        # pandas pad the shorter column with NaN.
        self.iv_dicts = pd.DataFrame(
            {k: pd.Series(v, dtype=object) for k, v in iv_relevance_dict.items()}
        )

    def split_max(self):
        """Add a numeric ``var_max`` column to ``woeDF`` parsed from ``Cutoff``.

        * Interval labels such as ``"(0.0, 1.5]"`` give the decimal number that
          follows the whitespace, i.e. the upper bound (sign included).
        * Labels that are a bare integer such as ``"7"`` give that integer.
        * Anything else (dates, ``"nan"``, free text) gives NaN.

        This method is not called by the constructor.
        """
        cutoff = self.woeDF["Cutoff"].astype(str)

        upper_bound = pd.to_numeric(
            cutoff.str.extract(r"\s([-+]?\d+\.\d+)", expand=False), errors="coerce"
        )
        whole_number = pd.to_numeric(
            cutoff.str.extract(r"^([-+]?\d+)$", expand=False), errors="coerce"
        )

        # woeDF carries a repeated (per-feature) index, so combine the two
        # candidates positionally instead of relying on label alignment.
        parts = pd.DataFrame(
            {"max": upper_bound.to_numpy(), "test": whole_number.to_numpy()}
        )
        self.woeDF["var_max"] = parts.sum(axis=1, min_count=1).to_numpy()


In [58]:
# Run the WoE/IV analysis on every column of `train` except "target", asking for
# 30 quantile bins on numeric columns with more than 10 distinct values.
# The constructor computes everything eagerly (woeDF, newDF and iv_dicts).
# NOTE(review): no other column is excluded, so S_2 is scored as a feature too
# (it shows up in the woeDF output below) - confirm that this is intended.
woe = Woe_Iv(train, "target", 30)


In [59]:
woe.woeDF

Unnamed: 0,Variable,Cutoff,N,Events,% of Events,Non-Events,% of Non-Events,WoE,IV
0,S_2,2018-03-01,12128,3312,0.027872,8816,0.025923,-0.072503,0.000141
1,S_2,2018-03-02,10562,2988,0.025146,7574,0.022271,-0.121401,0.000349
2,S_2,2018-03-03,22887,6186,0.052058,16701,0.049108,-0.058339,0.000172
3,S_2,2018-03-04,8918,2364,0.019894,6554,0.019272,-0.031798,0.000020
4,S_2,2018-03-05,10081,2725,0.022932,7356,0.021630,-0.058471,0.000076
...,...,...,...,...,...,...,...,...,...
1,D_145,"(0.0, 1.0]",31142,10796,0.090854,20346,0.059826,-0.417810,0.012964
2,D_145,"(1.0, 2.0]",15186,5688,0.047868,9498,0.027928,-0.538796,0.010743
3,D_145,"(2.0, 3.0]",6945,2456,0.020669,4489,0.013200,-0.448423,0.003349
4,D_145,"(3.0, 6.0]",13587,5439,0.045772,8148,0.023959,-0.647341,0.014121


In [60]:
woe.iv_dicts

Unnamed: 0,not_useful,useful
0,S_2,D_42
1,P_2,B_5
2,D_39,D_46
3,B_1,D_49
4,B_2,B_8
...,...,...
92,D_144,D_141
93,,D_142
94,,D_143
95,,D_145


In [61]:
# Persist the useful / not_useful feature table. `index=False` is not passed,
# so the DataFrame index is written as an unnamed first column.
woe.iv_dicts.to_csv("../reports/iv_features_30bins.csv")

In [53]:
tets

Unnamed: 0.1,Unnamed: 0,not_useful,useful
0,0,B_1,D_39
1,1,B_2,D_42
2,2,R_1,B_5
3,3,S_3,D_46
4,4,D_41,D_49
...,...,...,...
91,91,D_126,target
92,92,B_41,
93,93,R_28,
94,94,D_141,


In [52]:
# Load an IV feature table saved earlier. This is iv_features.csv, not the
# iv_features_30bins.csv file written by the cell above.
# NOTE(review): this cell (In [52]) comes after the cell that displays `tets`
# (In [53]); on a fresh top-to-bottom run `tets` would be undefined there.
tets = pd.read_csv("../reports/iv_features.csv")

In [3]:
# cat_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
# num_cols = [col for col in train.columns if col not in cat_cols and col not in ["customer_ID", "S_2"]]

In [4]:
# cont_features = [col for col in train.columns if col in num_cols and len(np.unique(train[col]))>10]

In [5]:
def iv_woe(data, feature, target, bins=10, show_woe=False, show_iv=False):
    """Compute the WoE table and total IV of one feature against ``target``.

    Side effect: when ``feature`` is numeric with more than 10 distinct values,
    ``data[feature]`` is overwritten IN PLACE with the string labels of its
    quantile bins (missing values become ``"nan"``). Callers can therefore
    match the column against the returned ``Cutoff`` values. Other features
    are left untouched and grouped on their raw values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding ``feature`` and ``target``; may be mutated, see above.
    feature : str
        Column to analyse.
    target : str
        Target column; its per-bin sum is reported as ``Good``.
    bins : int, default 10
        Number of quantile bins requested from ``pd.qcut``.
    show_woe, show_iv : bool
        Accepted for backward compatibility; currently ignored.

    Returns
    -------
    (iv_df, woe_df) : tuple of pandas.DataFrame
        ``iv_df`` has one row with columns ``Variable`` and ``IV``.
        ``woe_df`` has one row per bin with columns ``Variable, Cutoff, N,
        Good, % of Good, Bad, % of Bad, WoE, IV`` in that order.
    """
    column = data[feature]

    if column.dtype.kind in "bifc" and len(np.unique(column)) > 10:
        labels = pd.qcut(column, bins, duplicates="drop").astype(str)
        # Deliberate in-place overwrite: the caller relies on the column
        # holding the same labels that appear in "Cutoff".
        data[feature] = labels
        d0 = pd.DataFrame({"x": labels, "y": data[target]})
    else:
        d0 = pd.DataFrame({"x": column, "y": data[target]})

    d = d0.groupby("x", as_index=False, dropna=False).agg({"y": ["count", "sum"]})
    d.columns = ["Cutoff", "N", "Good"]
    d.insert(loc=0, column="Variable", value=feature)

    # Counts are floored at 0.5 so a bin with an empty class does not yield
    # log(0) or a division by zero.
    d["% of Good"] = np.maximum(d["Good"], 0.5) / d["Good"].sum()
    d["Bad"] = d["N"] - d["Good"]
    d["% of Bad"] = np.maximum(d["Bad"], 0.5) / d["Bad"].sum()
    d["WoE"] = np.log(d["% of Good"] / d["% of Bad"])
    d["IV"] = d["WoE"] * (d["% of Good"] - d["% of Bad"])

    iv_df = pd.DataFrame(
        {"Variable": [feature], "IV": [d["IV"].sum()]}, columns=["Variable", "IV"]
    )
    return iv_df, d

In [6]:
# Replace every feature column of `train` with its WoE encoding, in place.
# iv_woe() rewrites binned numeric columns to their bin labels, so after the
# call each value of train[col] is one of the "Cutoff" entries of `df`.
variaveis = train.columns
skipped = {"target", "customer_ID", "S_2"}  # label, id and date: not encoded
features = [col for col in variaveis if col not in skipped]

for l, col in enumerate(features):
    print("WoE e IV: {}".format(col))
    iv, df = iv_woe(train, col, "target", bins=20)
    print(l)
    print("\n")

    # Translate the whole column in one pass with a Cutoff -> WoE lookup.
    # The previous row-by-row `train.loc[train[col] == cutoff] = woe` loop
    # could re-match a WoE value it had just written against a later cutoff,
    # never matched NaN cutoffs (NaN == NaN is False), and scanned the column
    # once per bin.
    woe_by_cutoff = df.set_index("Cutoff")["WoE"]
    train[col] = train[col].map(woe_by_cutoff)

    del df, iv
    gc.collect()

WoE e IV: P_2
0


WoE e IV: D_39
1


WoE e IV: B_1
2


WoE e IV: B_2
3


WoE e IV: R_1
4


WoE e IV: S_3
5


WoE e IV: D_41
6


WoE e IV: B_3
7


WoE e IV: D_42
8


WoE e IV: D_43
9


WoE e IV: D_44
10


WoE e IV: B_4
11


WoE e IV: D_45
12


WoE e IV: B_5
13


WoE e IV: R_2
14


WoE e IV: D_46
15


WoE e IV: D_47
16


WoE e IV: D_48
17


WoE e IV: D_49
18


WoE e IV: B_6
19


WoE e IV: B_7
20


WoE e IV: B_8
21


WoE e IV: D_50
22


WoE e IV: D_51
23


WoE e IV: B_9
24


WoE e IV: R_3
25


WoE e IV: D_52
26


WoE e IV: P_3
27


WoE e IV: B_10
28


WoE e IV: D_53
29


WoE e IV: S_5
30


WoE e IV: B_11
31


WoE e IV: S_6
32


WoE e IV: D_54
33


WoE e IV: R_4
34


WoE e IV: S_7
35


WoE e IV: B_12
36


WoE e IV: S_8
37


WoE e IV: D_55
38


WoE e IV: D_56
39


WoE e IV: B_13
40


WoE e IV: R_5
41


WoE e IV: D_58
42


WoE e IV: S_9
43


WoE e IV: B_14
44


WoE e IV: D_59
45


WoE e IV: D_60
46


WoE e IV: D_61
47


WoE e IV: B_15
48


WoE e IV: S_11
49


WoE e IV: D_62
50


WoE e IV: D_

In [7]:
# Save the WoE-encoded training frame produced by the In [6] loop.
# NOTE(review): the file name says "30bins", but that loop calls iv_woe with
# bins=20 (30 is only used for the Woe_Iv report) - confirm which is intended.
train.to_parquet("../data/processed/train_woebalanced_30bins.parquet")