## Feature WOE && IV

In [8]:
import pandas as pd
import numpy as np

In [5]:
train = pd.read_parquet("../data/train.parquet")

In [9]:
def iv_woe(data, target, bins=10, show_woe=False):
    
    #Empty Dataframe
    newDF,woeDF = pd.DataFrame(), pd.DataFrame()
    
    #Extract Column Names
    cols = data.columns
    
    #Run WOE and IV on all the independent variables
    for ivars in cols[~cols.isin([target])]:
        if (data[ivars].dtype.kind in 'bifc') and (len(np.unique(data[ivars]))>10):
            binned_x = pd.qcut(data[ivars], bins,  duplicates='drop')
            d0 = pd.DataFrame({'x': binned_x, 'y': data[target]})
        else:
            d0 = pd.DataFrame({'x': data[ivars], 'y': data[target]})

        
        # Calculate the number of events in each group (bin)
        d = d0.groupby("x", as_index=False).agg({"y": ["count", "sum"]})
        d.columns = ['Cutoff', 'N', 'Events']
        
        # Calculate % of events in each group.
        d['% of Events'] = np.maximum(d['Events'], 0.5) / d['Events'].sum()

        # Calculate the non events in each group.
        d['Non-Events'] = d['N'] - d['Events']
        # Calculate % of non events in each group.
        d['% of Non-Events'] = np.maximum(d['Non-Events'], 0.5) / d['Non-Events'].sum()

        # Calculate WOE by taking natural log of division of % of non-events and % of events
        d['WoE'] = np.log(d['% of Events']/d['% of Non-Events'])
        d['IV'] = d['WoE'] * (d['% of Events'] - d['% of Non-Events'])
        d.insert(loc=0, column='Variable', value=ivars)
        print("Information value of " + ivars + " is " + str(round(d['IV'].sum(),6)))
        temp =pd.DataFrame({"Variable" : [ivars], "IV" : [d['IV'].sum()]}, columns = ["Variable", "IV"])
        newDF=pd.concat([newDF,temp], axis=0)
        woeDF=pd.concat([woeDF,d], axis=0)

        #Show WOE Table
        if show_woe == True:
            print(d)
    return newDF, woeDF

In [11]:
newdf, woedf = iv_woe(data=train, target='target')

Information value of customer_ID is 6.100698
Information value of S_2 is 0.008395
Information value of P_2 is 2.982891
Information value of D_39 is 0.10178
Information value of B_1 is 1.291884
Information value of B_2 is 1.289213
Information value of R_1 is 0.603611
Information value of S_3 is 0.836053
Information value of D_41 is 0.290369
Information value of B_3 is 1.332385
Information value of D_42 is 1.575721
Information value of D_43 is 0.50723
Information value of D_44 is 1.41223
Information value of B_4 is 1.16503
Information value of D_45 is 0.675468
Information value of B_5 is 0.323848
Information value of R_2 is 0.315512
Information value of D_46 is 0.335184
Information value of D_47 is 0.500697
Information value of D_48 is 1.971191
Information value of D_49 is 0.0
Information value of B_6 is 1.495013
Information value of B_7 is 1.576316
Information value of B_8 is 0.470344
Information value of D_50 is 0.523316
Information value of D_51 is 0.576941
Information value of B_9 is

In [26]:
newdf.to_csv("../reports/IV_analysis.csv")

In [None]:
cat_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']


num_cols = [c for c in train.columns if c not in [cat_cols, "customer_ID", "S_2", "target"]]

In [None]:
iv=IV()
df_iv = []
feature_iv= []

def woe_iv_dataframe(df):
   

    for c in df[FEATURES].columns:
        feats_dict={}
        
        if c in cat_cols:
            feats_dict={}
            feats_dict[c]= CategoricalFeature(df[FEATURES], c)
            feats_dict = list(feats_dict.values())
            iv_df, iv_value = iv.calculate_iv(feats_dict[0])
            df_iv.append(iv_value)
            feature_iv.append(c)
            #iv_df.to_csv(f"../reports/woe_iv_features/categorical_iv_woe_{c}.csv")          
            del feats_dict,iv_df
        else:
            feats_dict[c] = ContinuousFeature(df[FEATURES], c)
            feats_dict = list(feats_dict.values())
            iv_df, iv_value = iv.calculate_iv(feats_dict[0])
            df_iv.append(iv_value)
            feature_iv.append(c)
            #iv_df.to_csv(f"../reports/woe_iv_features/numerical_iv_woe_{c}.csv")          
            del feats_dict,iv_df
    print("End")
woe_iv_dataframe(train)


In [None]:
iv_values_df = pd.DataFrame()
iv_values_df["feature"]= feature_iv
iv_values_df["iv_value"] = df_iv

In [None]:
iv_values_df.head()

Unnamed: 0,feature,iv_value
0,P_2,3.003221
1,D_39,0.0
2,B_1,1.298186
3,B_2,1.300355
4,R_1,0.206707


In [None]:
path="C:/Users/brito/OneDrive/Documentos/1 - Data Science/3 - Projeto/04 - American Express/reports/woe_iv_features"
all_files=glob.glob(os.path.join(path, "*.csv"))

li=[]

for file in all_files:
    df= pd.read_csv(file, index_col=0, header=0)
    li.append(df) 
    

![Rules For Information Value(IV)](../references/iv_values_table.png)

In [None]:
#uselesse predictors <0.02 IV values
iv_drop_variables = iv_values_df[iv_values_df['iv_value']<= 0.02]['feature'].to_list()

In [None]:
iv_values_df[(iv_values_df['iv_value']<= 0.1 ) & (iv_values_df['iv_value']>= 0.02 )]['feature']

13       B_5
38      B_13
41       S_9
46      S_11
48      D_63
57       R_6
58      S_13
60      D_69
65      S_15
88      D_83
108     D_89
119     S_23
120     S_25
121     S_26
122    D_102
123    D_103
124    D_105
128     R_27
136     S_27
140    D_116
148    D_125
149    D_126
153     B_41
154     B_42
156    D_133
160    D_139
161    D_140
162    D_142
164    D_145
Name: feature, dtype: object

In [None]:
iv_values_df[iv_values_df['iv_value'] > 0.5]['feature'].to_list().append(iv_drop_variables)

In [None]:
# Large amount of features that are considered useless or to good to be true
len(iv_drop_variables)

70

## Any feature with strong correlation with target
### But they are highly correlated with each other as we've saw on previous analysis

### So, lets start creating a list of all highly correlated variables

In [2]:
train = pd.read_parquet("../data/processed/train.parquet")

In [None]:
# Create correlation matrix
corr_matrix = train.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find features with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.85)]

# Drop features 
#df.drop(to_drop, axis=1, inplace=True)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))


In [None]:
# let's save it for further analysis

with open("../data/processed/high_correlated_features.txt", "w") as file:
    for i in to_drop:
        file.write(i)
        file.write('\n')

## Feature selection with Boruta

In [4]:
x= train.drop(['customer_ID','S_2','target'], axis=1)
y= train.target

x = x.fillna(-127)

from boruta import BorutaPy
from sklearn.ensemble import RandomForestRegressor

forest = RandomForestRegressor(
    n_jobs = -1,
    max_depth=5
    )
boruta = BorutaPy(
    estimator=forest,
    n_estimators='auto',
    max_iter=100,

)

boruta.fit(x.values, y.values)


## Spearman correlation 
* Same issue as boruta, no computer power available to deal with the problem

In [5]:
features= train.columns.to_list()[2:-1]

X= train[features]
y= train.target
X= X.fillna(-127)


from scipy.stats import spearmanr


df_spearman= train.copy()
df_spearman= df_spearman.fillna(-127)
df_spearman.drop(["customer_ID",'S_2'], inplace=True, axis=1)


import scipy

df = pd.DataFrame()
feat1s=[]
feat2s=[]
corrs=[]
p_values=[]

for feat1 in df_spearman.columns:
    for feat2 in df_spearman.columns:
        if feat1 != feat2:
            feat1s.append(feat1)
            feat2s.append(feat2)
            corr, p_value = spearmanr(df_spearman[feat1], df_spearman[feat2])
            corrs.append(corr)
            p_values.append(p_value) 

df['Feature_1'] = feat1s
df['Feature_2'] = feat2s
df['Correlation'] = corrs
df['p_value'] = p_values
df

df.to_csv("pearson.csv")

## Create dataframe based on the corr matrix

In [11]:
corr_features = pd.read_csv("../reports/high_correlated_features.txt", header=0,names=["features"])
features_to_drop = corr_features.features.to_list()

In [12]:
train_fs = train.drop(labels= features_to_drop, axis= 1)

In [15]:
train_fs.to_parquet("../data/processed/train_fs.parquet")