# Correlation Analysis

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.preprocessing import MinMaxScaler,LabelEncoder, StandardScaler, RobustScaler
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score,roc_auc_score
from lightgbm import LGBMClassifier

In [None]:
# Suppress every warning category for the rest of the session.
warnings.simplefilter("ignore")

# Lift pandas' display limits so wide and long frames are printed in full.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [None]:
# Load the raw competition tables: transaction and identity tables for both
# the train and test splits, plus the sample submission file.
train_transaction = pd.read_csv(
    '../input/ieee-fraud-detection/train_transaction.csv'
)
train_identity = pd.read_csv(
    '../input/ieee-fraud-detection/train_identity.csv'
)
test_transaction = pd.read_csv(
    '../input/ieee-fraud-detection/test_transaction.csv'
)
test_identity = pd.read_csv(
    '../input/ieee-fraud-detection/test_identity.csv'
)
sample_submission = pd.read_csv(
    '../input/ieee-fraud-detection/sample_submission.csv'
)

In [None]:
# Left-join the identity table onto the transactions by TransactionID, so
# every transaction row is kept even when it has no identity record.
train_df = pd.merge(train_transaction, train_identity, how="left", on="TransactionID")

test_df = pd.merge(test_transaction, test_identity, how="left", on="TransactionID")

In [None]:
# Replace every hyphen in the test column names with an underscore.
test_df = test_df.rename(columns=lambda name: name.replace("-", "_"))

In [None]:
def high_correlated_cols(dataframe, plot=True, corr_th=0.85):
    """Return numeric columns that are highly correlated with an earlier column.

    Only the numeric columns of ``dataframe`` are considered. Their pairwise
    correlation matrix is computed with ``DataFrame.corr()`` and a column is
    flagged when its absolute correlation with any column positioned before
    it exceeds ``corr_th``. The first column of a correlated pair is therefore
    never flagged because of that pair.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input data; non-numeric columns are ignored.
    plot : bool, default True
        When true, draw an annotated heatmap of the signed correlation matrix.
        The flagged columns are returned either way.
    corr_th : float, default 0.85
        Absolute-correlation threshold (strictly greater than).

    Returns
    -------
    list
        Labels of the flagged columns, in their original column order. Empty
        when there are fewer than two numeric columns.
    """
    numeric_frame = dataframe.select_dtypes(include=[np.number])

    corr = numeric_frame.corr()
    # Keep only the strict upper triangle: each pair is inspected once and a
    # column is only compared against the columns to its left.
    upper_mask = np.triu(np.ones(corr.shape), k=1).astype(bool)
    upper_triangle = corr.abs().where(upper_mask)

    # Column-wise reduction done by pandas; masked-out / NaN cells compare as
    # False and so never flag a column.
    exceeds = (upper_triangle > corr_th).any()
    drop_list = [col for col, flagged in exceeds.items() if flagged]

    # An empty correlation matrix (no numeric columns) cannot be drawn.
    if plot and not corr.empty:
        sns.set(rc={'figure.figsize': (25, 25)})
        sns.heatmap(corr, cmap="RdBu_r", annot=True, fmt='.2f', square=True, linewidths=1, vmin=-1, vmax=1)
        plt.show()

    return drop_list

# V features

In [None]:
# Correlation screen for V1-V25 together with the target. The helper returns
# the flagged columns even when it plots, so a single call both draws the
# heatmap and captures the list instead of computing the matrix twice.
cols = [f"V{idx}" for idx in range(1, 26)] + ["isFraud"]
drop_list1 = high_correlated_cols(train_df[cols], plot=True)


In [None]:
# Show the columns flagged in the V1-V25 chunk.
drop_list1

In [None]:
# Correlation screen for V26-V50 plus the target; one call plots the heatmap
# and returns the flagged columns (no second correlation pass).
cols = [f"V{idx}" for idx in range(26, 51)] + ["isFraud"]
drop_list2 = high_correlated_cols(train_df[cols], plot=True)

In [None]:
# Show the columns flagged in the V26-V50 chunk.
drop_list2

In [None]:
# Correlation screen for V51-V75 plus the target; one call plots the heatmap
# and returns the flagged columns (no second correlation pass).
cols = [f"V{idx}" for idx in range(51, 76)] + ["isFraud"]
drop_list3 = high_correlated_cols(train_df[cols], plot=True)

In [None]:
# Show the columns flagged in the V51-V75 chunk.
drop_list3

In [None]:
# Correlation screen for V76-V100 plus the target; one call plots the heatmap
# and returns the flagged columns (no second correlation pass).
cols = [f"V{idx}" for idx in range(76, 101)] + ["isFraud"]
drop_list4 = high_correlated_cols(train_df[cols], plot=True)

In [None]:
# Show the columns flagged in the V76-V100 chunk.
drop_list4

In [None]:
# Correlation screen for V101-V125 plus the target; one call plots the heatmap
# and returns the flagged columns (no second correlation pass).
cols = [f"V{idx}" for idx in range(101, 126)] + ["isFraud"]
drop_list5 = high_correlated_cols(train_df[cols], plot=True)

In [None]:
# Show the columns flagged in the V101-V125 chunk.
drop_list5

In [None]:
# Correlation screen for V126-V150 plus the target; one call plots the heatmap
# and returns the flagged columns (no second correlation pass).
cols = [f"V{idx}" for idx in range(126, 151)] + ["isFraud"]
drop_list6 = high_correlated_cols(train_df[cols], plot=True)

In [None]:
# Show the columns flagged in the V126-V150 chunk.
drop_list6

In [None]:
# Correlation screen for V151-V175 plus the target; one call plots the heatmap
# and returns the flagged columns (no second correlation pass).
cols = [f"V{idx}" for idx in range(151, 176)] + ["isFraud"]
drop_list7 = high_correlated_cols(train_df[cols], plot=True)

In [None]:
# Show the columns flagged in the V151-V175 chunk.
drop_list7

In [None]:
# Correlation screen for V176-V200 plus the target; one call plots the heatmap
# and returns the flagged columns (no second correlation pass).
cols = [f"V{idx}" for idx in range(176, 201)] + ["isFraud"]
drop_list8 = high_correlated_cols(train_df[cols], plot=True)

In [None]:
# Show the columns flagged in the V176-V200 chunk.
drop_list8

In [None]:
# Correlation screen for V201-V225 plus the target; one call plots the heatmap
# and returns the flagged columns (no second correlation pass).
cols = [f"V{idx}" for idx in range(201, 226)] + ["isFraud"]
drop_list9 = high_correlated_cols(train_df[cols], plot=True)

In [None]:
# Show the columns flagged in the V201-V225 chunk.
drop_list9

In [None]:
# Correlation screen for V226-V250 plus the target; one call plots the heatmap
# and returns the flagged columns (no second correlation pass).
cols = [f"V{idx}" for idx in range(226, 251)] + ["isFraud"]
drop_list10 = high_correlated_cols(train_df[cols], plot=True)

In [None]:
# Show the columns flagged in the V226-V250 chunk.
drop_list10

In [None]:
# Correlation screen for V251-V275 plus the target; one call plots the heatmap
# and returns the flagged columns (no second correlation pass).
cols = [f"V{idx}" for idx in range(251, 276)] + ["isFraud"]
drop_list11 = high_correlated_cols(train_df[cols], plot=True)

In [None]:
# Show the columns flagged in the V251-V275 chunk.
drop_list11

In [None]:
# Correlation screen for V276-V300 plus the target; one call plots the heatmap
# and returns the flagged columns (no second correlation pass).
cols = [f"V{idx}" for idx in range(276, 301)] + ["isFraud"]
drop_list12 = high_correlated_cols(train_df[cols], plot=True)

In [None]:
# Show the columns flagged in the V276-V300 chunk.
drop_list12

In [None]:
# Correlation screen for V301-V325 plus the target; one call plots the heatmap
# and returns the flagged columns (no second correlation pass).
cols = [f"V{idx}" for idx in range(301, 326)] + ["isFraud"]
drop_list13 = high_correlated_cols(train_df[cols], plot=True)

In [None]:
# Show the columns flagged in the V301-V325 chunk.
drop_list13

In [None]:
# Correlation screen for the last chunk, V326-V339, plus the target; one call
# plots the heatmap and returns the flagged columns (no second correlation
# pass).
cols = [f"V{idx}" for idx in range(326, 340)] + ["isFraud"]
drop_list14 = high_correlated_cols(train_df[cols], plot=True)

In [None]:
# Show the columns flagged in the V326-V339 chunk.
drop_list14

In [None]:
# Flatten the fourteen per-chunk results into a single list, keeping chunk
# order and the order of columns inside each chunk.
drop_list_vfeatures = [
    col
    for chunk in (
        drop_list1, drop_list2, drop_list3, drop_list4, drop_list5,
        drop_list6, drop_list7, drop_list8, drop_list9, drop_list10,
        drop_list11, drop_list12, drop_list13, drop_list14,
    )
    for col in chunk
]

In [None]:
# Number of columns flagged across the per-chunk screens.
len(drop_list_vfeatures)

In [None]:
# Names of all V columns, "V1" through "V339".
vfeatures = [f"V{number}" for number in range(1, 340)]

## Re-analyzing the remaining V variables

In [None]:
# Second pass: run the correlation screen over all V columns that survived
# the per-chunk screens, this time without plotting.
already_flagged = frozenset(drop_list_vfeatures)
reduce_vfeatures = [name for name in vfeatures if name not in already_flagged]
drop_list_last = high_correlated_cols(train_df[reduce_vfeatures], plot=False)

In [None]:
# Show the columns flagged by the second-pass screen.
drop_list_last

In [None]:
# Append the second-pass columns to the running drop list (builds a new list).
drop_list_vfeatures = [*drop_list_vfeatures, *drop_list_last]

In [None]:
# Number of V columns currently in the drop list.
len(drop_list_vfeatures)

In [None]:
# V columns that are not in the drop list, in their original order.
flagged_v_columns = frozenset(drop_list_vfeatures)
reduce_vfeatures = [name for name in vfeatures if name not in flagged_v_columns]

# ID columns

In [None]:
# Correlation screen for the columns whose name starts with "id", together
# with the target. `startswith` also copes with names shorter than two
# characters, and a single helper call both plots the heatmap and returns the
# flagged columns instead of computing the correlation matrix twice.
id_cols = [col for col in train_df.columns if col.startswith('id')] + ["isFraud"]
drop_list_id = high_correlated_cols(train_df[id_cols], plot=True)

In [None]:
# Show the columns flagged among the "id" columns.
drop_list_id

# C columns

In [None]:
# Correlation screen for the columns whose name starts with an upper-case "C",
# together with the target. A single helper call both plots the heatmap and
# returns the flagged columns instead of computing the correlation matrix
# twice.
c_cols = [col for col in train_df.columns if col.startswith('C')] + ["isFraud"]
drop_list_c = high_correlated_cols(train_df[c_cols], plot=True)

In [None]:
# Show the columns flagged among the "C" columns.
drop_list_c

# D columns

In [None]:
# Correlation screen for the columns whose name starts with an upper-case "D",
# together with the target. The prefix test selects every such column; any
# non-numeric ones are ignored by high_correlated_cols, which only looks at
# numeric dtypes. A single helper call both plots the heatmap and returns the
# flagged columns instead of computing the correlation matrix twice.
d_cols = [col for col in train_df.columns if col.startswith('D')] + ["isFraud"]
drop_list_d = high_correlated_cols(train_df[d_cols], plot=True)

In [None]:
# Show the columns flagged among the "D" columns.
drop_list_d

# V Variables with Low Effect by Feature Importance

In [None]:
# Collect every column whose name contains an upper-case "V".
# NOTE(review): this is a substring test, not a "V<number>" match - confirm
# no other column name contains "V".
vfeatures_importance = [name for name in train_df.columns if 'V' in name]

In [None]:
# Add the target to the selection, mutating the list in place.
vfeatures_importance.extend(['isFraud'])

In [None]:
# Number of selected column names.
len(vfeatures_importance)

In [None]:
# Positional hold-out split: the first 412785 rows are used for fitting and
# the remaining rows for validation.
v_train = train_df.iloc[:412785]
v_cv = train_df.iloc[412785:]

In [None]:
# Restrict both splits to the selected columns.
v_train = v_train.loc[:, vfeatures_importance]
v_cv = v_cv.loc[:, vfeatures_importance]

In [None]:
# Separate the target ("isFraud") from the feature matrix for each split.
v_train_y = v_train['isFraud']
v_train_x = v_train.drop(columns=['isFraud'])
v_cv_y = v_cv['isFraud']
v_cv_x = v_cv.drop(columns=['isFraud'])

In [None]:
# Impute missing values with the per-column means of the training split only.
# The validation split reuses those same statistics, so it is transformed
# exactly like the data the model is fitted on and no validation-set
# information enters the preprocessing.
v_train_means = v_train_x.mean()
v_train_x.fillna(v_train_means, inplace=True)
v_cv_x.fillna(v_train_means, inplace=True)

In [None]:
# Fit a LightGBM classifier with its default settings on the training split.
rf = LGBMClassifier()
rf.fit(v_train_x, v_train_y)

# Predicted class probabilities for both splits.
predict_y_ = rf.predict_proba(v_train_x)
predict_y = rf.predict_proba(v_cv_x)

# ROC AUC computed from the second probability column of each prediction.
for auc_label, y_true, y_proba in (
    ('train auc:', v_train_y, predict_y_),
    ('cv auc:', v_cv_y, predict_y),
):
    print(auc_label, roc_auc_score(y_true, y_proba[:, 1]))

In [None]:
# Importance scores of the fitted model, one per training column.
g = rf.feature_importances_

# Pair every score with the column it belongs to (the model was fitted on
# v_train_x, so the scores follow v_train_x.columns) instead of rebuilding
# "V<position>" names, which silently mislabels features as soon as the
# column set or order changes. Columns scoring below 10 are marked for
# removal.
# NOTE(review): the scale of the threshold depends on the classifier's
# importance type (presumably split counts, the LightGBM default) - confirm.
vremove = [
    feature
    for feature, importance in zip(v_train_x.columns, g)
    if importance < 10
]

In [None]:
# Take each low-importance column out of the selection (in place) and log it.
# list.remove raises ValueError if a name is no longer in the list.
for low_importance_col in vremove:
    vfeatures_importance.remove(low_importance_col)
    print('removed:', low_importance_col)

In [None]:
# Narrow the correlation-based drop list to the columns that are also in
# vremove; the result follows the order of vremove.
# NOTE(review): this keeps the intersection of the two criteria, not the
# union - confirm that is the intended selection.
correlated_v_columns = set(drop_list_vfeatures)
drop_list_vfeatures = [name for name in vremove if name in correlated_v_columns]

# Final

In [None]:
# Merge the per-family drop lists. The target is filtered out explicitly: the
# id/C/D correlation screens include "isFraud" as a candidate column, so it
# can be flagged, and dropping it would delete the label from train_df (and
# presumably raise KeyError on test_df, which is not expected to carry it).
drop_total = [
    col
    for col in [*drop_list_vfeatures, *drop_list_id, *drop_list_c, *drop_list_d]
    if col != "isFraud"
]
drop_total

In [None]:
# Training columns that are not in the drop list, in their original order.
columns_to_drop = frozenset(drop_total)
reduce_total = [name for name in train_df.columns if name not in columns_to_drop]
reduce_total

In [None]:
# Training columns that are not in the low-importance list, in original order.
low_importance_columns = frozenset(vremove)
new_cols = [name for name in train_df.columns if name not in low_importance_columns]

In [None]:
# Test columns that also appear in new_cols, in test_df order.
kept_train_columns = frozenset(new_cols)
new_cols2 = [name for name in test_df.columns if name in kept_train_columns]

In [None]:
# Remove the selected columns from the training frame in place.
train_df.drop(columns=drop_total, inplace=True)

In [None]:
# Remove the same columns from the test frame in place; a name missing from
# test_df raises KeyError.
test_df.drop(columns=drop_total, inplace=True)

In [None]:
# (rows, columns) of the training frame.
train_df.shape

In [None]:
# (rows, columns) of the test frame.
test_df.shape