In [None]:
import matplotlib as plt
import pandas as pd
import numpy as np
import seaborn as sb
from pathlib import Path
import matplotlib.pyplot as plot

In [None]:
# Root folder for this project's files; the two sub-folders below live inside it.
DIR = r"C:\Temp\ML_Fruad_Files"
DIR_PROTOCOL = DIR + r"\Protocol"
DIR_Neighborhood = DIR + r"\Neighborhood_Clusters"

In [None]:
# Ensure the protocol folder exists: missing parent folders are created too,
# and an already-existing folder is not treated as an error.
folder_path = Path(DIR_PROTOCOL)
folder_path.mkdir(exist_ok=True, parents=True)

In [None]:
# Load the DataFrame saved by the previous stage.
# The path is built with pathlib instead of an f-string containing a bare
# backslash: '\d' is not a valid escape sequence in a non-raw string.
# NOTE(review): unpickling executes arbitrary code from the file — only load
# pickles that were produced by a trusted stage of this pipeline.
df = pd.read_pickle(Path(DIR) / 'df_after_stage4.pkl')

# Print the column / dtype / non-null summary of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29210 entries, 0 to 29209
Data columns (total 75 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   is_female                            29210 non-null  int64  
 1   lat                                  29210 non-null  float64
 2   long                                 29210 non-null  float64
 3   amt                                  29210 non-null  float64
 4   is_fraud                             29210 non-null  bool   
 5   merch_lat                            29210 non-null  float64
 6   merch_long                           29210 non-null  float64
 7   num_neighborhoods                    29210 non-null  int64  
 8   distance_merch_cust                  29210 non-null  float64
 9   is_Risk Manager_Job                  29210 non-null  bool   
 10  trans_hour                           29210 non-null  int64  
 11  num_transactions            

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the predictor columns.
y = df['is_fraud']
X = df.drop(columns='is_fraud')

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Univariate analysis
from sklearn.feature_selection import f_classif

# F statistic and p-value of every feature against the target.
# NOTE(review): scored on the full X / y, not on the X_train / y_train split.
f_values, p_values = f_classif(X, y)

# One row per feature, holding its F-value and p-value.
feature_scores = pd.DataFrame(
    {
        'Feature': X.columns,
        'F-Value': f_values,
        'P-Value': p_values,
    }
)

# Keep the 20 features with the largest F-value and print them.
top_features = feature_scores.nlargest(n=20, columns='F-Value')
print(top_features)


                            Feature      F-Value        P-Value
24   category_group_Lifestyle(rare)  3641.207957   0.000000e+00
3                               amt  2458.258030   0.000000e+00
8               is_Risk Manager_Job  1213.284363  1.706621e-260
12                       min_amount  1056.977155  8.568675e-228
31      trans_hour_category_Evening   338.912709   2.936718e-75
9                        trans_hour   143.996018   4.259417e-33
29      trans_hour_category_Morning    99.331700   2.327301e-23
36                        cluster_1    64.879653   8.262230e-16
66           merchant_freq_Very_Low    38.951201   4.404978e-10
10                 num_transactions    32.117417   1.464960e-08
54                          month_4    27.833974   1.331258e-07
57                          month_7    23.402147   1.321010e-06
32                        age_21-50    17.965121   2.256801e-05
62                         month_12    17.832942   2.419010e-05
30    trans_hour_category_Afternoon    1

In [None]:
# Multivariate analysis

In [None]:
from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier

# Fit seven models on the full X / y and record, per model, whether each
# feature is selected (1) or not (0).

# Linear models: a feature counts as selected when its coefficient is non-zero.
lasso = Lasso(alpha=0.01).fit(X, y)
lasso_selected = (np.abs(lasso.coef_) > 0).astype(int)

# NOTE(review): Ridge shrinks coefficients but does not normally set them to
# exactly zero, so this flag is expected to be 1 for nearly every feature —
# confirm that this is the intended contribution to the vote.
ridge = Ridge(alpha=0.01).fit(X, y)
ridge_selected = (np.abs(ridge.coef_) > 0).astype(int)

# Tree-based models: a feature counts as selected when its importance is > 0.
# The fitted classifier is bound to `xgb_model` so that `xgb` keeps referring
# to the imported xgboost module instead of being rebound to an estimator.
xgb_model = xgb.XGBClassifier(random_state=42).fit(X, y)
xgb_selected = (xgb_model.feature_importances_ > 0).astype(int)

rf = RandomForestClassifier(random_state=42).fit(X, y)
rf_selected = (rf.feature_importances_ > 0).astype(int)

gb = GradientBoostingClassifier(random_state=42).fit(X, y)
gb_selected = (gb.feature_importances_ > 0).astype(int)

ab = AdaBoostClassifier(random_state=42).fit(X, y)
ab_selected = (ab.feature_importances_ > 0).astype(int)

dt = DecisionTreeClassifier(random_state=42).fit(X, y)
dt_selected = (dt.feature_importances_ > 0).astype(int)

# One row per feature, one 0/1 column per model.
selection_df = pd.DataFrame({
    'Feature': X.columns,
    'Lasso': lasso_selected,
    'Ridge': ridge_selected,
    'XGB': xgb_selected,
    'RandomForest': rf_selected,
    'GBC': gb_selected,
    'AdaBoost': ab_selected,
    'DecisionTree': dt_selected
})

# 'Sum' = number of models (0-7) that selected the feature.
selection_df['Sum'] = selection_df[['Lasso', 'Ridge', 'XGB', 'RandomForest', 'GBC', 'AdaBoost', 'DecisionTree']].sum(axis=1)

In [None]:
# Keep the features whose selection count ('Sum') is at least 5.
feature_lst = selection_df.loc[selection_df['Sum'] >= 5, 'Feature'].tolist()

# Make sure the target column is carried along with the selected features.
if "is_fraud" not in feature_lst:
    feature_lst.append('is_fraud')

I decided to take the results from the Ridge and Lasso models.

In [None]:
# Restrict the data to the columns listed in feature_lst, as an independent copy.
df_final = df.loc[:, feature_lst].copy()

# Print the column summary to verify the result.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29210 entries, 0 to 29209
Data columns (total 32 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   is_female                            29210 non-null  int64  
 1   lat                                  29210 non-null  float64
 2   long                                 29210 non-null  float64
 3   amt                                  29210 non-null  float64
 4   merch_lat                            29210 non-null  float64
 5   merch_long                           29210 non-null  float64
 6   num_neighborhoods                    29210 non-null  int64  
 7   distance_merch_cust                  29210 non-null  float64
 8   trans_hour                           29210 non-null  int64  
 9   num_transactions                     29210 non-null  int64  
 10  max_amount                           29210 non-null  float64
 11  min_amount                  

In [None]:
# Build the output paths with pathlib instead of f-strings containing bare
# backslashes: '\d' and '\T' are not valid escape sequences in non-raw strings.
# File names are unchanged.
output_dir = Path(DIR)

# Reduced frame (selected features plus target), as pickle and CSV.
df_final.to_pickle(output_dir / 'df_after_stage5.pkl')
df_final.to_csv(output_dir / 'Transactions_Customers_DC_2019_After_Stage5.csv', index=False)

# Full frame as it was before feature selection, as pickle and CSV.
df.to_pickle(output_dir / 'df_after_stage5_before_FeatureSelection.pkl')
df.to_csv(output_dir / 'Transactions_Customers_DC_2019_After_Stage5_befor_FeatureSelection.csv', index=False)