# Prediction of Fraud in Credit Card 

In [1]:
#You may require to do this pip install
#pip install imblearn

In [2]:
import pandas as pd
import numpy as np

#for balancing our data

from imblearn.combine import SMOTETomek

# creating model libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import RobustScaler 

from sklearn import tree

# for our metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score 
from sklearn.metrics import precision_score 
from sklearn.metrics import recall_score 
from sklearn.metrics import f1_score 
from sklearn.metrics import cohen_kappa_score


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier


from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV

import warnings
import support as sp
warnings.filterwarnings("ignore")

### Dataset

In [3]:
df = pd.read_csv("../data/card_transdata.csv")
df.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.31114,1.94594,1.0,1.0,0.0,0.0,0.0
1,10.829943,0.175592,1.294219,1.0,0.0,0.0,0.0,0.0
2,5.091079,0.805153,0.427715,1.0,0.0,0.0,1.0,0.0
3,2.247564,5.600044,0.362663,1.0,1.0,0.0,1.0,0.0
4,44.190936,0.566486,2.222767,1.0,1.0,0.0,1.0,0.0


### Balancing Method - SMOTE Tomek Links 

In [4]:
# we divide our dataset in predictors and response variables

X_unb = df.drop("fraud", axis = 1)
y_unb = df["fraud"] 

In [5]:
#applying the method

smt = SMOTETomek(random_state=42)

In [6]:
#we fit our balancing method to our data

X, y = smt.fit_resample(X_unb, y_unb)

### Model Fit 

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 666)

In [8]:
y_train.describe()

count    1.459745e+06
mean     5.000702e-01
std      5.000002e-01
min      0.000000e+00
25%      0.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      1.000000e+00
Name: fraud, dtype: float64

In [9]:
y_test.describe()

count    364937.000000
mean          0.499719
std           0.500001
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max           1.000000
Name: fraud, dtype: float64

## Decision Tree Classifier

In [17]:
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)

### Selección de hiperparámetros

In [18]:
# max features

max_features = np.sqrt(len(X_train.columns))
max_features

2.6457513110645907

In [19]:
y_pred_train_dt = tree.predict(X_train)
y_pred_test_dt = tree.predict(X_test)

In [20]:
results_decission_tree1 = sp.metrics(y_test, y_pred_test_dt,y_train, y_pred_train_dt, "Decission Tree I" )
results_decission_tree1

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.999989,0.999989,0.999989,0.999989,0.999978,test,Decission Tree I
1,1.0,1.0,1.0,1.0,1.0,train,Decission Tree I


In [21]:
#Our tree depth
print(tree.tree_.max_depth)

12


In [22]:
# Grid hyperparameters evaluated
# ==============================================================================
param = {"max_depth": [2,4,6,8,10,12],
         "min_samples_split": [50, 100, 150, 200, 250],
         "max_features": [1,2,3], 
         "min_samples_leaf": [50, 100, 150, 200, 250]}

# Grid search with cross validation
# ==============================================================================
tree2 = GridSearchCV(
        estimator = DecisionTreeClassifier(), 
        param_grid = param, 
        n_jobs = -1,
        verbose = 1, 
        cv = 10, 
        return_train_score = True, 
        scoring = "neg_mean_squared_error")

In [25]:
%%time
tree2.fit(X_train, y_train)

Fitting 10 folds for each of 450 candidates, totalling 4500 fits
CPU times: user 22.9 s, sys: 17.6 s, total: 40.5 s
Wall time: 21min 6s


In [26]:
# Best hyperparameters with cross validation
# ==============================================================================
print("----------------------------------------")
print("Best hyperparameters (cv)")
print("----------------------------------------")
tree2.best_params_

----------------------------------------
Best hyperparameters (cv)
----------------------------------------


{'max_depth': 12,
 'max_features': 3,
 'min_samples_leaf': 50,
 'min_samples_split': 200}

In [27]:
# time to create our model with the best parameters
tree3 = DecisionTreeClassifier( max_depth =  12, max_features=3, min_samples_leaf= 50, min_samples_split=200)
tree3.fit(X_train,y_train)

In [28]:
# we do our predictions for the train and the test
y_pred_arbol_test= tree3.predict(X_test)
y_pred_arbol_train= tree3.predict(X_train)

In [29]:
results_decission_tree2 =  sp.metrics(y_test, y_pred_arbol_test, y_train, y_pred_arbol_train, "Decission Tree II")
results_decission_tree2

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.999644,0.999501,0.999786,0.999644,0.999288,test,Decission Tree II
1,0.999644,0.999511,0.999778,0.999645,0.999289,train,Decission Tree II


In [30]:
# we join all of our results to compare models
df_all_results = pd.concat([results_decission_tree1, results_decission_tree2], axis = 0)
df_all_results

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.999989,0.999989,0.999989,0.999989,0.999978,test,Decission Tree I
1,1.0,1.0,1.0,1.0,1.0,train,Decission Tree I
0,0.999644,0.999501,0.999786,0.999644,0.999288,test,Decission Tree II
1,0.999644,0.999511,0.999778,0.999645,0.999289,train,Decission Tree II


## Random Forest 

In [10]:
# Grid de hiperparámetros evaluados
# ==============================================================================
param = {"max_depth": [2, 4, 8, 10],
         "min_samples_split": [50, 100, 200],
         "max_features": [1,2,3], 
         "min_samples_leaf": [50, 100, 200]}


# Búsqueda por grid search con validación cruzada (cross-validarion)
# ==============================================================================
random_forest = GridSearchCV(
        estimator = RandomForestClassifier(), 
        param_grid = param, 
        n_jobs = -1,
        verbose = 1,  
        cv = 10, 
        return_train_score = True, 
        scoring = "neg_mean_squared_error")

In [11]:
%%time
random_forest.fit(X_train, y_train)

Fitting 10 folds for each of 108 candidates, totalling 1080 fits
CPU times: user 1min 27s, sys: 8.95 s, total: 1min 36s
Wall time: 12h 43min 16s


In [12]:
rf = random_forest.best_estimator_
rf

### Prediction  

In [13]:
y_pred_test_rf = rf.predict(X_test)
y_pred_train_rf = rf.predict(X_train)

In [15]:
rf_results = sp.metrics(y_test, y_pred_test_rf, y_train, y_pred_train_rf, "Random Forest")
rf_results

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.999978,1.0,0.999956,0.999978,0.999956,test,Random Forest
1,0.999971,1.0,0.999942,0.999971,0.999942,train,Random Forest


In [31]:
# we join all of our results to compare models
df_all_results = pd.concat([results_decission_tree1, results_decission_tree2, rf_results], axis = 0)
df_all_results

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.999989,0.999989,0.999989,0.999989,0.999978,test,Decission Tree I
1,1.0,1.0,1.0,1.0,1.0,train,Decission Tree I
0,0.999644,0.999501,0.999786,0.999644,0.999288,test,Decission Tree II
1,0.999644,0.999511,0.999778,0.999645,0.999289,train,Decission Tree II
0,0.999978,1.0,0.999956,0.999978,0.999956,test,Random Forest
1,0.999971,1.0,0.999942,0.999971,0.999942,train,Random Forest


In [37]:
import pickle

# saving our random forest
with open('../data/smote_random_forest.pkl', 'wb') as rf_smote:
        pickle.dump(random_forest, rf_smote)

### Value of our features 

In [42]:
predictors_value = pd.DataFrame(
                            {'predictors': X_train.columns,
                             'value': rf.feature_importances_}
                            )
print("Value of the predictors in the model")
print("-------------------------------------------")
predictors_value.sort_values(by= "value", ascending= False)

Value of the predictors in the model
-------------------------------------------


Unnamed: 0,predictors,value
2,ratio_to_median_purchase_price,0.523155
0,distance_from_home,0.174947
6,online_order,0.145243
1,distance_from_last_transaction,0.056016
5,used_pin_number,0.053311
4,used_chip,0.038456
3,repeat_retailer,0.008873


### Second Random Forest 

In [12]:
# Grid de hiperparámetros evaluados
# ==============================================================================
param = {"max_depth": [ 8, 10],
         "min_samples_split": [50, 100],
         "max_features": [2,3], 
         "min_samples_leaf": [50, 100]}


# Búsqueda por grid search con validación cruzada (cross-validarion)
# ==============================================================================
second_random_forest = GridSearchCV(
        estimator = RandomForestClassifier(), 
        param_grid = param, 
        n_jobs = -1,
        verbose = 1,  
        cv = 10, 
        return_train_score = True, 
        scoring = "neg_mean_squared_error")

In [13]:
%%time
second_random_forest.fit(X_train, y_train)

Fitting 10 folds for each of 16 candidates, totalling 160 fits
CPU times: user 2min 16s, sys: 4.05 s, total: 2min 20s
Wall time: 8h 40min 51s


In [14]:
second_rf = second_random_forest.best_estimator_
second_rf

#### Prediction 

In [16]:
y_pred_test_rf = second_rf.predict(X_test)
y_pred_train_rf = second_rf.predict(X_train)

In [17]:
rf2_results = sp.metrics(y_test, y_pred_test_rf, y_train, y_pred_train_rf, "Random Forest")
rf2_results

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.999978,1.0,0.999956,0.999978,0.999956,test,Random Forest
1,0.999968,1.0,0.999937,0.999968,0.999937,train,Random Forest


In [19]:
predictors_value = pd.DataFrame(
                            {'predictors': X_train.columns,
                             'value': second_rf.feature_importances_}
                            )
print("Value of the predictors in the model")
print("-------------------------------------------")
predictors_value.sort_values(by= "value", ascending= False)

Value of the predictors in the model
-------------------------------------------


Unnamed: 0,predictors,value
2,ratio_to_median_purchase_price,0.528423
0,distance_from_home,0.191968
6,online_order,0.134403
1,distance_from_last_transaction,0.066517
5,used_pin_number,0.039228
4,used_chip,0.030868
3,repeat_retailer,0.008593
