# Prediction of Fraud in Credit Card 

In [1]:
import pandas as pd
import numpy as np

# librerías para crear el modelo
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import RobustScaler 

from sklearn import tree

# para calcular las métricas
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score 
from sklearn.metrics import precision_score 
from sklearn.metrics import recall_score 
from sklearn.metrics import f1_score 
from sklearn.metrics import cohen_kappa_score


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier


from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV

import warnings
import support as sp
warnings.filterwarnings("ignore")

## Dataset

In [3]:
df = pd.read_csv("../data/card_transdata.csv")
df.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.31114,1.94594,1.0,1.0,0.0,0.0,0.0
1,10.829943,0.175592,1.294219,1.0,0.0,0.0,0.0,0.0
2,5.091079,0.805153,0.427715,1.0,0.0,0.0,1.0,0.0
3,2.247564,5.600044,0.362663,1.0,1.0,0.0,1.0,0.0
4,44.190936,0.566486,2.222767,1.0,1.0,0.0,1.0,0.0


## Model Fit

In [4]:
# we divide our dataset in predictors and response variables
X = df.drop("fraud", axis = 1) # predictor
y = df["fraud"] # response 

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 666)

In [6]:
y_train.describe()

count    800000.000000
mean          0.087184
std           0.282104
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: fraud, dtype: float64

In [8]:
y_test.describe()

count    200000.000000
mean          0.088280
std           0.283702
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: fraud, dtype: float64

In [57]:
def metrics(clases_reales_test, clases_predichas_test, clases_reales_train, clases_predichas_train, modelo):
    
    # para el test
    accuracy_test = accuracy_score(clases_reales_test, clases_predichas_test)
    precision_test = precision_score(clases_reales_test, clases_predichas_test)
    recall_test = recall_score(clases_reales_test, clases_predichas_test)
    f1_test = f1_score(clases_reales_test, clases_predichas_test)
    kappa_test = cohen_kappa_score(clases_reales_test, clases_predichas_test)

    # para el train
    accuracy_train = accuracy_score(clases_reales_train, clases_predichas_train)
    precision_train = precision_score(clases_reales_train, clases_predichas_train)
    recall_train = recall_score(clases_reales_train, clases_predichas_train)
    f1_train = f1_score(clases_reales_train, clases_predichas_train)
    kappa_train = cohen_kappa_score(clases_reales_train, clases_predichas_train)
    

    
    df = pd.DataFrame({"accuracy": [accuracy_test, accuracy_train], 
                       "precision": [precision_test, precision_train],
                       "recall": [recall_test, recall_train], 
                       "f1": [f1_test, f1_train],
                       "kapppa": [kappa_test, kappa_train],
                       "set": ["test", "train"]})
    
    df["modelo"] = modelo
    return df

## Decision Tree Classifier

In [9]:
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)

### Selección de hiperparámetros

In [10]:
# max features

max_features = np.sqrt(len(X_train.columns))
max_features

2.6457513110645907

In [12]:
y_pred_train_dt = tree.predict(X_train)
y_pred_test_dt = tree.predict(X_test)

In [58]:
results_decission_tree1 = metrics(y_test, y_pred_test_dt,y_train, y_pred_train_dt, "Decission Tree I" )
results_decission_tree1

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.999985,0.999887,0.999943,0.999915,0.999907,test,Decission Tree I
1,1.0,1.0,1.0,1.0,1.0,train,Decission Tree I


In [17]:
#Our tree depth
print(tree.tree_.max_depth)

7


In [29]:
seven= list(range(1,8))

[1, 2, 3, 4, 5, 6, 7]

In [30]:
# Grid hyperparameters evaluated
# ==============================================================================
param = {"max_depth": seven,
         "min_samples_split": [50, 100, 150, 200, 250],
         "max_features": [1,2,3], 
         "min_samples_leaf": [50, 100, 150, 200, 250]}

# Grid search with cross validation
# ==============================================================================
tree2 = GridSearchCV(
        estimator = DecisionTreeClassifier(), 
        param_grid = param, 
        n_jobs = -1,
        verbose = 1, 
        cv = 10, 
        return_train_score = True, 
        scoring = "neg_mean_squared_error")

In [33]:
%%time
tree2.fit(X_train, y_train)

Fitting 10 folds for each of 525 candidates, totalling 5250 fits
CPU times: user 25.7 s, sys: 28 s, total: 53.7 s
Wall time: 15min 29s


In [34]:
# Best hyperparameters with cross validation
# ==============================================================================
print("----------------------------------------")
print("Best hyperparameters (cv)")
print("----------------------------------------")
tree2.best_params_

----------------------------------------
Best hyperparameters (cv)
----------------------------------------


{'max_depth': 7,
 'max_features': 3,
 'min_samples_leaf': 100,
 'min_samples_split': 250}

In [35]:
# time to create our model with the best parameters

tree3 = DecisionTreeClassifier( max_depth =  7, max_features=3, min_samples_leaf= 100, min_samples_split=250)
tree3.fit(X_train,y_train)

In [36]:
# we do our predictions for the train and the test
y_pred_arbol_test= tree3.predict(X_test)
y_pred_arbol_train= tree3.predict(X_train)

In [59]:
results_decission_tree2 =  metrics(y_test, y_pred_arbol_test, y_train, y_pred_arbol_train, "Decission Tree II")
results_decission_tree2

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.99598,0.962256,0.99343,0.977594,0.975387,test,Decission Tree II
1,0.995875,0.960817,0.99319,0.976735,0.974473,train,Decission Tree II


In [61]:
# we join all of our results to compare models
df_all_results = pd.concat([results_decission_tree1, results_decission_tree2], axis = 0)
df_all_results

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.999985,0.999887,0.999943,0.999915,0.999907,test,Decission Tree I
1,1.0,1.0,1.0,1.0,1.0,train,Decission Tree I
0,0.99598,0.962256,0.99343,0.977594,0.975387,test,Decission Tree II
1,0.995875,0.960817,0.99319,0.976735,0.974473,train,Decission Tree II


## Random Forest 

In [65]:
# Grid de hiperparámetros evaluados
# ==============================================================================
param = {"max_depth": [2, 4, 6, 8, 10],
         "min_samples_split": [50, 100, 150, 200, 250],
         "max_features": [1,2,3], 
         "min_samples_leaf": [50, 100, 150, 200, 250]}


# Búsqueda por grid search con validación cruzada (cross-validarion)
# ==============================================================================
random_forest = GridSearchCV(
        estimator = RandomForestClassifier(), 
        param_grid = param, 
        n_jobs = -1,
        verbose = 1,  
        cv = 10, 
        return_train_score = True, 
        scoring = "neg_mean_squared_error")

In [66]:
%%time
random_forest.fit(X_train, y_train)

Fitting 10 folds for each of 750 candidates, totalling 7500 fits


KeyboardInterrupt: 