# Prediction of Fraud in Credit Card 

In [2]:
import pandas as pd
import numpy as np

# creating model libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import RobustScaler 

from sklearn import tree

# for our metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score 
from sklearn.metrics import precision_score 
from sklearn.metrics import recall_score 
from sklearn.metrics import f1_score 
from sklearn.metrics import cohen_kappa_score


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier


from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV

import warnings
from src import support as sp
warnings.filterwarnings("ignore")

### Dataset

In [2]:
df = pd.read_csv("../data/card_transdata.csv")
df.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.31114,1.94594,1.0,1.0,0.0,0.0,0.0
1,10.829943,0.175592,1.294219,1.0,0.0,0.0,0.0,0.0
2,5.091079,0.805153,0.427715,1.0,0.0,0.0,1.0,0.0
3,2.247564,5.600044,0.362663,1.0,1.0,0.0,1.0,0.0
4,44.190936,0.566486,2.222767,1.0,1.0,0.0,1.0,0.0


### Estandarización - Sklearn RobustScaler

Nos decantamos por esta estandarización dada la cantidad de outliers que tenemos en nuestros datos. 

In [3]:
columnas_numeric = df.drop(["repeat_retailer", "used_chip","used_pin_number","online_order","fraud"], axis=1).select_dtypes(include = np.number)
columnas_numeric

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price
0,57.877857,0.311140,1.945940
1,10.829943,0.175592,1.294219
2,5.091079,0.805153,0.427715
3,2.247564,5.600044,0.362663
4,44.190936,0.566486,2.222767
...,...,...,...
999995,2.207101,0.112651,1.626798
999996,19.872726,2.683904,2.778303
999997,2.914857,1.472687,0.218075
999998,4.258729,0.242023,0.475822


In [4]:
num_col = columnas_numeric.columns

In [5]:
# we build our RobustScaler
robust = RobustScaler()

In [6]:
# we fit our model on our dataset
robust.fit(df[num_col])

In [7]:
# we transform our data
X_robust = robust.transform(df[num_col])

In [8]:
df[num_col] = X_robust
df.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,2.19108,-0.224744,0.585071,1.0,1.0,0.0,0.0,0.0
1,0.03943,-0.269055,0.182947,1.0,0.0,0.0,0.0,0.0
2,-0.223026,-0.063254,-0.351702,1.0,0.0,0.0,1.0,0.0
3,-0.353069,1.504177,-0.39184,1.0,1.0,0.0,1.0,0.0
4,1.565134,-0.141273,0.755879,1.0,1.0,0.0,1.0,0.0


### Model Fit

In [9]:
# we divide our dataset in predictors and response variables
X = df.drop("fraud", axis = 1) # predictor
y = df["fraud"] # response 

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 666)

In [42]:
X_train.describe()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order
count,800000.0,800000.0,800000.0,800000.0,800000.0,800000.0,800000.0
mean,0.761114,1.324237,0.508911,0.881576,0.35057,0.100344,0.649986
std,2.897744,8.785769,1.732657,0.323109,0.477149,0.300458,0.476974
min,-0.455634,-0.32635,-0.612895,0.0,0.0,0.0,0.0
25%,-0.278405,-0.229466,-0.322272,1.0,0.0,0.0,0.0
50%,0.000407,-5e-05,-0.000905,1.0,0.0,0.0,1.0
75%,0.723482,0.769142,0.6761,1.0,1.0,0.0,1.0
max,264.703691,3873.752027,164.623761,1.0,1.0,1.0,1.0


In [44]:
X_test.describe()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order
count,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0
mean,0.765353,1.302869,0.514077,0.881375,0.349715,0.101665,0.652815
std,3.335956,6.934319,1.706199,0.323348,0.476881,0.302208,0.476076
min,-0.454422,-0.326416,-0.61183,0.0,0.0,0.0,0.0
25%,-0.278837,-0.229503,-0.321449,1.0,0.0,0.0,0.0
50%,-0.001536,0.000274,0.003305,1.0,0.0,0.0,1.0
75%,0.71399,0.776571,0.684584,1.0,1.0,0.0,1.0
max,485.812085,692.660214,76.152506,1.0,1.0,1.0,1.0


In [11]:
y_train.describe()

count    800000.000000
mean          0.087184
std           0.282104
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: fraud, dtype: float64

In [12]:
y_test.describe()

count    200000.000000
mean          0.088280
std           0.283702
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: fraud, dtype: float64

In [18]:
def metricas(clases_reales_test, clases_predichas_test, clases_reales_train, clases_predichas_train, modelo):
    
    # para el test
    accuracy_test = accuracy_score(clases_reales_test, clases_predichas_test)
    precision_test = precision_score(clases_reales_test, clases_predichas_test)
    recall_test = recall_score(clases_reales_test, clases_predichas_test)
    f1_test = f1_score(clases_reales_test, clases_predichas_test)
    kappa_test = cohen_kappa_score(clases_reales_test, clases_predichas_test)

    # para el train
    accuracy_train = accuracy_score(clases_reales_train, clases_predichas_train)
    precision_train = precision_score(clases_reales_train, clases_predichas_train)
    recall_train = recall_score(clases_reales_train, clases_predichas_train)
    f1_train = f1_score(clases_reales_train, clases_predichas_train)
    kappa_train = cohen_kappa_score(clases_reales_train, clases_predichas_train)
    

    
    df = pd.DataFrame({"accuracy": [accuracy_test, accuracy_train], 
                       "precision": [precision_test, precision_train],
                       "recall": [recall_test, recall_train], 
                       "f1": [f1_test, f1_train],
                       "kapppa": [kappa_test, kappa_train],
                       "set": ["test", "train"]})
    
    df["modelo"] = modelo
    return df

## Decision Tree Classifier

In [13]:
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)

### Selección de hiperparámetros

In [14]:
# max features

max_features = np.sqrt(len(X_train.columns))
max_features

2.6457513110645907

In [19]:
results_decission_tree1 = metricas(y_test, y_pred_test_dt,y_train, y_pred_train_dt, "Decission Tree I" )
results_decission_tree1

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.999985,0.999887,0.999943,0.999915,0.999907,test,Decission Tree I
1,1.0,1.0,1.0,1.0,1.0,train,Decission Tree I


In [20]:
y_pred_train_dt = tree.predict(X_train)
y_pred_test_dt = tree.predict(X_test)

In [21]:
#Our tree depth
print(tree.tree_.max_depth)

7


In [22]:
seven= list(range(1,8))

In [23]:
# Grid hyperparameters evaluated
# ==============================================================================
param = {"max_depth": seven,
         "min_samples_split": [50, 100, 150, 200, 250],
         "max_features": [1,2,3], 
         "min_samples_leaf": [50, 100, 150, 200, 250]}

# Grid search with cross validation
# ==============================================================================
tree2 = GridSearchCV(
        estimator = DecisionTreeClassifier(), 
        param_grid = param, 
        n_jobs = -1,
        verbose = 1, 
        cv = 10, 
        return_train_score = True, 
        scoring = "neg_mean_squared_error")

In [24]:
%%time
tree2.fit(X_train, y_train)

Fitting 10 folds for each of 525 candidates, totalling 5250 fits
CPU times: user 25 s, sys: 30.2 s, total: 55.2 s
Wall time: 25min 18s


In [25]:
# Best hyperparameters with cross validation
# ==============================================================================
print("----------------------------------------")
print("Best hyperparameters (cv)")
print("----------------------------------------")
tree2.best_params_

----------------------------------------
Best hyperparameters (cv)
----------------------------------------


{'max_depth': 7,
 'max_features': 3,
 'min_samples_leaf': 100,
 'min_samples_split': 200}

In [26]:
# time to create our model with the best parameters

tree3 = DecisionTreeClassifier( max_depth =  7, max_features=3, min_samples_leaf= 100, min_samples_split=200)
tree3.fit(X_train,y_train)

In [27]:
# we do our predictions for the train and the test

y_pred_arbol_test= tree3.predict(X_test)
y_pred_arbol_train= tree3.predict(X_train)

In [29]:
results_decission_tree2 =  metricas(y_test, y_pred_arbol_test, y_train, y_pred_arbol_train, "Decission Tree II")
results_decission_tree2

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.999145,0.993453,0.996885,0.995166,0.994697,test,Decission Tree II
1,0.999274,0.993986,0.997706,0.995843,0.995445,train,Decission Tree II


In [30]:
# we join all of our results to compare models
df_all_results = pd.concat([results_decission_tree1, results_decission_tree2], axis = 0)
df_all_results

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.999985,0.999887,0.999943,0.999915,0.999907,test,Decission Tree I
1,1.0,1.0,1.0,1.0,1.0,train,Decission Tree I
0,0.999145,0.993453,0.996885,0.995166,0.994697,test,Decission Tree II
1,0.999274,0.993986,0.997706,0.995843,0.995445,train,Decission Tree II


## Random Forest 

In [34]:
# Grid de hiperparámetros evaluados
# ==============================================================================
param = {"max_depth": [2, 4, 8, 10],
         "min_samples_split": [50, 150, 200],
         "max_features": [1,2,3], 
         "min_samples_leaf": [50, 100, 150, 200]}


# Búsqueda por grid search con validación cruzada (cross-validarion)
# ==============================================================================
random_forest = GridSearchCV(
        estimator = RandomForestClassifier(), 
        param_grid = param, 
        n_jobs = -1,
        verbose = 1,  
        cv = 10, 
        return_train_score = True, 
        scoring = "neg_mean_squared_error")

In [35]:
%%time
random_forest.fit(X_train, y_train)

Fitting 10 folds for each of 144 candidates, totalling 1440 fits
CPU times: user 51.4 s, sys: 26.5 s, total: 1min 17s
Wall time: 11h 35min 16s


In [39]:
rf = random_forest.best_estimator_
rf

### Prediction  

In [40]:
y_pred_test_rf = rf.predict(X_test)
y_pred_train_rf = rf.predict(X_train)

In [45]:
rf_results = metricas(y_test, y_pred_test_rf, y_train, y_pred_train_rf, "Random Forest")
rf_results

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.99996,1.0,0.999547,0.999773,0.999751,test,Random Forest
1,0.999984,1.0,0.999814,0.999907,0.999898,train,Random Forest


In [46]:
# we join all of our results to compare models
df_all_results = pd.concat([results_decission_tree1, results_decission_tree2, rf_results], axis = 0)
df_all_results

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.999985,0.999887,0.999943,0.999915,0.999907,test,Decission Tree I
1,1.0,1.0,1.0,1.0,1.0,train,Decission Tree I
0,0.999145,0.993453,0.996885,0.995166,0.994697,test,Decission Tree II
1,0.999274,0.993986,0.997706,0.995843,0.995445,train,Decission Tree II
0,0.99996,1.0,0.999547,0.999773,0.999751,test,Random Forest
1,0.999984,1.0,0.999814,0.999907,0.999898,train,Random Forest
