## Credit Card Fraud Verification using ML

In [1]:
import pandas as pd
import numpy as np

In [2]:
card = pd.read_csv("creditcard.csv")

In [3]:
card.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
fraud = len(card[card["Class"]==1])
valid = len(card[card["Class"]==0])
print("Fraud:",fraud)
print("Valid:",valid)

Fraud: 492
Valid: 284315


In [5]:
credit = card.copy()
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(credit, test_size = 0.2, random_state = 42)
print("Train Set:",len(train_set))
print("Test Set:",len(test_set))

Train Set: 227845
Test Set: 56962


## Correlation

In [6]:
corr_matrix = credit.corr()
corr_matrix['Class'].sort_values(ascending = False)

Class     1.000000
V11       0.154876
V4        0.133447
V2        0.091289
V21       0.040413
V19       0.034783
V20       0.020090
V8        0.019875
V27       0.017580
V28       0.009536
Amount    0.005632
V26       0.004455
V25       0.003308
V22       0.000805
V23      -0.002685
V15      -0.004223
V13      -0.004570
V24      -0.007221
Time     -0.012323
V6       -0.043643
V5       -0.094974
V9       -0.097733
V1       -0.101347
V18      -0.111485
V7       -0.187257
V3       -0.192961
V16      -0.196539
V10      -0.216883
V12      -0.260593
V14      -0.302544
V17      -0.326481
Name: Class, dtype: float64

In [7]:
credit = train_set.drop("Class", axis = 1)
credit_labels = train_set['Class'].copy()

In [8]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy = "median")
imputer.fit(credit)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='median', verbose=0)

In [9]:
X = imputer.transform(credit)

In [10]:
credit_transform = pd.DataFrame(X, columns = credit.columns)

In [11]:
credit_transform.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
count,227845.0,227845.0,227845.0,227845.0,227845.0,227845.0,227845.0,227845.0,227845.0,227845.0,...,227845.0,227845.0,227845.0,227845.0,227845.0,227845.0,227845.0,227845.0,227845.0,227845.0
mean,94792.551673,0.000916,-0.000384,-0.00092,-0.001458,0.000997,-0.000613,0.00109,-0.000383,0.000961,...,0.000398,0.000215,-0.001215,0.000175,-0.000217,-0.000979,-0.000192,-3.2e-05,0.000257,88.479993
std,47488.471663,1.958337,1.656602,1.514545,1.415853,1.387667,1.336803,1.243493,1.188643,1.097553,...,0.774045,0.732617,0.725311,0.627103,0.605446,0.521764,0.482443,0.401821,0.329776,254.930277
min,0.0,-56.40751,-72.715728,-48.325589,-5.683171,-113.743307,-26.160506,-43.557242,-73.216718,-13.320155,...,-54.49772,-34.830382,-10.933144,-44.807735,-2.836627,-10.295397,-2.604551,-9.895244,-15.430084,0.0
25%,54161.0,-0.919918,-0.597971,-0.890786,-0.84927,-0.688802,-0.768573,-0.552156,-0.208431,-0.642386,...,-0.211969,-0.22873,-0.542809,-0.161296,-0.354887,-0.317835,-0.327476,-0.07096,-0.05298,5.6
50%,84707.0,0.017978,0.06605,0.179041,-0.020959,-0.054711,-0.274846,0.041272,0.022233,-0.050414,...,-0.062614,-0.029639,0.005491,-0.010595,0.040766,0.015101,-0.052011,0.001359,0.011366,22.0
75%,139305.0,1.315548,0.803898,1.025399,0.74163,0.611173,0.396056,0.570639,0.327504,0.59904,...,0.133017,0.18608,0.527408,0.148202,0.43979,0.350453,0.240813,0.0914,0.078464,77.1
max,172792.0,2.45493,22.057729,4.187811,16.875344,34.801666,73.301626,120.589494,20.007208,10.392889,...,39.420904,27.202839,10.50309,22.528412,4.584549,7.519589,3.517346,31.612198,33.847808,25691.16


## Creating a Pipeline

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
my_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy = "median")),
    ("std_scaler", StandardScaler()),
])

In [13]:
credit_num_tr = my_pipeline.fit_transform(credit)
credit_num_tr

array([[ 1.02255459,  0.99785119, -0.22962626, ...,  0.11248883,
        -0.14374055, -0.30788875],
       [ 0.47128275, -0.205221  , -0.37821992, ..., -0.92189789,
        -0.43984143, -0.1670264 ],
       [ 1.15338663,  0.03655821,  0.49556347, ...,  0.51372993,
         0.21235767, -0.30004345],
       ...,
       [-0.31581527, -0.07533181,  0.59962034, ..., -0.3013968 ,
        -0.59571596, -0.33162078],
       [-0.1444891 , -1.50615534,  1.42172842, ...,  1.23673372,
         1.01755287, -0.34315338],
       [-0.38770656,  0.62923844, -0.47354037, ...,  0.00310728,
         0.11623211,  0.0961834 ]])

## Model Selection

In [14]:
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
model = SGDClassifier()
#model = DecisionTreeRegressor()
#model = RandomForestRegressor()

In [15]:
model.fit(credit_num_tr, credit_labels)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [16]:
some_data = credit.loc[:5]

In [17]:
some_labels = credit_labels.loc[:5]

In [18]:
prepared_data = my_pipeline.transform(some_data)

In [19]:
model.predict(prepared_data)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [20]:
list(some_labels)

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [21]:
from sklearn.metrics import mean_squared_error
credit_predictions = model.predict(credit_num_tr)
lin_mse = mean_squared_error(credit_labels, credit_predictions)
lin_rmse = np.sqrt(lin_mse)

In [22]:
lin_mse

0.0008777897254712634

## Evaluating a model - Cross Validation

In [23]:
from sklearn.model_selection import cross_val_score
import numpy as np
scores = cross_val_score(model, credit_num_tr, credit_labels, scoring = "neg_mean_squared_error", cv = 10)
rmse = np.sqrt(-scores)

In [24]:
rmse

array([0.02649937, 0.02962719, 0.02887701, 0.02887701, 0.03245496,
       0.0303595 , 0.0303595 , 0.03312493, 0.02810744, 0.02887765])

In [25]:
def print_scores(scores):
    print("Scores:",scores)
    print("Mean:",scores.mean())
    print("Standard Deviation:",scores.std())

In [26]:
print_scores(rmse)

Scores: [0.02649937 0.02962719 0.02887701 0.02887701 0.03245496 0.0303595
 0.0303595  0.03312493 0.02810744 0.02887765]
Mean: 0.02971645731407534
Standard Deviation: 0.0018709264388965042


In [27]:
from joblib import dump, load
dump(model, 'Credit.joblib')

['Credit.joblib']

In [28]:
X_test = test_set.drop("Class", axis = 1)
Y_test = test_set["Class"].copy()
X_test_prepared = my_pipeline.transform(X_test)
final_predictions = model.predict(X_test_prepared)
final_mse = mean_squared_error(Y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [29]:
final_rmse

0.03218350879136179

In [30]:
prepared_data

array([[ 1.02255459,  0.99785119, -0.22962626, ...,  0.11248883,
        -0.14374055, -0.30788875],
       [ 0.47128275, -0.205221  , -0.37821992, ..., -0.92189789,
        -0.43984143, -0.1670264 ],
       [ 1.15338663,  0.03655821,  0.49556347, ...,  0.51372993,
         0.21235767, -0.30004345],
       ...,
       [ 1.44840625, -0.27478652,  0.56454903, ..., -2.15775553,
        -1.86398637, -0.34315338],
       [ 1.11727253,  0.89965197, -0.16564256, ..., -0.06914567,
        -0.02560762,  0.11289389],
       [-1.99607939, -0.21798218,  0.58004826, ...,  0.63181609,
         0.24508441, -0.3326799 ]])

In [37]:
prepared_data[10]

array([-0.25559   , -2.05677659,  1.14557296, -0.28316543, -0.01985595,
       -0.61740281, -0.3589117 , -0.35120626,  1.10558556,  0.48752418,
        1.12546684,  0.46228245,  1.59458794,  0.78111182,  0.1956128 ,
       -1.16039419,  0.16260039,  0.00843909, -0.06619441,  0.87508112,
       -0.01642973, -0.65642505, -0.31593945,  0.39952418,  0.1100283 ,
        0.90417637,  0.50892484,  0.71409007, -0.97924   , -0.24602857])