#**Fraud Detection**

In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.ensemble import IsolationForest
from imblearn.over_sampling import SMOTE

Loading the Dataset

In [2]:
creditcard = pd.read_csv('/content/creditcard.csv')

Basic Exploration

Shape of the Dataset

In [3]:
print(creditcard.shape)

(5974, 31)


Information about the Dataset

In [4]:
print(creditcard.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5974 entries, 0 to 5973
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    5974 non-null   int64  
 1   V1      5974 non-null   float64
 2   V2      5974 non-null   float64
 3   V3      5974 non-null   float64
 4   V4      5974 non-null   float64
 5   V5      5974 non-null   float64
 6   V6      5974 non-null   float64
 7   V7      5974 non-null   float64
 8   V8      5974 non-null   float64
 9   V9      5974 non-null   float64
 10  V10     5974 non-null   float64
 11  V11     5974 non-null   float64
 12  V12     5974 non-null   float64
 13  V13     5974 non-null   float64
 14  V14     5974 non-null   float64
 15  V15     5974 non-null   float64
 16  V16     5974 non-null   float64
 17  V17     5974 non-null   float64
 18  V18     5973 non-null   float64
 19  V19     5973 non-null   float64
 20  V20     5973 non-null   float64
 21  V21     5973 non-null   float64
 22  

Descriptive Statistics

In [5]:
print(creditcard.describe())

              Time           V1           V2           V3           V4  \
count  5974.000000  5974.000000  5974.000000  5974.000000  5974.000000   
mean   2677.615501    -0.266159     0.285505     0.844231     0.104200   
std    1765.025532     1.395405     1.208867     1.031448     1.442339   
min       0.000000   -12.168192   -15.732974   -12.389545    -4.657545   
25%    1162.250000    -1.015749    -0.280054     0.295701    -0.839417   
50%    2537.000000    -0.420703     0.346083     0.882882     0.161767   
75%    3781.750000     1.115402     0.941548     1.504158     1.071412   
max    6645.000000     1.685314     7.467017     4.101716     6.013346   

                V5           V6           V7           V8           V9  ...  \
count  5974.000000  5974.000000  5974.000000  5974.000000  5974.000000  ...   
mean      0.000709     0.194948     0.018324    -0.039006     0.396916  ...   
std       1.185900     1.365525     1.059870     1.304005     1.047749  ...   
min     -32.09212

Data Types

In [6]:
print(creditcard.dtypes)

Time        int64
V1        float64
V2        float64
V3        float64
V4        float64
V5        float64
V6        float64
V7        float64
V8        float64
V9        float64
V10       float64
V11       float64
V12       float64
V13       float64
V14       float64
V15       float64
V16       float64
V17       float64
V18       float64
V19       float64
V20       float64
V21       float64
V22       float64
V23       float64
V24       float64
V25       float64
V26       float64
V27       float64
V28       float64
Amount    float64
Class     float64
dtype: object


In [7]:
print(creditcard.head())

   Time        V1        V2        V3        V4        V5        V6        V7  \
0     0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1     0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2     1 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3     1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4     2 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

Missing Values

In [8]:
print(creditcard.tail())

      Time        V1        V2        V3        V4        V5        V6  \
5969  6634 -1.611463  0.190648  0.901715  1.531254 -1.535865  0.799245   
5970  6635 -1.420272  1.449354  1.320110 -1.894320  0.913695  0.454601   
5971  6637 -1.206696  0.284728  2.152053 -2.850437 -0.437285 -0.238376   
5972  6644  1.067611  0.091006 -0.153917  0.704233  0.113894 -0.826866   
5973  6645 -0.535272 -0.132299  2.180041  1.018303 -1.498819  0.529570   

            V7        V8        V9  ...       V21       V22       V23  \
5969  1.513786  0.495829  0.200390  ...  0.211223  0.007477  1.026272   
5970  0.894179 -0.385450  2.433841  ... -0.529027 -0.368394 -0.247773   
5971 -0.333341  0.334679  2.870542  ...  0.039460  0.464476 -0.457193   
5972  0.567690 -0.464181  0.957295  ... -0.476723 -1.410090 -0.037550   
5973  0.420147  0.045445  1.543919  ...       NaN       NaN       NaN   

           V24       V25       V26       V27       V28  Amount  Class  
5969  0.057628 -0.024955 -0.368263  0.081684

In [9]:
print(creditcard.isnull())

       Time     V1     V2     V3     V4     V5     V6     V7     V8     V9  \
0     False  False  False  False  False  False  False  False  False  False   
1     False  False  False  False  False  False  False  False  False  False   
2     False  False  False  False  False  False  False  False  False  False   
3     False  False  False  False  False  False  False  False  False  False   
4     False  False  False  False  False  False  False  False  False  False   
...     ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
5969  False  False  False  False  False  False  False  False  False  False   
5970  False  False  False  False  False  False  False  False  False  False   
5971  False  False  False  False  False  False  False  False  False  False   
5972  False  False  False  False  False  False  False  False  False  False   
5973  False  False  False  False  False  False  False  False  False  False   

      ...    V21    V22    V23    V24    V25    V26    V27    V

In [10]:
print(creditcard.isnull().sum())

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       1
V19       1
V20       1
V21       1
V22       1
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64


In [11]:
print(creditcard.isnull().sum().sum())

13


In [12]:
creditcard_cleaned = creditcard.fillna(creditcard.mean())

Feature Engineering

In [13]:
creditcard_cleaned['Hour'] = creditcard_cleaned['Time'] // 3600 % 24

Data Splitting

In [53]:
X = creditcard_cleaned.drop('Class', axis=1)
y = creditcard_cleaned['Class']

In [58]:
y = y.astype(int)

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Handling Class Imbalance using SMOTE

In [60]:
print(y_train.value_counts())

Class
0    4777
1       2
Name: count, dtype: int64


In [61]:
smote = SMOTE(k_neighbors=1)

In [62]:
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [42]:
print(y_resampled.value_counts())

Class
0    4777
1    4777
Name: count, dtype: int64


Enhanced Anomaly Detection Example

In [63]:
iso_forest = IsolationForest(contamination=0.01)
creditcard_cleaned['anomaly'] = iso_forest.fit_predict(X)



Model Training with Hyperparameter Tuning

In [45]:
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid_search.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [46]:
best_model = grid_search.best_estimator_

Model Evaluation

In [64]:
y_proba = best_model.predict_proba(X_test)[:, 1]

Convert probabilities to binary predictions with a threshold of 0.5

In [65]:
y_pred = (y_proba > 0.5).astype(int)

Use binary predictions for classification report and confusion matrix

In [43]:
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1194
           1       0.00      0.00      0.00         1

    accuracy                           1.00      1195
   macro avg       0.50      0.50      0.50      1195
weighted avg       1.00      1.00      1.00      1195

[[1194    0]
 [   1    0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Use probabilities for roc_auc_score

In [44]:
print(roc_auc_score(y_test, y_proba))

0.2185929648241206


Summary


The code implements a comprehensive fraud detection system using logistic regression on a credit card transaction dataset. It begins by loading the dataset and performing basic exploration, including checking its shape, information, and missing values. After handling missing values, the code applies feature engineering to create a new Hour feature. The dataset is then split into training and testing sets, and SMOTE is used to address class imbalance in the training data. An anomaly detection technique is applied using Isolation Forest to identify unusual patterns. The model is trained with hyperparameter tuning using grid search to optimize its performance. Finally, the model is evaluated using predictions on the test set, generating a classification report, confusion matrix, and ROC AUC score to assess its effectiveness in distinguishing between legitimate and fraudulent transactions.