# **Perform credit card fraud detection using an in-built dataset and deploy the trained model.**

In [1]:
import pandas as pd
from sklearn.datasets import fetch_openml

# Load the credit card fraud detection dataset
creditcard_data = fetch_openml(name='creditcard', version=1, as_frame=True)
df = creditcard_data.frame

# Display the first few rows of the dataset
display(df.head())

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


## Explore the data

### Subtask:
Analyze the dataset to understand its structure, features, and target variable. Handle missing values and outliers if necessary.


In [2]:
# Display basic information about the DataFrame
df.info()

# Generate descriptive statistics for numerical columns
display(df.describe())

# Check for missing values
display(df.isnull().sum())

# Examine the distribution of the target variable 'Class'
display(df['Class'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 30 columns):
 #   Column  Non-Null Count   Dtype   
---  ------  --------------   -----   
 0   V1      284807 non-null  float64 
 1   V2      284807 non-null  float64 
 2   V3      284807 non-null  float64 
 3   V4      284807 non-null  float64 
 4   V5      284807 non-null  float64 
 5   V6      284807 non-null  float64 
 6   V7      284807 non-null  float64 
 7   V8      284807 non-null  float64 
 8   V9      284807 non-null  float64 
 9   V10     284807 non-null  float64 
 10  V11     284807 non-null  float64 
 11  V12     284807 non-null  float64 
 12  V13     284807 non-null  float64 
 13  V14     284807 non-null  float64 
 14  V15     284807 non-null  float64 
 15  V16     284807 non-null  float64 
 16  V17     284807 non-null  float64 
 17  V18     284807 non-null  float64 
 18  V19     284807 non-null  float64 
 19  V20     284807 non-null  float64 
 20  V21     284807 non-null  f

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,1.168375e-15,3.416908e-16,-1.379537e-15,2.074095e-15,9.604066e-16,1.487313e-15,-5.556467e-16,1.213481e-16,-2.406331e-15,2.239053e-15,...,6.406204e-16,1.654067e-16,-3.568593e-16,2.578648e-16,4.473266e-15,5.340915e-16,1.683437e-15,-3.660091e-16,-1.22739e-16,88.349619
std,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,1.08885,...,0.770925,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109
min,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,-24.58826,...,-54.49772,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0
25%,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,-0.5354257,...,-0.2117214,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6
50%,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,-0.09291738,...,-0.06248109,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0
75%,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,0.4539234,...,0.1330408,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165
max,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,23.74514,...,39.4209,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16


Unnamed: 0,0
V1,0
V2,0
V3,0
V4,0
V5,0
V6,0
V7,0
V8,0
V9,0
V10,0


Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,284315
1,492


## Preprocess the data

### Subtask:
Prepare the data for model training by scaling features, handling imbalanced classes, and splitting the data into training and testing sets.


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Separate features (X) and target (y)
X = df.drop('Class', axis=1)
y = df['Class']

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into training and testing sets with stratification
# Keep the test set aside before applying SMOTE
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE only to the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Display the shape of the original and resampled training data
print("Original training data shape:", X_train.shape)
print("Resampled training data shape:", X_train_resampled.shape)
print("Original training target distribution:\n", y_train.value_counts())
print("Resampled training target distribution:\n", y_train_resampled.value_counts())
print("Test target distribution:\n", y_test.value_counts())

Original training data shape: (227845, 29)
Resampled training data shape: (454902, 29)
Original training target distribution:
 Class
0    227451
1       394
Name: count, dtype: int64
Resampled training target distribution:
 Class
0    227451
1    227451
Name: count, dtype: int64
Test target distribution:
 Class
0    56864
1       98
Name: count, dtype: int64


## Train a model

### Subtask:
Choose a suitable model for fraud detection (e.g., Logistic Regression, Random Forest, or a neural network) and train it on the training data.


In [4]:
from sklearn.linear_model import LogisticRegression

# Instantiate a LogisticRegression model
model = LogisticRegression(solver='liblinear', random_state=42)

# Train the model using the resampled training data
model.fit(X_train_resampled, y_train_resampled)

## Evaluate the model

### Subtask:
Evaluate the model's performance using appropriate metrics (e.g., precision, recall, F1-score, AUC) on the testing data.


In [5]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Predict classes on the test set
y_pred = model.predict(X_test)

# Predict probabilities for the positive class on the test set
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Generate and print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Generate and print the confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Calculate and print the ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC Score: {roc_auc}")

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.99     56864
           1       0.06      0.92      0.11        98

    accuracy                           0.97     56962
   macro avg       0.53      0.95      0.55     56962
weighted avg       1.00      0.97      0.98     56962

Confusion Matrix:
[[55341  1523]
 [    8    90]]
ROC AUC Score: 0.9706652392245587


## Summary:

### Data Analysis Key Findings

*   The dataset contains 284,807 entries and 30 columns, with no missing values.
*   There is a significant class imbalance in the target variable 'Class', with 284,315 non-fraudulent transactions (Class 0) and only 492 fraudulent transactions (Class 1).
*   Feature scaling was applied to the dataset.
*   SMOTE was applied to the training data to address the class imbalance, resulting in a balanced distribution of 227,452 instances for both classes in the resampled training set.
*   A Logistic Regression model was trained on the resampled training data.
*   The model achieved a high recall of 0.92 for the fraudulent class (class 1) on the test set, indicating it identifies most fraudulent transactions.
*   The precision for the fraudulent class (class 1) is low at 0.06, suggesting a high number of false positives.
*   The confusion matrix shows 55341 true negatives, 90 true positives, 1523 false positives, and 8 false negatives.
*   The ROC AUC score is 0.9707, indicating good overall discriminative ability.
*   The trained Logistic Regression model was successfully saved to a file named `logistic_regression_model.joblib`.

### Insights or Next Steps

*   While the model has high recall for fraud detection, the low precision indicates a need to reduce false positives. Further model tuning or exploring different algorithms (e.g., tree-based models, anomaly detection) could help improve precision while maintaining high recall.
*   The saved model can now be loaded and integrated into an application or system to perform real-time or batch predictions on new credit card transactions to detect potential fraud.
