In [1]:
import pandas as pd
from scripts.config import RAW_DATA_PATH, PROCESSED_DATA_PATH, TARGET_COLUMN
from scripts.utils import load_data, save_data

# Load the raw dataset
df = load_data(RAW_DATA_PATH)

# Display the first 5 rows of the dataset
print("First 5 rows of the raw dataset:")
print(df.head())

# Display basic information about the dataset
print("\nDataset Info:")
print(df.info())

# Display the shape of the dataset
print("\nDataset Shape:")
print(df.shape)

# Display class distribution
print("\nClass Distribution:")
print(df[TARGET_COLUMN].value_counts())

2025-02-22 20:15:58,683 - INFO - Loading data from C:\project\-Credit-Card-Fraud-Detection\data\raw\creditcard.csv


First 5 rows of the raw dataset:
   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   


Observations from Step 1:
First 5 Rows:

The dataset contains 31 columns, including Time, V1 to V28 (PCA-transformed features), Amount, and Class (target variable).

Dataset Info:

The dataset has 284,807 rows and 31 columns.

All columns are of type float64 except for Class, which is int64.

There are no missing values in the dataset.

Class Distribution:

Non-Fraudulent Transactions (Class 0): 284,315

Fraudulent Transactions (Class 1): 492

The dataset is highly imbalanced, with only 0.17% of transactions being fraudulent.



In [2]:
from imblearn.over_sampling import SMOTE
from scripts.config import TARGET_COLUMN, RANDOM_STATE

# Separate features and target
X = df.drop(columns=[TARGET_COLUMN])
y = df[TARGET_COLUMN]

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=RANDOM_STATE)
X_res, y_res = smote.fit_resample(X, y)

# Check the new class distribution
print("Class Distribution After SMOTE:")
print(y_res.value_counts())

Class Distribution After SMOTE:
Class
0    284315
1    284315
Name: count, dtype: int64


Class Distribution After SMOTE:

Both classes now have 284,315 samples each.

The dataset is perfectly balanced, which will help the model learn to detect fraudulent transactions more effectively.

Key Insights:

SMOTE has successfully generated synthetic samples for the minority class (fraudulent transactions).

The balanced dataset will improve the model’s ability to generalize and detect fraud.



In [3]:
from sklearn.model_selection import train_test_split
from scripts.config import TEST_SIZE, RANDOM_STATE

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=TEST_SIZE, random_state=RANDOM_STATE)

# Display the shapes of the resulting datasets
print("Training Set Shape:")
print(X_train.shape, y_train.shape)

print("\nTesting Set Shape:")
print(X_test.shape, y_test.shape)

Training Set Shape:
(454904, 30) (454904,)

Testing Set Shape:
(113726, 30) (113726,)


In [5]:
import pandas as pd
from scripts.config import PROCESSED_DATA_PATH
from scripts.utils import save_data

# Combine features and target for training and testing sets
train_df = pd.DataFrame(X_train, columns=X.columns)
train_df[TARGET_COLUMN] = y_train

test_df = pd.DataFrame(X_test, columns=X.columns)
test_df[TARGET_COLUMN] = y_test

# Convert Path object to string and modify the file names
train_path = str(PROCESSED_DATA_PATH).replace(".csv", "_train.csv")
test_path = str(PROCESSED_DATA_PATH).replace(".csv", "_test.csv")

# Save the training and testing sets
save_data(train_df, train_path)
save_data(test_df, test_path)

print(f"Training set saved to {train_path}")
print(f"Testing set saved to {test_path}")

2025-02-22 20:22:37,289 - INFO - Saving data to C:\project\-Credit-Card-Fraud-Detection\data\processed\processed_data_train.csv
2025-02-22 20:22:50,232 - INFO - Saving data to C:\project\-Credit-Card-Fraud-Detection\data\processed\processed_data_test.csv


Training set saved to C:\project\-Credit-Card-Fraud-Detection\data\processed\processed_data_train.csv
Testing set saved to C:\project\-Credit-Card-Fraud-Detection\data\processed\processed_data_test.csv
