In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import os
import kaggle
from imblearn.over_sampling import SMOTE

In [2]:
# Where the dataset lives locally: the folder name and the raw CSV inside it
folder = 'dataset'
dataset_file = os.path.join(folder, 'creditcard.csv')

# Make sure the folder is present; exist_ok keeps re-running this cell harmless
os.makedirs(folder, exist_ok=True)

In [3]:
# Only hit the Kaggle API when the raw CSV is not already on disk
if os.path.exists(dataset_file):
    print("Dataset already exists.")
else:
    # No local copy yet: download the archive into `folder` and unzip it there
    print("Dataset not found. Downloading...")
    kaggle.api.dataset_download_files('mlg-ulb/creditcardfraud', path=folder, unzip=True)

Dataset already exists.


In [4]:
# Function to load and preprocess data
def preprocess_data(data_path, save_path=None):
    """
    Loads the raw dataset and produces leakage-free train/test splits.

    The steps run in this order so that no information from the held-out
    test rows influences the training data:

    1. Read the CSV and fill any missing values with the column mean.
    2. Separate features from the "Class" target and make a stratified
       70/30 train/test split (random_state=42).
    3. Standardise "Amount" and "Time" with scalers fitted on the training
       rows only, storing the results as "scaled_amount" and "scaled_time"
       and dropping the raw columns.
    4. Oversample the minority class with SMOTE on the training rows only.
       The test rows are left exactly as they were drawn, so they keep the
       original class distribution and contain no synthetic samples.

    Parameters:
        data_path (str): Path to the raw dataset. The file must contain the
            columns "Amount", "Time" and "Class".
        save_path (str, optional): Path to save the preprocessed dataset. Default is None.
            When given, a single CSV is written with the same columns as the
            returned feature frames plus "Class". Rows are the resampled
            training rows followed by the untouched test rows; the file does
            not record where the split boundary is.

    Returns:
        X_train, X_test, y_train, y_test: Preprocessed and split data.
            X_train / y_train are SMOTE-resampled; X_test / y_test are not.

    Raises:
        FileNotFoundError: If data_path does not exist (raised by pd.read_csv).
    """

    # Load dataset
    data = pd.read_csv(data_path)

    # Check for missing values
    if data.isnull().values.any():
        print("Warning: Dataset contains missing values. Filling with mean.")
        # numeric_only=True keeps this working if a non-numeric column is
        # present; recent pandas versions raise on DataFrame.mean() otherwise.
        data = data.fillna(data.mean(numeric_only=True))

    # Split features and target
    X = data.drop("Class", axis=1)
    y = data["Class"]

    # Train-test split with stratification. This must happen BEFORE scaling
    # and oversampling, otherwise test-set statistics and synthetic samples
    # interpolated from test rows leak into training (and vice versa).
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    # Feature scaling for 'Amount' and 'Time': fit on the training rows only,
    # then apply the same fitted transform to the test rows.
    scaled_train = {}
    scaled_test = {}
    for raw_col, scaled_col in (("Amount", "scaled_amount"), ("Time", "scaled_time")):
        scaler = StandardScaler()
        scaled_train[scaled_col] = scaler.fit_transform(
            X_train[[raw_col]].to_numpy()
        ).ravel()
        scaled_test[scaled_col] = scaler.transform(
            X_test[[raw_col]].to_numpy()
        ).ravel()

    # assign() returns new frames, so the split outputs are not mutated in
    # place; the new columns are appended after the existing ones.
    X_train = X_train.assign(**scaled_train).drop(["Amount", "Time"], axis=1)
    X_test = X_test.assign(**scaled_test).drop(["Amount", "Time"], axis=1)

    # Handling imbalanced data: oversample the training rows only
    smote = SMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    # Save preprocessed dataset if save_path is provided
    if save_path:
        train_part = pd.concat([X_train, y_train], axis=1)
        test_part = pd.concat([X_test, y_test], axis=1)
        # ignore_index avoids duplicate labels: the two parts carry
        # unrelated indexes after splitting and resampling.
        preprocessed_data = pd.concat([train_part, test_part], ignore_index=True)
        preprocessed_data.to_csv(save_path, index=False)
        print(f"Preprocessed dataset saved at {save_path}")

    return X_train, X_test, y_train, y_test

In [5]:
# Example Usage
if __name__ == "__main__":
    # Input CSV and destination for the preprocessed output
    raw_data_path = "dataset/creditcard.csv"  # Update with actual dataset path
    preprocessed_data_path = "dataset/processed_data.csv"

    # Run the pipeline; a missing input file is reported instead of raised
    try:
        X_train, X_test, y_train, y_test = preprocess_data(
            raw_data_path,
            save_path=preprocessed_data_path,
        )
    except FileNotFoundError as err:
        print(err)
    else:
        print("Data preprocessing completed successfully.")

Preprocessed dataset saved at dataset/processed_data.csv
Data preprocessing completed successfully.


: 