In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import os
import pickle
import kaggle

In [2]:
# Directory that holds the raw dataset; created on first run if absent
folder = "dataset"
os.makedirs(folder, exist_ok=True)
# Expected location of the raw CSV inside that directory
dataset_file = os.path.join(folder, "creditcard.csv")

In [3]:
# Download from Kaggle only when the CSV is not already on disk
if os.path.exists(dataset_file):
    print("Dataset already exists.")
else:
    # No local copy yet: fetch the archive and unzip it into the dataset folder
    print("Dataset not found. Downloading...")
    kaggle.api.dataset_download_files('mlg-ulb/creditcardfraud', path=folder, unzip=True)

Dataset not found. Downloading...
Dataset URL: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud


In [4]:
# Function to load and preprocess data
def preprocess_data(data_path, test_size=0.3, val_size=0.2, random_state=42):
    """
    Loads the dataset, splits it into train/validation/test sets, scales
    'Amount' and 'Time', and oversamples the training set with SMOTE.

    The split is performed BEFORE scaling and oversampling so that no
    information from the validation/test rows leaks into training:
    the scalers are fitted on the training rows only, and SMOTE is applied
    to the training rows only. Validation and test sets therefore contain
    only real rows with the original class balance.

    The CSV must contain the columns "Amount", "Time" and "Class".

    Parameters:
        data_path (str): Path to the raw dataset.
        test_size (float): Fraction of all rows held out as the test set.
        val_size (float): Fraction of the remaining (non-test) rows held
            out as the validation set.
        random_state (int): Seed used for both splits and for SMOTE.

    Returns:
        dict: A dictionary with the keys
            "X", "Y"             - all rows, scaled, NOT oversampled
            "X_train", "y_train" - training split, scaled and SMOTE-balanced
            "X_val", "y_val"     - validation split, scaled, real rows only
            "X_test", "y_test"   - test split, scaled, real rows only
        In every feature frame "Amount" and "Time" are replaced by
        "scaled_amount" and "scaled_time" (appended as the last columns).
    """
    # Load dataset
    data = pd.read_csv(data_path)

    # Check for missing values
    # NOTE(review): the fill uses whole-dataset column means and is applied to
    # every column, including "Class"; revisit if the data ever has gaps.
    if data.isnull().sum().sum() > 0:
        print("Warning: Dataset contains missing values. Filling with mean.")
        data = data.fillna(data.mean())

    # Split features and target
    features = data.drop("Class", axis=1)
    target = data["Class"]

    # Hold out the test set first, then carve the validation set out of the
    # remaining rows; both splits are stratified on the class label.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state, stratify=target
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=val_size, random_state=random_state, stratify=y_train
    )

    # One scaler per column, fitted on training rows only, so held-out
    # statistics never influence the transformation.
    amount_scaler = StandardScaler().fit(X_train["Amount"].values.reshape(-1, 1))
    time_scaler = StandardScaler().fit(X_train["Time"].values.reshape(-1, 1))

    def _scale(frame):
        # Replace raw 'Amount'/'Time' with their train-fitted scaled versions.
        scaled = frame.drop(["Amount", "Time"], axis=1)
        scaled["scaled_amount"] = amount_scaler.transform(
            frame["Amount"].values.reshape(-1, 1)
        ).ravel()
        scaled["scaled_time"] = time_scaler.transform(
            frame["Time"].values.reshape(-1, 1)
        ).ravel()
        return scaled

    X = _scale(features)
    X_train = _scale(X_train)
    X_val = _scale(X_val)
    X_test = _scale(X_test)

    # Handle imbalanced data using SMOTE -- training split only, so that
    # synthetic samples never appear in (or derive from) held-out rows.
    smote = SMOTE(random_state=random_state)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    # Return the data splits
    return {
        "X": X,
        "Y": target,
        "X_train": X_train,
        "X_val": X_val,
        "X_test": X_test,
        "y_train": y_train,
        "y_val": y_val,
        "y_test": y_test,
    }


In [5]:

# Function to save data splits as .pkl files
def save_splits(data_splits, save_dir):
    """
    Write every entry of a splits dictionary to its own pickle file.

    Each key becomes a file named "<key>.pkl" inside save_dir; the
    directory is created first if it does not exist yet.

    Parameters:
        data_splits (dict): Mapping of split name to the object to pickle.
        save_dir (str): Directory that receives the .pkl files.
    """
    os.makedirs(save_dir, exist_ok=True)
    for split_name in data_splits:
        target_path = os.path.join(save_dir, "{}.pkl".format(split_name))
        with open(target_path, "wb") as handle:
            pickle.dump(data_splits[split_name], handle)
    print(f"Data splits saved as .pkl files in directory: {save_dir}")

In [6]:
# Function to load data splits from .pkl files
def load_splits(save_dir):
    """
    Loads the data splits from .pkl files.

    Every regular file in save_dir whose name ends with ".pkl" is
    unpickled and stored under the file name with that trailing ".pkl"
    removed (so "X_train.pkl" -> "X_train"). Only the final suffix is
    stripped, which keeps names such as "X.pkl.bak.pkl" from colliding
    with "X.pkl". Files are visited in sorted name order so the resulting
    dictionary order is reproducible. Sub-directories are ignored.

    SECURITY NOTE: pickle.load can execute arbitrary code; only point this
    at directories whose contents you produced or otherwise trust.

    Parameters:
        save_dir (str): Directory containing the .pkl files.

    Returns:
        dict: Dictionary containing the loaded data splits.

    Raises:
        FileNotFoundError: If save_dir does not exist (from os.listdir).
    """
    suffix = ".pkl"
    data_splits = {}
    # sorted() makes the load order independent of the file system.
    for file_name in sorted(os.listdir(save_dir)):
        if not file_name.endswith(suffix):
            continue
        file_path = os.path.join(save_dir, file_name)
        # Skip directories (or other non-files) that merely look like pickles.
        if not os.path.isfile(file_path):
            continue
        key = file_name[: -len(suffix)]
        with open(file_path, "rb") as f:
            data_splits[key] = pickle.load(f)
    print(f"Data splits loaded from .pkl files in directory: {save_dir}")
    return data_splits


In [7]:
# Example usage: preprocess the raw CSV, persist the splits, then reload them
if __name__ == "__main__":
    # Input CSV and the directory that receives the pickled splits
    raw_data_path = "dataset/creditcard.csv"
    save_dir = "dataset/splits_pkl"

    try:
        print("Preprocessing data...")
        data_splits = preprocess_data(raw_data_path)

        # Round-trip through disk: write the splits, then read them back
        save_splits(data_splits, save_dir)
        loaded_data_splits = load_splits(save_dir)

        # Expose the reloaded splits under short names for later cells
        X_train, X_val, X_test = (
            loaded_data_splits["X_train"],
            loaded_data_splits["X_val"],
            loaded_data_splits["X_test"],
        )
        y_train, y_val, y_test = (
            loaded_data_splits["y_train"],
            loaded_data_splits["y_val"],
            loaded_data_splits["y_test"],
        )

        print("Data preprocessing and saving/loading completed successfully.")

    except FileNotFoundError as e:
        # A missing dataset or splits directory is reported, not re-raised
        print(e)

Preprocessing data...




Data splits saved as .pkl files in directory: dataset/splits_pkl
Data splits loaded from .pkl files in directory: dataset/splits_pkl
Data preprocessing and saving/loading completed successfully.
