# Data Preprocessing for Fraud Detection

This notebook cleans the raw fraud dataset to prepare it for feature engineering and modeling. The steps, in the order they are executed, are: removing duplicates, imputing missing values, and correcting data types. The cleaned dataset is then saved for the next stage of the pipeline.

In [1]:
# Import necessary libraries
from pathlib import Path

import numpy as np
import pandas as pd

from src.preprocessing.clean import remove_duplicates, correct_data_types
from src.preprocessing.impute import impute_missing_values
from src.utils.logger import setup_logger

# Setup logger
# Create the notebook-wide logger by calling the project's setup_logger()
# helper with its default arguments. Later cells report their progress
# through `logger.info(...)`.
# NOTE(review): handlers and log level are configured inside
# src.utils.logger and are not visible here — confirm where messages end up.
logger = setup_logger()

In [2]:
# Load the dataset
# The path is relative, so it resolves against the kernel's current working
# directory; this cell assumes the kernel runs from the notebook's own folder
# — TODO confirm for automated runs.
data_path = '../data/raw/fraud_data.csv'
# Read with pandas defaults: no explicit dtypes, date parsing, or NA markers
# are supplied here.
df = pd.read_csv(data_path)
logger.info('Data loaded successfully.')

In [3]:
# Display initial data information
# Baseline snapshot of the freshly loaded frame (columns, dtypes, non-null
# counts). Compare it with the second df.info() call that runs after the
# cleaning steps to see what they changed.
df.info()

In [4]:
# De-duplication step: hand the current frame to the project's
# remove_duplicates helper and rebind `df` to whatever it returns, so every
# later cell works on the de-duplicated data.
# NOTE(review): which columns define a "duplicate" is decided inside
# src.preprocessing.clean — confirm there.
df = df.pipe(remove_duplicates)
logger.info('Duplicates removed.')

In [5]:
# Missing-value step: pass the frame through the project's
# impute_missing_values helper (called with its defaults) and rebind `df` to
# the result.
# NOTE(review): this runs before correct_data_types; if any numeric column is
# still typed as text at this point the imputation strategy may differ from
# what is intended — confirm the ordering against src.preprocessing.impute.
df = df.pipe(impute_missing_values)
logger.info('Missing values imputed.')

In [6]:
# Type-correction step: pass the frame through the project's
# correct_data_types helper and rebind `df` to the result.
# NOTE(review): the target dtype of each column is defined inside
# src.preprocessing.clean and is not visible here.
df = df.pipe(correct_data_types)
logger.info('Data types corrected.')

In [7]:
# Display cleaned data information
# Post-cleaning snapshot of `df`. Compare row count, dtypes, and non-null
# counts with the df.info() output printed right after loading to verify the
# effect of the three cleaning steps.
df.info()

In [8]:
# Save the cleaned dataset
# The path is relative to the kernel's current working directory.
cleaned_data_path = '../data/interim/cleaned_fraud_data.csv'
# DataFrame.to_csv does not create missing parent directories and raises
# OSError if the target folder is absent (e.g. on a fresh checkout where
# data/interim/ is not tracked). Create it first; exist_ok=True makes this a
# no-op on re-runs.
Path(cleaned_data_path).parent.mkdir(parents=True, exist_ok=True)
# index=False keeps the pandas row index out of the file, so reloading the
# CSV does not produce an extra unnamed column.
df.to_csv(cleaned_data_path, index=False)
logger.info('Cleaned data saved successfully.')