# Task 1: Data Analysis and Preprocessing
## Advanced Fraud Detection System

This notebook covers comprehensive data analysis and preprocessing for both e-commerce and credit card fraud detection datasets.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# Suppress every warning for the rest of the session
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib stylesheet first, then the seaborn colour cycle
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# DataFrame display limits: show all columns, cap printed rows at 100
for option_name, option_value in (('display.max_columns', None), ('display.max_rows', 100)):
    pd.set_option(option_name, option_value)

## 1. Data Loading and Initial Exploration

In [None]:
# Read the three raw CSV inputs (fraud transactions, IP-range table, credit card transactions)
print("Loading datasets...")
fraud_data, ip_data, credit_card_data = map(
    pd.read_csv,
    (
        '../data/Fraud_Data.csv',
        '../data/IpAddress_to_Country.csv',
        '../data/creditcard.csv',
    ),
)

# Report (rows, columns) for each frame, one per line
print(
    f"Fraud Data Shape: {fraud_data.shape}",
    f"IP Data Shape: {ip_data.shape}",
    f"Credit Card Data Shape: {credit_card_data.shape}",
    sep="\n",
)

In [None]:
# Display basic information about datasets
print("=== FRAUD DATA INFO ===")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
fraud_data.info()
print("\n=== FRAUD DATA HEAD ===")
print(fraud_data.head())

In [None]:
print("=== CREDIT CARD DATA INFO ===")
# DataFrame.info() prints its own report and returns None; print(...) around
# it only added a trailing "None" to the output.
credit_card_data.info()
print("\n=== CREDIT CARD DATA HEAD ===")
print(credit_card_data.head())

In [None]:
print("=== IP DATA INFO ===")
# DataFrame.info() prints its own report and returns None; print(...) around
# it only added a trailing "None" to the output.
ip_data.info()
print("\n=== IP DATA HEAD ===")
print(ip_data.head())

## 2. Handle Missing Values

In [None]:
# Check for missing values in all datasets
print("=== MISSING VALUES ANALYSIS ===")

# Per-column null counts for each raw frame
fraud_missing = fraud_data.isna().sum()
credit_missing = credit_card_data.isna().sum()
ip_missing = ip_data.isna().sum()

# Same three-line report for every dataset: heading, affected columns, grand total
for heading, null_counts in (
    ("\nFraud Data Missing Values:", fraud_missing),
    ("\nCredit Card Data Missing Values:", credit_missing),
    ("\nIP Data Missing Values:", ip_missing),
):
    print(heading)
    print(null_counts[null_counts > 0])
    print(f"Total missing values: {null_counts.sum()}")

In [None]:
# Handle missing values if any exist.
#
# Each filled column is assigned back to its frame. The previous
# `frame[col].fillna(..., inplace=True)` form is chained in-place assignment:
# it may operate on a temporary object and leave the frame unchanged
# (always the case with pandas Copy-on-Write enabled).
# NOTE: ip_data is reported in the missing-value analysis but is not imputed here.

# For fraud data - fill missing numerical values with median, categorical with mode
for col in fraud_data.columns:
    if not fraud_data[col].isnull().any():
        continue
    if pd.api.types.is_numeric_dtype(fraud_data[col]):
        fill_value = fraud_data[col].median()
    else:
        modes = fraud_data[col].mode()
        if modes.empty:
            # Column is entirely null: there is no mode to fill with.
            continue
        fill_value = modes[0]
    fraud_data[col] = fraud_data[col].fillna(fill_value)

# For credit card data - fill with median for numerical columns
for col in credit_card_data.columns:
    if credit_card_data[col].isnull().any():
        credit_card_data[col] = credit_card_data[col].fillna(credit_card_data[col].median())

print("Missing values handled successfully!")

## 3. Data Cleaning

In [None]:
# Report and remove exact duplicate rows
print("=== DUPLICATE ANALYSIS ===")
print(f"Fraud data duplicates: {fraud_data.duplicated().sum()}")
print(f"Credit card data duplicates: {credit_card_data.duplicated().sum()}")
print(f"IP data duplicates: {ip_data.duplicated().sum()}")

# Remove duplicates.
# Later cells add columns to the *_clean frames, so each one is made an
# explicit, independent copy: those assignments then never touch the raw frame
# and do not raise SettingWithCopyWarning. Original index labels are kept.
fraud_data_clean = fraud_data.drop_duplicates().copy()
credit_card_data_clean = credit_card_data.drop_duplicates().copy()
ip_data_clean = ip_data.drop_duplicates().copy()

print(f"\nAfter removing duplicates:")
print(f"Fraud data shape: {fraud_data_clean.shape}")
print(f"Credit card data shape: {credit_card_data_clean.shape}")
print(f"IP data shape: {ip_data_clean.shape}")

In [None]:
# Correct data types
print("=== DATA TYPE CORRECTION ===")

# Parse both timestamp columns of the fraud data into datetimes
for timestamp_col in ('signup_time', 'purchase_time'):
    fraud_data_clean[timestamp_col] = pd.to_datetime(fraud_data_clean[timestamp_col])

# Store IP addresses as floats
fraud_data_clean['ip_address'] = fraud_data_clean['ip_address'].astype(float)

print("Data types corrected successfully!", "\nFraud data types:", sep="\n")
print(fraud_data_clean.dtypes)

## 4. Exploratory Data Analysis (EDA)

In [None]:
# Univariate Analysis - Fraud Data
# One 2x3 figure: class balance, two numeric histograms, three categorical bar charts.
print("=== UNIVARIATE ANALYSIS - FRAUD DATA ===")

plt.figure(figsize=(15, 10))

# Target variable distribution.
# value_counts() orders classes by frequency, not by class value, so wedge
# labels are looked up from the index instead of being assumed positionally
# (a hard-coded ['Non-Fraud', 'Fraud'] is wrong whenever fraud is the majority).
plt.subplot(2, 3, 1)
fraud_counts = fraud_data_clean['class'].value_counts()
class_names = {0: 'Non-Fraud', 1: 'Fraud'}
pie_labels = [class_names.get(value, str(value)) for value in fraud_counts.index]
plt.pie(fraud_counts.values, labels=pie_labels, autopct='%1.1f%%', startangle=90)
plt.title('Fraud vs Non-Fraud Distribution')

plt.subplot(2, 3, 2)
plt.hist(fraud_data_clean['purchase_value'], bins=50, alpha=0.7, edgecolor='black')
plt.title('Purchase Value Distribution')
plt.xlabel('Purchase Value')
plt.ylabel('Frequency')

plt.subplot(2, 3, 3)
plt.hist(fraud_data_clean['age'], bins=30, alpha=0.7, edgecolor='black')
plt.title('Age Distribution')
plt.xlabel('Age')
plt.ylabel('Frequency')

plt.subplot(2, 3, 4)
source_counts = fraud_data_clean['source'].value_counts()
plt.bar(source_counts.index, source_counts.values)
plt.title('Traffic Source Distribution')
plt.xlabel('Source')
plt.ylabel('Count')

plt.subplot(2, 3, 5)
browser_counts = fraud_data_clean['browser'].value_counts()
plt.bar(browser_counts.index, browser_counts.values)
plt.title('Browser Distribution')
plt.xlabel('Browser')
plt.ylabel('Count')
plt.xticks(rotation=45)

plt.subplot(2, 3, 6)
sex_counts = fraud_data_clean['sex'].value_counts()
plt.bar(sex_counts.index, sex_counts.values)
plt.title('Gender Distribution')
plt.xlabel('Gender')
plt.ylabel('Count')

plt.tight_layout()
plt.show()

In [None]:
# Univariate Analysis - Credit Card Data
# One 2x3 figure: class balance, Amount and Time histograms, then V1-V3 histograms.
print("=== UNIVARIATE ANALYSIS - CREDIT CARD DATA ===")

plt.figure(figsize=(15, 10))

# Class balance.
# value_counts() orders classes by frequency, not by class value, so wedge
# labels are looked up from the index instead of being assumed positionally.
plt.subplot(2, 3, 1)
credit_counts = credit_card_data_clean['Class'].value_counts()
class_names = {0: 'Non-Fraud', 1: 'Fraud'}
pie_labels = [class_names.get(value, str(value)) for value in credit_counts.index]
plt.pie(credit_counts.values, labels=pie_labels, autopct='%1.1f%%', startangle=90)
plt.title('Credit Card Fraud Distribution')

# Log-scaled counts so the sparse high-amount bins remain visible
plt.subplot(2, 3, 2)
plt.hist(credit_card_data_clean['Amount'], bins=50, alpha=0.7, edgecolor='black')
plt.title('Transaction Amount Distribution')
plt.xlabel('Amount')
plt.ylabel('Frequency')
plt.yscale('log')

plt.subplot(2, 3, 3)
plt.hist(credit_card_data_clean['Time'], bins=50, alpha=0.7, edgecolor='black')
plt.title('Transaction Time Distribution')
plt.xlabel('Time (seconds)')
plt.ylabel('Frequency')

# Show some V features (subplot slots 4-6)
for i, v_col in enumerate(['V1', 'V2', 'V3'], 4):
    plt.subplot(2, 3, i)
    plt.hist(credit_card_data_clean[v_col], bins=50, alpha=0.7, edgecolor='black')
    plt.title(f'{v_col} Distribution')
    plt.xlabel(v_col)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Bivariate Analysis - Fraud Data
# Relates each fraud-data feature to the `class` target: box plots for the two
# numeric columns, per-category class shares for source/browser/sex, and a
# correlation heatmap. Six panels are drawn on a 3x3 grid, so slots 7-9 stay empty.
print("=== BIVARIATE ANALYSIS - FRAUD DATA ===")

plt.figure(figsize=(20, 15))

# Purchase value vs Fraud
# boxplot(by=...) draws into the current axes passed via ax=; the empty
# suptitle call blanks the figure-level title after the grouped box plot.
plt.subplot(3, 3, 1)
fraud_data_clean.boxplot(column='purchase_value', by='class', ax=plt.gca())
plt.title('Purchase Value by Fraud Status')
plt.suptitle('')

# Age vs Fraud
plt.subplot(3, 3, 2)
fraud_data_clean.boxplot(column='age', by='class', ax=plt.gca())
plt.title('Age by Fraud Status')
plt.suptitle('')

# Source vs Fraud
# normalize='index' turns each row (category) into class proportions summing to 1.
# The legend labels are positional - assumes crosstab columns are ordered 0, 1.
plt.subplot(3, 3, 3)
source_fraud = pd.crosstab(fraud_data_clean['source'], fraud_data_clean['class'], normalize='index')
source_fraud.plot(kind='bar', ax=plt.gca())
plt.title('Fraud Rate by Source')
plt.xticks(rotation=45)
plt.legend(['Non-Fraud', 'Fraud'])

# Browser vs Fraud
plt.subplot(3, 3, 4)
browser_fraud = pd.crosstab(fraud_data_clean['browser'], fraud_data_clean['class'], normalize='index')
browser_fraud.plot(kind='bar', ax=plt.gca())
plt.title('Fraud Rate by Browser')
plt.xticks(rotation=45)
plt.legend(['Non-Fraud', 'Fraud'])

# Gender vs Fraud
plt.subplot(3, 3, 5)
sex_fraud = pd.crosstab(fraud_data_clean['sex'], fraud_data_clean['class'], normalize='index')
sex_fraud.plot(kind='bar', ax=plt.gca())
plt.title('Fraud Rate by Gender')
plt.xticks(rotation=0)
plt.legend(['Non-Fraud', 'Fraud'])

# Correlation heatmap for numerical features
# Pairwise correlations between purchase_value, age and the class target.
plt.subplot(3, 3, 6)
numerical_cols = ['purchase_value', 'age', 'class']
correlation_matrix = fraud_data_clean[numerical_cols].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=plt.gca())
plt.title('Correlation Matrix')

plt.tight_layout()
plt.show()

In [None]:
# Bivariate Analysis - Credit Card Data
# Relates Amount, Time, V1 and V2 to the `Class` target and shows a correlation
# heatmap for a handful of columns. Five panels are drawn on a 2x3 grid, so
# slot 6 stays empty.
print("=== BIVARIATE ANALYSIS - CREDIT CARD DATA ===")

plt.figure(figsize=(15, 10))

# Amount vs Fraud
# boxplot(by=...) draws into the current axes passed via ax=; the empty
# suptitle call blanks the figure-level title after the grouped box plot.
# The y axis is log-scaled.
plt.subplot(2, 3, 1)
credit_card_data_clean.boxplot(column='Amount', by='Class', ax=plt.gca())
plt.title('Transaction Amount by Fraud Status')
plt.suptitle('')
plt.yscale('log')

# Time vs Fraud
# Side-by-side histograms of Time for Class == 0 and Class == 1 on shared bins,
# with log-scaled counts.
plt.subplot(2, 3, 2)
fraud_transactions = credit_card_data_clean[credit_card_data_clean['Class'] == 1]['Time']
normal_transactions = credit_card_data_clean[credit_card_data_clean['Class'] == 0]['Time']
plt.hist([normal_transactions, fraud_transactions], bins=50, alpha=0.7, 
         label=['Normal', 'Fraud'], color=['blue', 'red'])
plt.title('Transaction Time Distribution by Class')
plt.xlabel('Time')
plt.ylabel('Frequency')
plt.legend()
plt.yscale('log')

# V1 vs Fraud
plt.subplot(2, 3, 3)
credit_card_data_clean.boxplot(column='V1', by='Class', ax=plt.gca())
plt.title('V1 by Fraud Status')
plt.suptitle('')

# V2 vs Fraud
plt.subplot(2, 3, 4)
credit_card_data_clean.boxplot(column='V2', by='Class', ax=plt.gca())
plt.title('V2 by Fraud Status')
plt.suptitle('')

# Correlation heatmap for selected V features
# Only V1-V5, Amount and Class are included (not the full V column set).
plt.subplot(2, 3, 5)
v_features = ['V1', 'V2', 'V3', 'V4', 'V5', 'Amount', 'Class']
correlation_matrix = credit_card_data_clean[v_features].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=plt.gca())
plt.title('Correlation Matrix (Selected Features)')

plt.tight_layout()
plt.show()

## 5. Merge Datasets for Geolocation Analysis

In [None]:
# Geolocation analysis: each transaction's ip_address is looked up in the
# IP-range table to attach a country (no IP format conversion happens here).
print("=== GEOLOCATION ANALYSIS ===")

def map_ip_to_country(ip_address, ip_data):
    """Return the country whose IP range contains `ip_address`.

    Parameters
    ----------
    ip_address : numeric scalar
        IP address in the same numeric form as the range bounds.
    ip_data : pandas.DataFrame
        Range table with columns 'lower_bound_ip_address',
        'upper_bound_ip_address' and 'country'. Bounds are inclusive.

    Returns
    -------
    The 'country' value of the first row (in table order) whose range contains
    the address, or 'Unknown' when no row matches, the table is empty, or the
    address is missing.
    """
    # A missing address cannot fall inside any range.
    if pd.isna(ip_address):
        return 'Unknown'

    # Compare against all ranges in one vectorised pass instead of a
    # Python-level iterrows() loop over the whole table per lookup.
    lower_bounds = ip_data['lower_bound_ip_address'].to_numpy()
    upper_bounds = ip_data['upper_bound_ip_address'].to_numpy()
    hits = np.flatnonzero((lower_bounds <= ip_address) & (ip_address <= upper_bounds))
    if hits.size == 0:
        return 'Unknown'

    # Positional lookup keeps "first matching row wins", whatever the index labels are.
    return ip_data['country'].iloc[hits[0]]

# Sample a subset for demonstration (mapping all IPs would be computationally expensive).
# The sample size is capped at the number of available rows so that sample()
# does not raise when the cleaned frame holds fewer than 10,000 records.
sample_size = min(10000, len(fraud_data_clean))
sample_fraud_data = fraud_data_clean.sample(n=sample_size, random_state=42)

print(f"Mapping IP addresses to countries (sample of {sample_size:,} records)...")
sample_fraud_data['country'] = sample_fraud_data['ip_address'].apply(
    lambda x: map_ip_to_country(x, ip_data_clean)
)

print("\nCountry distribution in sample:")
print(sample_fraud_data['country'].value_counts().head(10))

# Analyze fraud by country.
# Rows are countries, columns are class shares within each country. Both class
# columns are forced to exist so `country_fraud[1]` works even when the sample
# happens to contain a single class.
country_fraud = pd.crosstab(
    sample_fraud_data['country'], sample_fraud_data['class'], normalize='index'
).reindex(columns=[0, 1], fill_value=0.0)
print("\nFraud rate by country (top 10):")
# NOTE: rates are not weighted by volume, so countries with very few sampled
# transactions can dominate this ranking.
fraud_rates = country_fraud[1].sort_values(ascending=False).head(10)
print(fraud_rates)

In [None]:
# Visualize geolocation analysis: three bar charts on a 2x2 grid (slot 4 unused)
geo_fig = plt.figure(figsize=(15, 10))

# Panel 1 - countries with the most sampled transactions
volume_ax = geo_fig.add_subplot(2, 2, 1)
country_counts = sample_fraud_data['country'].value_counts().head(10)
volume_ax.bar(country_counts.index, country_counts.values)
volume_ax.set_title('Top 10 Countries by Transaction Volume')
volume_ax.set_xlabel('Country')
volume_ax.set_ylabel('Number of Transactions')
volume_ax.tick_params(axis='x', labelrotation=45)

# Panel 2 - countries with the highest share of fraudulent transactions
rate_ax = geo_fig.add_subplot(2, 2, 2)
fraud_rates_plot = country_fraud[1].sort_values(ascending=False).head(10)
rate_ax.bar(fraud_rates_plot.index, fraud_rates_plot.values)
rate_ax.set_title('Top 10 Countries by Fraud Rate')
rate_ax.set_xlabel('Country')
rate_ax.set_ylabel('Fraud Rate')
rate_ax.tick_params(axis='x', labelrotation=45)

# Panel 3 - fraud distribution by country (absolute numbers)
count_ax = geo_fig.add_subplot(2, 2, 3)
country_fraud_abs = pd.crosstab(sample_fraud_data['country'], sample_fraud_data['class'])
fraud_counts_by_country = country_fraud_abs[1].sort_values(ascending=False).head(10)
count_ax.bar(fraud_counts_by_country.index, fraud_counts_by_country.values, color='red', alpha=0.7)
count_ax.set_title('Top 10 Countries by Fraud Count')
count_ax.set_xlabel('Country')
count_ax.set_ylabel('Number of Fraud Cases')
count_ax.tick_params(axis='x', labelrotation=45)

geo_fig.tight_layout()
plt.show()

## 6. Feature Engineering

In [None]:
# Feature Engineering for Fraud Data
# Adds calendar features and signup-to-purchase delay to fraud_data_clean, then
# builds per-user frequency/velocity features and merges them into
# fraud_data_engineered.
print("=== FEATURE ENGINEERING - FRAUD DATA ===")

# Time-based features (day_of_week: 0 = Monday, so 5 and 6 are the weekend)
fraud_data_clean['hour_of_day'] = fraud_data_clean['purchase_time'].dt.hour
fraud_data_clean['day_of_week'] = fraud_data_clean['purchase_time'].dt.dayofweek
fraud_data_clean['month'] = fraud_data_clean['purchase_time'].dt.month
fraud_data_clean['is_weekend'] = fraud_data_clean['day_of_week'].isin([5, 6]).astype(int)

# Time difference between signup and purchase
fraud_data_clean['time_diff_hours'] = (
    fraud_data_clean['purchase_time'] - fraud_data_clean['signup_time']
).dt.total_seconds() / 3600

# Transaction frequency and velocity features (named aggregation gives flat column names)
user_stats = fraud_data_clean.groupby('user_id').agg(
    transaction_count=('purchase_value', 'count'),
    total_spent=('purchase_value', 'sum'),
    avg_transaction=('purchase_value', 'mean'),
    std_transaction=('purchase_value', 'std'),
    first_transaction=('purchase_time', 'min'),
    last_transaction=('purchase_time', 'max'),
).reset_index()

# Calculate transaction velocity (transactions per day).
# A user with a single transaction (or several at the same instant) has a span
# of exactly 0 days, not NaN, so fillna() alone left a zero divisor and produced
# inf. Any non-positive or missing span is treated as one day instead.
span_days = (
    user_stats['last_transaction'] - user_stats['first_transaction']
).dt.total_seconds() / (24 * 3600)
user_stats['transaction_span_days'] = span_days.where(span_days > 0, 1.0)
user_stats['transactions_per_day'] = user_stats['transaction_count'] / user_stats['transaction_span_days']

# Merge back to main dataset (the merge returns a frame with a fresh 0..n-1 index)
fraud_data_engineered = fraud_data_clean.merge(
    user_stats[['user_id', 'transaction_count', 'transactions_per_day']],
    on='user_id', how='left'
)

new_features = ['hour_of_day', 'day_of_week', 'month', 'is_weekend', 'time_diff_hours',
                'transaction_count', 'transactions_per_day']
# fraud_data_clean already contains the time features at this point, so the
# original column count is derived from the engineered frame instead.
original_feature_count = fraud_data_engineered.shape[1] - len(new_features)

print(f"Original features: {original_feature_count}")
print(f"After feature engineering: {fraud_data_engineered.shape[1]}")
print("\nNew features created:")
print(new_features)

In [None]:
# Feature Engineering for Credit Card Data
print("=== FEATURE ENGINEERING - CREDIT CARD DATA ===")

elapsed_seconds = credit_card_data_clean['Time']
amount = credit_card_data_clean['Amount']

# Time-based features: fractional position within a 24-hour and a 7-day cycle
# NOTE(review): presumably Time is seconds elapsed since the first record - confirm
credit_card_data_clean['hour'] = elapsed_seconds.div(3600).mod(24)
credit_card_data_clean['day'] = elapsed_seconds.div(24 * 3600).mod(7)

# Amount-based features: log(1 + x) and square-root transforms
credit_card_data_clean['amount_log'] = np.log1p(amount)
credit_card_data_clean['amount_sqrt'] = np.sqrt(amount)

# Row-wise statistics across the columns whose name starts with an upper-case 'V'
v_columns = [name for name in credit_card_data_clean.columns if name.startswith('V')]
v_block = credit_card_data_clean[v_columns]
credit_card_data_clean['v_mean'] = v_block.mean(axis=1)
credit_card_data_clean['v_std'] = v_block.std(axis=1)
credit_card_data_clean['v_sum'] = v_block.sum(axis=1)

print(f"Original features: {len(v_columns) + 3}")
print(f"After feature engineering: {credit_card_data_clean.shape[1]}")
print("\nNew features created:")
new_credit_features = ['hour', 'day', 'amount_log', 'amount_sqrt', 'v_mean', 'v_std', 'v_sum']
print(new_credit_features)

In [None]:
# Visualize new features
# Eight panels on a 2x4 grid: top row fraud data, bottom row credit card data.
# (The grid previously reserved a third, always-empty row.)
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
(ax_hour, ax_dow, ax_delay, ax_velocity), (ax_cc_hour, ax_amount, ax_vmean, ax_vstd) = axes

# Fraud data features
hour_fraud = pd.crosstab(fraud_data_engineered['hour_of_day'], fraud_data_engineered['class'], normalize='index')
hour_fraud[1].plot(kind='bar', ax=ax_hour)
ax_hour.set_title('Fraud Rate by Hour of Day')
ax_hour.set_xlabel('Hour')
ax_hour.set_ylabel('Fraud Rate')

dow_fraud = pd.crosstab(fraud_data_engineered['day_of_week'], fraud_data_engineered['class'], normalize='index')
dow_fraud[1].plot(kind='bar', ax=ax_dow)
ax_dow.set_title('Fraud Rate by Day of Week')
ax_dow.set_xlabel('Day of Week (0=Monday)')
ax_dow.set_ylabel('Fraud Rate')

ax_delay.hist(fraud_data_engineered['time_diff_hours'], bins=50, alpha=0.7)
ax_delay.set_title('Time Difference (Signup to Purchase)')
ax_delay.set_xlabel('Hours')
ax_delay.set_ylabel('Frequency')
ax_delay.set_yscale('log')

# A histogram cannot be binned over infinite values, so non-finite velocities
# (e.g. from a zero-length transaction span) are excluded from this panel.
velocity = fraud_data_engineered['transactions_per_day'].replace([np.inf, -np.inf], np.nan).dropna()
ax_velocity.hist(velocity, bins=50, alpha=0.7)
ax_velocity.set_title('Transactions Per Day')
ax_velocity.set_xlabel('Transactions/Day')
ax_velocity.set_ylabel('Frequency')
ax_velocity.set_yscale('log')

# Credit card data features.
# `hour` is a continuous fractional value; grouping on it directly yields one
# bar per distinct timestamp. It is bucketed to whole hours (0-23) first.
hour_bucket = np.floor(credit_card_data_clean['hour']).astype(int)
credit_hour_fraud = credit_card_data_clean.groupby(hour_bucket)['Class'].mean()
credit_hour_fraud.plot(kind='bar', ax=ax_cc_hour)
ax_cc_hour.set_title('Credit Card Fraud Rate by Hour')
ax_cc_hour.set_xlabel('Hour')
ax_cc_hour.set_ylabel('Fraud Rate')

ax_amount.hist(credit_card_data_clean['amount_log'], bins=50, alpha=0.7)
ax_amount.set_title('Log-transformed Amount')
ax_amount.set_xlabel('Log(Amount + 1)')
ax_amount.set_ylabel('Frequency')

ax_vmean.hist(credit_card_data_clean['v_mean'], bins=50, alpha=0.7)
ax_vmean.set_title('Mean of V Features')
ax_vmean.set_xlabel('V Mean')
ax_vmean.set_ylabel('Frequency')

ax_vstd.hist(credit_card_data_clean['v_std'], bins=50, alpha=0.7)
ax_vstd.set_title('Standard Deviation of V Features')
ax_vstd.set_xlabel('V Std')
ax_vstd.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

## 7. Encode Categorical Features

In [None]:
# Encode categorical features
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import pandas as pd

print("=== CATEGORICAL ENCODING ===")

# For fraud data
fraud_data_encoded = fraud_data_engineered.copy()

# Label encoding: one integer code per category value.
# source/browser/sex are nominal, so the integer codes carry no ranking; the
# one-hot frame below avoids implying an order.
# One fitted encoder is kept per column in `label_encoders` so the mapping can
# be inspected, inverted or re-applied later; refitting a single shared encoder
# discarded every mapping except the last. `label_encoder` still ends up bound
# to the encoder of the last column, as before.
categorical_cols = ['source', 'browser', 'sex']
label_encoders = {}

for col in categorical_cols:
    label_encoder = LabelEncoder()
    fraud_data_encoded[f'{col}_encoded'] = label_encoder.fit_transform(fraud_data_encoded[col])
    label_encoders[col] = label_encoder

print("Categorical features encoded for fraud data:")
for col in categorical_cols:
    # Both unique() calls list values in order of first appearance, so the two
    # printed arrays line up position by position.
    print(f"{col}: {fraud_data_encoded[col].unique()} -> {fraud_data_encoded[f'{col}_encoded'].unique()}")

# One-hot encoding for nominal categories (alternative approach)
fraud_data_onehot = pd.get_dummies(fraud_data_engineered,
                                  columns=['source', 'browser', 'sex'],
                                  prefix=['source', 'browser', 'sex'])

print(f"\nOriginal shape: {fraud_data_engineered.shape}")
print(f"After label encoding: {fraud_data_encoded.shape}")
print(f"After one-hot encoding: {fraud_data_onehot.shape}")

## 8. Normalization and Scaling

In [None]:
# Normalization and Scaling
# Builds StandardScaler / MinMaxScaler / RobustScaler versions of the numerical
# features of both datasets, stored as {scaler name: scaled DataFrame}.
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

print("=== NORMALIZATION AND SCALING ===")


def _sanitize_for_scaling(frame):
    """Return a copy of `frame` with +/-inf and NaN entries replaced by 0.

    fillna() alone leaves infinities in place, and the scalers reject
    non-finite input, so infinities are converted to NaN first.
    """
    return frame.replace([np.inf, -np.inf], np.nan).fillna(0)


def _scale_with_each(frame, scaler_map):
    """Fit every scaler in `scaler_map` on `frame`; return {name: scaled DataFrame}.

    Column names and index labels of `frame` are preserved on each result.
    """
    return {
        scaler_name: pd.DataFrame(
            scaler.fit_transform(frame),
            columns=frame.columns,
            index=frame.index
        )
        for scaler_name, scaler in scaler_map.items()
    }


# Prepare fraud data for scaling
# Select numerical features for scaling
fraud_numerical_features = ['purchase_value', 'age', 'hour_of_day', 'day_of_week', 'month',
                           'is_weekend', 'time_diff_hours', 'transaction_count', 'transactions_per_day']

# Missing and infinite values are replaced with 0 (no rows or columns are dropped)
fraud_data_for_scaling = _sanitize_for_scaling(fraud_data_encoded[fraud_numerical_features])

# Apply different scaling methods.
# NOTE: the same scaler objects are refit on the credit card data below, so
# after this cell they hold the credit card fit, not the fraud fit.
# NOTE(review): scalers are fit on the full dataset; if a train/test split
# follows, fit on the training part only to avoid leakage.
scalers = {
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler(),
    'RobustScaler': RobustScaler()
}

fraud_scaled_data = _scale_with_each(fraud_data_for_scaling, scalers)

print("Scaling completed for fraud data")

# Prepare credit card data for scaling (v_columns comes from the credit card feature-engineering cell)
credit_numerical_features = ['Time', 'Amount'] + v_columns + ['hour', 'day', 'amount_log', 'amount_sqrt', 'v_mean', 'v_std', 'v_sum']
credit_data_for_scaling = _sanitize_for_scaling(credit_card_data_clean[credit_numerical_features])

credit_scaled_data = _scale_with_each(credit_data_for_scaling, scalers)

print("Scaling completed for credit card data")

In [None]:
# Visualize the effect of different scaling methods
# 2x4 grid: column 1 holds the unscaled feature, columns 2-4 one scaler each;
# row 1 is fraud data, row 2 is credit card data.
scaling_fig, scaling_axes = plt.subplots(2, 4, figsize=(20, 12))
fraud_axes, credit_axes = scaling_axes

# Original vs scaled distributions for fraud data
feature_to_plot = 'purchase_value'

fraud_axes[0].hist(fraud_data_for_scaling[feature_to_plot], bins=50, alpha=0.7)
fraud_axes[0].set_title(f'Original {feature_to_plot}')
fraud_axes[0].set_xlabel(feature_to_plot)
fraud_axes[0].set_ylabel('Frequency')

for panel, (scaler_name, scaled_data) in zip(fraud_axes[1:], fraud_scaled_data.items()):
    panel.hist(scaled_data[feature_to_plot], bins=50, alpha=0.7)
    panel.set_title(f'{scaler_name} - {feature_to_plot}')
    panel.set_xlabel(f'Scaled {feature_to_plot}')
    panel.set_ylabel('Frequency')

# Original vs scaled distributions for credit card data
credit_feature_to_plot = 'Amount'

# Only the unscaled Amount panel uses log-scaled counts
credit_axes[0].hist(credit_data_for_scaling[credit_feature_to_plot], bins=50, alpha=0.7)
credit_axes[0].set_title(f'Original {credit_feature_to_plot}')
credit_axes[0].set_xlabel(credit_feature_to_plot)
credit_axes[0].set_ylabel('Frequency')
credit_axes[0].set_yscale('log')

for panel, (scaler_name, scaled_data) in zip(credit_axes[1:], credit_scaled_data.items()):
    panel.hist(scaled_data[credit_feature_to_plot], bins=50, alpha=0.7)
    panel.set_title(f'{scaler_name} - {credit_feature_to_plot}')
    panel.set_xlabel(f'Scaled {credit_feature_to_plot}')
    panel.set_ylabel('Frequency')

scaling_fig.tight_layout()
plt.show()

## 9. Final Data Preparation and Export

In [None]:
# Prepare final datasets for modeling
print("=== FINAL DATA PREPARATION ===")

# Fraud data - standard-scaled numerical features plus the target and the
# label-encoded categoricals, copied over by index alignment
fraud_final = fraud_scaled_data['StandardScaler'].copy()
for passthrough_col in ('class', 'source_encoded', 'browser_encoded', 'sex_encoded'):
    fraud_final[passthrough_col] = fraud_data_encoded[passthrough_col]

# Credit card data - standard-scaled features plus the target
credit_final = credit_scaled_data['StandardScaler'].copy()
credit_final['Class'] = credit_card_data_clean['Class']

print(
    f"Final fraud data shape: {fraud_final.shape}",
    f"Final credit card data shape: {credit_final.shape}",
    sep="\n",
)

# Save processed datasets (index labels are not written)
fraud_final.to_csv('../data/fraud_data_processed.csv', index=False)
credit_final.to_csv('../data/credit_card_processed.csv', index=False)

print(
    "\nProcessed datasets saved successfully!",
    "Files saved:",
    "- ../data/fraud_data_processed.csv",
    "- ../data/credit_card_processed.csv",
    sep="\n",
)

In [None]:
# Summary statistics of final datasets
def _print_class_summary(heading, frame, target):
    """Print sample count, feature count and the fraud / non-fraud split of `frame`.

    `target` names the 0/1 label column; it is excluded from the feature count.
    """
    labels = frame[target]
    print(heading)
    print(f"Total samples: {len(frame)}")
    print(f"Features: {frame.shape[1] - 1}")
    print(f"Fraud cases: {labels.sum()} ({labels.mean()*100:.2f}%)")
    print(f"Non-fraud cases: {(labels == 0).sum()} ({(1-labels.mean())*100:.2f}%)")


print("=== FINAL DATASET SUMMARY ===")

_print_class_summary("\nFraud Data Summary:", fraud_final, 'class')
_print_class_summary("\nCredit Card Data Summary:", credit_final, 'Class')

# Closing checklist of the completed preprocessing stages
for closing_line in (
    "\n=== TASK 1 COMPLETED SUCCESSFULLY ===",
    "All data analysis and preprocessing steps have been completed:",
    "✅ Missing values handled",
    "✅ Data cleaning completed",
    "✅ Exploratory data analysis performed",
    "✅ Datasets merged for geolocation analysis",
    "✅ Feature engineering completed",
    "✅ Categorical features encoded",
    "✅ Normalization and scaling applied",
    "✅ Final datasets prepared and saved",
):
    print(closing_line)