In [1]:
%pip install tabulate

import pandas as pd

# Load the dataset
df = pd.read_csv('Base.csv')

# Basic dataset info
dataset_info = {
    "Total Instances": len(df),
    "Number of Features": len(df.columns),
    "Missing Values": df.isnull().sum().sum(),
    "Duplicate Rows": df.duplicated().sum(),
    "Class Distribution (fraud_bool)": dict(df['fraud_bool'].value_counts())
}

# Numerical features summary
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
numerical_stats = df[numerical_cols].describe().transpose()

# Categorical features summary
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
categorical_stats = pd.DataFrame({
    'Unique Values': df[categorical_cols].nunique(),
    'Most Frequent Value': df[categorical_cols].mode().iloc[0],
    'Frequency of Most Common': [df[col].value_counts().iloc[0] for col in categorical_cols]
})

# Feature correlations with target
correlations = df[numerical_cols].corrwith(df['fraud_bool']).sort_values(ascending=False)

# Generate report-ready output
report = f"""
Dataset Statistics Report
-------------------------

Basic Information:
- Total instances: {dataset_info['Total Instances']:,}
- Number of features: {dataset_info['Number of Features']}
- Missing values: {dataset_info['Missing Values']}
- Duplicate rows: {dataset_info['Duplicate Rows']}
- Fraud class distribution: 
  • Legitimate (0): {dataset_info['Class Distribution (fraud_bool)'][0]:,} 
  • Fraudulent (1): {dataset_info['Class Distribution (fraud_bool)'][1]:,}

Numerical Features Summary:
{numerical_stats.round(2).to_markdown()}

Categorical Features Summary:
{categorical_stats.to_markdown()}

Top Correlations with Fraud Class:
{correlations.head(10).to_markdown()}
"""

print(report)

# Optional: Save to file
with open("dataset_statistics_report.md", "w") as f:
    f.write(report)

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: C:\Users\DELL\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip



Dataset Statistics Report
-------------------------

Basic Information:
- Total instances: 1,000,000
- Number of features: 32
- Missing values: 0
- Duplicate rows: 0
- Fraud class distribution: 
  • Legitimate (0): 988,971 
  • Fraudulent (1): 11,029

Numerical Features Summary:
|                                  |   count |    mean |     std |     min |     25% |     50% |     75% |      max |
|:---------------------------------|--------:|--------:|--------:|--------:|--------:|--------:|--------:|---------:|
| fraud_bool                       |   1e+06 |    0.01 |    0.1  |    0    |    0    |    0    |    0    |     1    |
| income                           |   1e+06 |    0.56 |    0.29 |    0.1  |    0.3  |    0.6  |    0.8  |     0.9  |
| name_email_similarity            |   1e+06 |    0.49 |    0.29 |    0    |    0.23 |    0.49 |    0.76 |     1    |
| prev_address_months_count        |   1e+06 |   16.72 |   44.05 |   -1    |   -1    |   -1    |   12    |   383    |
| current_a

  c /= stddev[:, None]
  c /= stddev[None, :]
