In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
import random
from scipy import stats

# Pin both global RNGs (stdlib `random` and NumPy's legacy global state)
# so that repeated runs of this cell generate the same data.
random.seed(42)
np.random.seed(42)

# Function to generate synthetic data
def _sample_by_target(y, categories, p_default, p_non_default):
    """
    Draw one category per row from a class-conditional distribution.

    Parameters:
    - y: 1-D array of binary labels (1 = default)
    - categories: Sequence of category labels to draw from
    - p_default: Probabilities over `categories` used where y == 1
    - p_non_default: Probabilities over `categories` used for all other rows

    Returns:
    - Object array holding one sampled label per entry of `y`
    """
    is_default = np.asarray(y) == 1
    sampled = np.empty(is_default.shape[0], dtype=object)
    # One bulk draw per class; far cheaper than one np.random.choice call per row.
    sampled[is_default] = np.random.choice(
        categories, size=int(is_default.sum()), p=p_default
    )
    sampled[~is_default] = np.random.choice(
        categories, size=int((~is_default).sum()), p=p_non_default
    )
    return sampled


def generate_credit_risk_data(n_samples=50000, default_rate=0.10):
    """
    Generate synthetic credit risk data with specified characteristics.

    The base numeric features come from sklearn's make_classification (fixed
    random_state=42); everything else is drawn from NumPy's global random
    state, so seed it with np.random.seed(...) before calling for
    reproducible output.

    Parameters:
    - n_samples: Number of data points to generate
    - default_rate: Target proportion of default cases (target=1). It is passed
      as class weights to make_classification, so the realised rate is
      approximate (make_classification also applies its default label noise).

    Returns:
    - DataFrame with features and the binary target column 'default'.
      Six base columns get domain names; the remaining make_classification
      columns keep their generic names 'feature_7' .. 'feature_10'.
    """
    # Generate base data with 10 features (8 informative, 2 redundant)
    X, y = make_classification(
        n_samples=n_samples,
        n_features=10,
        n_informative=8,  # Truly informative
        n_redundant=2,    # Correlated features
        n_classes=2,
        weights=[1 - default_rate, default_rate],
        random_state=42
    )

    # Convert to DataFrame
    feature_names = [f"feature_{i+1}" for i in range(X.shape[1])]
    df = pd.DataFrame(X, columns=feature_names)

    # Give the first six base features meaningful names (numeric predictors)
    df.rename(columns={
        'feature_1': 'income',
        'feature_2': 'debt_to_income_ratio',
        'feature_3': 'credit_score',
        'feature_4': 'loan_amount',
        'feature_5': 'interest_rate',
        'feature_6': 'age',
    }, inplace=True)

    # Create a feature highly correlated with income. This uses the raw
    # (pre-scaling) income values, so it must run before the rescaling below.
    df['total_assets'] = (
        df['income'] * np.random.normal(3, 0.2, n_samples)
        + np.random.normal(0, 0.5, n_samples)
    )

    # Transform features to more realistic ranges
    df['income'] = (df['income'] * 20000 + 50000).clip(10000, 250000)
    df['debt_to_income_ratio'] = (df['debt_to_income_ratio'] + 3) / 6  # Roughly 0-1
    df['debt_to_income_ratio'] = (df['debt_to_income_ratio'] * 0.6 + 0.1).clip(0.05, 0.8)
    df['credit_score'] = (df['credit_score'] * 150 + 650).clip(300, 850).astype(int)
    df['loan_amount'] = (df['loan_amount'] * 50000 + 100000).clip(5000, 500000)
    # Raw values can fall outside [-3, 3], so clip to keep the rate within 2%-17%
    # (otherwise out-of-range and even negative rates appear).
    df['interest_rate'] = ((df['interest_rate'] + 3) / 6 * 15 + 2).clip(2, 17)
    df['age'] = (df['age'] * 20 + 40).clip(18, 85).astype(int)

    # Highly predictive categorical variables (4): each is sampled from a
    # different distribution for defaulters vs non-defaulters.

    # employment_status
    df['employment_status'] = _sample_by_target(
        y,
        ['Employed', 'Self-Employed', 'Unemployed', 'Retired'],
        p_default=[0.3, 0.2, 0.4, 0.1],
        p_non_default=[0.7, 0.2, 0.05, 0.05],
    )

    # education_level
    df['education_level'] = _sample_by_target(
        y,
        ['High School', 'Bachelor', 'Master', 'PhD', 'Other'],
        p_default=[0.5, 0.3, 0.1, 0.05, 0.05],
        p_non_default=[0.2, 0.4, 0.3, 0.08, 0.02],
    )

    # loan_purpose
    df['loan_purpose'] = _sample_by_target(
        y,
        ['Home', 'Auto', 'Education', 'Personal', 'Business', 'Debt Consolidation'],
        p_default=[0.1, 0.15, 0.2, 0.2, 0.25, 0.1],
        p_non_default=[0.3, 0.2, 0.1, 0.1, 0.1, 0.2],
    )

    # has_previous_defaults: 70% of defaulters vs 10% of non-defaulters say 'Yes'
    df['has_previous_defaults'] = _sample_by_target(
        y,
        ['Yes', 'No'],
        p_default=[0.7, 0.3],
        p_non_default=[0.1, 0.9],
    )

    # payment_history: a categorical variable correlated with employment_status.
    # Each employment status has its own label list and matching probabilities.
    payment_history_map = {
        'Employed': np.array(['Excellent', 'Good', 'Fair', 'Poor']),
        'Self-Employed': np.array(['Good', 'Fair', 'Fair', 'Poor']),
        'Unemployed': np.array(['Fair', 'Poor', 'Poor', 'Poor']),
        'Retired': np.array(['Excellent', 'Good', 'Fair', 'Poor'])
    }
    payment_probs = {
        'Employed': [0.5, 0.3, 0.15, 0.05],
        'Self-Employed': [0.3, 0.4, 0.2, 0.1],
        'Unemployed': [0.1, 0.2, 0.3, 0.4],
        'Retired': [0.4, 0.3, 0.2, 0.1]
    }

    employment = df['employment_status'].to_numpy()
    payment_history = np.empty(n_samples, dtype=object)
    for status, labels in payment_history_map.items():
        rows = employment == status
        payment_history[rows] = np.random.choice(
            labels, size=int(rows.sum()), p=payment_probs[status]
        )
    df['payment_history'] = payment_history

    # Add remaining 40 less predictive features (mix of numeric and categorical)
    # Numeric features (30)
    for i in range(1, 31):
        feature_name = f'numeric_feature_{i}'
        # Noise plus a small class-dependent shift; the shift size is a single
        # random scalar per feature.
        if i <= 5:  # First 5 slightly more predictive than the rest
            feature_values = np.random.normal(0, 1, n_samples) + y * np.random.uniform(0.1, 0.3)
        else:  # Remaining 25 features are mostly noise
            feature_values = np.random.normal(0, 1, n_samples) + y * np.random.uniform(0, 0.1)

        # Apply different transformations to make features diverse
        if i % 4 == 0:
            # Exponential-like features (e.g., transaction amounts)
            feature_values = np.exp(feature_values * 0.5) * 100
        elif i % 4 == 1:
            # Percentage-like features (e.g., utilization rates)
            feature_values = stats.norm.cdf(feature_values) * 100
        elif i % 4 == 2:
            # Count-like features (e.g., number of inquiries)
            feature_values = np.abs(feature_values * 5).astype(int)
        # else leave as (shifted) standard normal

        df[feature_name] = feature_values

    # Categorical features (10)
    categorical_vars = [
        ('marital_status', ['Single', 'Married', 'Divorced', 'Widowed']),
        ('housing_status', ['Own', 'Mortgage', 'Rent', 'Other']),
        ('job_industry', ['Technology', 'Healthcare', 'Finance', 'Education', 'Manufacturing', 'Retail', 'Other']),
        ('state', ['CA', 'NY', 'TX', 'FL', 'IL', 'PA', 'OH', 'GA', 'Other']),
        ('credit_card_type', ['Visa', 'Mastercard', 'Amex', 'Discover', 'None']),
        ('num_dependents', [0, 1, 2, 3, 4, '5+']),
        ('months_at_current_job', ['<6', '6-12', '1-3 years', '3-5 years', '5+ years']),
        ('has_cosigner', ['Yes', 'No']),
        ('account_type', ['Checking', 'Savings', 'Both', 'None']),
        ('application_channel', ['Online', 'In-person', 'Phone', 'Mail'])
    ]

    for i, (feature_name, categories) in enumerate(categorical_vars):
        if i < 3:
            # First 3 categorical variables are slightly predictive: random,
            # class-specific category distributions.
            p_default = np.random.dirichlet(np.ones(len(categories)) * 2)
            p_non_default = np.random.dirichlet(np.ones(len(categories)) * 2)

            # Ensure some difference between the distributions by boosting
            # the defaulters' most likely category, then renormalising.
            max_idx = np.argmax(p_default)
            p_default[max_idx] += 0.1
            p_default = p_default / p_default.sum()

            cat_values = _sample_by_target(y, categories, p_default, p_non_default)
        else:
            # For the rest, uniform draws independent of the target
            cat_values = np.random.choice(categories, size=n_samples)

        df[feature_name] = cat_values

    # Add target variable
    df['default'] = y

    return df

# Generate the dataset
credit_risk_df = generate_credit_risk_data(n_samples=50000, default_rate=0.10)

# Verify the default rate
default_rate = credit_risk_df['default'].mean()
print(f"Default rate in the dataset: {default_rate:.4f}")

# Check correlation between income and total_assets (should be highly correlated)
income_assets_corr = credit_risk_df['income'].corr(credit_risk_df['total_assets'])
print(f"Correlation between income and total_assets: {income_assets_corr:.4f}")

# Check association between employment_status and payment_history (categorical)
crosstab = pd.crosstab(credit_risk_df['employment_status'], credit_risk_df['payment_history'])
print("\nCrosstab of employment_status and payment_history:")
print(crosstab)

# Show information about the dataset
n_default_cases = credit_risk_df['default'].sum()
print("\nDataset information:")
print(f"Total rows: {len(credit_risk_df)}")
print(f"Total columns: {len(credit_risk_df.columns)}")
print(f"Default cases: {n_default_cases}")
print(f"Non-default cases: {len(credit_risk_df) - n_default_cases}")

# Show a sample of the data
print("\nSample of the generated dataset:")
print(credit_risk_df.head())

# Feature importance analysis
# We'll use correlation for numeric features and chi-square for categorical
from scipy.stats import chi2_contingency

# Analyze numeric features.
# 'number' selects every numeric dtype, so integer columns are kept even on
# platforms where astype(int) produces int32 rather than int64.
numeric_features = [
    col for col in credit_risk_df.select_dtypes(include='number').columns
    if col != 'default'
]

print("\nCorrelation of numeric features with default:")
# Store absolute correlations: only the strength matters for the ranking
correlations = {
    col: abs(credit_risk_df[col].corr(credit_risk_df['default']))
    for col in numeric_features
}

sorted_numeric = sorted(correlations.items(), key=lambda x: x[1], reverse=True)
for feature, corr in sorted_numeric[:15]:  # Show top 15
    print(f"{feature}: {corr:.4f}")

# Analyze categorical features
categorical_features = credit_risk_df.select_dtypes(include=['object']).columns
print("\nChi-square statistics for categorical features:")
chi2_values = {}

for col in categorical_features:
    # Chi-square test of independence between the category and the target
    contingency = pd.crosstab(credit_risk_df[col], credit_risk_df['default'])
    chi2, p, dof, expected = chi2_contingency(contingency)
    chi2_values[col] = (chi2, p)

# Rank by the chi-square statistic, strongest association first
sorted_categorical = sorted(chi2_values.items(), key=lambda x: x[1][0], reverse=True)
for feature, (chi2, p) in sorted_categorical:
    print(f"{feature}: Chi2={chi2:.2f}, p-value={p:.6f}")

# Save to CSV (create the output directory first so to_csv cannot fail on a
# missing '../data' folder)
from pathlib import Path

output_path = Path('../data/credit_risk_dataset.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
credit_risk_df.to_csv(output_path, index=False)
print("\nDataset saved to 'credit_risk_dataset.csv'")

# Summary of highly predictive features
print("\nHighly predictive features:")
print("Numeric: income, debt_to_income_ratio, credit_score, loan_amount, interest_rate, age")
print("Correlated numeric pair: income and total_assets")
print("Categorical: employment_status, education_level, loan_purpose, has_previous_defaults")
print("Correlated categorical pair: employment_status and payment_history")

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.preprocessing import KBinsDiscretizer
import random

# Set random seed for reproducibility
np.random.seed(42)
random.seed(42)

# Single source of truth for the dataset size (used by every random draw below)
N_SAMPLES = 50000

# Generate a synthetic dataset with 10 predictive features
X, y = make_classification(
    n_samples=N_SAMPLES,
    n_features=10,  # 10 predictive features
    n_informative=8,  # 8 truly informative features
    n_redundant=2,    # 2 redundant features from the informative ones
    n_classes=2,      # Binary target
    random_state=42
)

# Create a DataFrame with the features and target
feature_names = [f'feature_{i+1}' for i in range(10)]
df = pd.DataFrame(X, columns=feature_names)
df['target'] = y

# Add 5 redundant features (each derived from existing features plus noise)
# Redundant feature 1: copy of feature_1 with noise
df['redundant_1'] = df['feature_1'] + np.random.normal(0, 0.1, size=N_SAMPLES)

# Redundant feature 2: linear combination of feature_2 and feature_3
df['redundant_2'] = 0.7 * df['feature_2'] + 0.3 * df['feature_3'] + np.random.normal(0, 0.05, size=N_SAMPLES)

# Redundant feature 3: copy of feature_4 with different scale
df['redundant_3'] = 2.5 * df['feature_4'] + np.random.normal(0, 0.1, size=N_SAMPLES)

# Redundant feature 4: transformation of feature_5
df['redundant_4'] = np.log(np.abs(df['feature_5']) + 1) + np.random.normal(0, 0.1, size=N_SAMPLES)

# Redundant feature 5: feature_6 with offset
df['redundant_5'] = df['feature_6'] + 1.5 + np.random.normal(0, 0.08, size=N_SAMPLES)

# Convert some numerical features to categorical.
# fit_transform returns an (n, 1) array, so flatten it before assigning it
# to a single column.

# Convert feature_3 to categorical (low, medium, high) using equal-frequency bins
discretizer = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile')
df['feature_3_cat'] = discretizer.fit_transform(df[['feature_3']]).ravel()
df['feature_3_cat'] = df['feature_3_cat'].map({0.0: 'low', 1.0: 'medium', 2.0: 'high'})
df = df.drop('feature_3', axis=1)

# Convert feature_7 to binary categorical
df['feature_7_cat'] = np.where(df['feature_7'] > 0, 'yes', 'no')
df = df.drop('feature_7', axis=1)

# Convert feature_10 to multi-level categorical using equal-width bins
discretizer = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='uniform')
df['feature_10_cat'] = discretizer.fit_transform(df[['feature_10']]).ravel()
df['feature_10_cat'] = df['feature_10_cat'].map({0.0: 'very_low', 1.0: 'low', 2.0: 'high', 3.0: 'very_high'})
df = df.drop('feature_10', axis=1)

# Convert redundant_5 to categorical using k-means based bins
discretizer = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='kmeans')
df['redundant_5_cat'] = discretizer.fit_transform(df[['redundant_5']]).ravel()
df['redundant_5_cat'] = df['redundant_5_cat'].map({0.0: 'group_A', 1.0: 'group_B', 2.0: 'group_C'})
df = df.drop('redundant_5', axis=1)

# Introduce missing values (each row is masked independently at random)
# 1. Missing values in a categorical variable (feature_3_cat)
mask_cat = np.random.choice([True, False], size=df.shape[0], p=[0.05, 0.95])  # 5% missing values
df.loc[mask_cat, 'feature_3_cat'] = np.nan

# 2. Missing values in numeric variable feature_1
mask_num1 = np.random.choice([True, False], size=df.shape[0], p=[0.08, 0.92])  # 8% missing values
df.loc[mask_num1, 'feature_1'] = np.nan

# 3. Missing values in numeric variable feature_5
mask_num2 = np.random.choice([True, False], size=df.shape[0], p=[0.1, 0.9])  # 10% missing values
df.loc[mask_num2, 'feature_5'] = np.nan

# Rename columns for better clarity
df = df.rename(columns={
    'feature_1': 'age',
    'feature_2': 'income',
    'feature_4': 'experience_years',
    'feature_5': 'credit_score',
    'feature_6': 'avg_spend',
    'feature_8': 'loyalty_score',
    'feature_9': 'satisfaction_rating',
    'feature_3_cat': 'education_level',
    'feature_7_cat': 'is_homeowner',
    'feature_10_cat': 'customer_segment',
    'redundant_1': 'demographic_index',
    'redundant_2': 'financial_status',
    'redundant_3': 'work_experience',
    'redundant_4': 'credit_index',
    'redundant_5_cat': 'customer_group'
})

# Reorder columns with target at the end
cols = df.columns.tolist()
cols.remove('target')
cols.append('target')
df = df[cols]

# Display information about the dataset
print("Dataset shape:", df.shape)
print("\nFeature types:")
print(df.dtypes)
print("\nMissing values per column:")
print(df.isnull().sum())
print("\nSample of the dataset (first 10 rows):")
print(df.head(10))

# Save to CSV (create the output directory first so to_csv cannot fail on a
# missing '../data' folder)
from pathlib import Path

output_path = Path('../data/synthetic_binary_classification_data.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
print("\nDataset saved to 'synthetic_binary_classification_data.csv'")

# Additional information about the dataset for interpretation
print("\nDataset Information:")
print("- Binary target with approximately 50% distribution")
print("- 15 features total: 10 predictive and 5 redundant")
print("- 11 numerical features and 4 categorical features")
print("- Missing values in 3 columns: education_level (cat), age (num), credit_score (num)")
print("\nRedundant features:")
print("- demographic_index: redundant with age")
# financial_status is built from feature_2 (income) and feature_3, which was
# binned into education_level; it does not depend on experience_years.
print("- financial_status: redundant with income and education_level")
print("- work_experience: redundant with experience_years")
print("- credit_index: redundant with credit_score")
print("- customer_group: redundant with avg_spend")