In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# 1. Pin NumPy's global random generator so every run of the notebook
#    draws exactly the same synthetic data
np.random.seed(seed=42)

In [3]:
# 2. Size of the synthetic dataset
n_samples = 5_000   # rows: one per client
n_features = 5      # feature columns generated for each client

In [4]:
# 3. Column names for the features: 'X1', 'X2', ... (1-based, one per feature)
feature_names = ['X' + str(idx) for idx in range(1, n_features + 1)]

In [5]:
# 4. Ground-truth coefficient of each feature in the default model.
# The larger a coefficient, the more that feature shifts the outcome;
# a coefficient of 0.0 means the feature has no effect on the target.
true_weights = dict([
    ('X1', 3.0),   # strong driver
    ('X2', 1.5),   # moderate driver
    ('X3', 1.0),   # moderate driver
    ('X4', 0.0),   # irrelevant
    ('X5', 0.0),   # irrelevant
])

# Intercept added to the weighted feature sum
bias = -1.0

In [6]:
# 5. Draw the feature matrix: one row per client, one column per feature,
#    each entry sampled from a normal distribution with mean 0 and std 1
X = np.random.normal(loc=0.0, scale=1.0, size=(n_samples, n_features))


In [7]:
# 6. Linear predictor per client: weighted sum of the features plus the
#    intercept. Weights are looked up in feature_names order so that they
#    line up with the columns of X.
linear_combination = bias + X.dot(
    np.array([true_weights[name] for name in feature_names])
)

In [8]:
# 7. Pass each linear predictor through the logistic sigmoid to turn it
#    into a default probability between 0 and 1
prob_default = 1.0 / (np.exp(-linear_combination) + 1.0)

In [9]:
# 8. Sample the binary target (1 = default, 0 = no default): a client is
#    labelled as defaulting when its uniform [0, 1) draw falls below its
#    default probability
random_probs = np.random.rand(n_samples)
y = np.less(random_probs, prob_default).astype(int)

In [10]:
# 9. Assemble the final table: the feature columns followed by the
#    binary 'default' target column
credit_data = pd.DataFrame(X, columns=feature_names).assign(default=y)

In [15]:
# Show the assembled DataFrame (feature columns plus the 'default' target)
# as this cell's output
credit_data

Unnamed: 0,X1,X2,X3,X4,X5,default
0,0.496714,-0.138264,0.647689,1.523030,-0.234153,1
1,-0.234137,1.579213,0.767435,-0.469474,0.542560,0
2,-0.463418,-0.465730,0.241962,-1.913280,-1.724918,0
3,-0.562288,-1.012831,0.314247,-0.908024,-1.412304,0
4,1.465649,-0.225776,0.067528,-1.424748,-0.544383,1
...,...,...,...,...,...,...
4995,0.306033,-0.936234,0.650605,-0.306595,0.663910,0
4996,0.585984,-0.243012,1.710659,-0.665336,1.348870,1
4997,1.174933,1.770882,0.174056,-0.788586,2.076797,1
4998,-0.334701,-0.813543,-0.189787,-0.981691,0.526149,0


In [14]:
# Persist the synthetic dataset to CSV. index=False leaves out the
# auto-generated 0..n-1 row index, which would otherwise be written as an
# unnamed first column and come back as a spurious "Unnamed: 0" column
# when the file is loaded with pd.read_csv.
credit_data.to_csv('synth_dataset.csv', index=False)