In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Generate synthetic data
def generate_synthetic_data(n_samples=1000, seed=42):
    """Build a reproducible synthetic loan-applicant dataset.

    Every column -- including the ``Creditworthiness`` target -- is sampled
    independently of the others, so the features carry no information about
    the target. A classifier trained on this data should not be expected to
    beat the majority-class rate (class 1 is drawn with probability 0.7).

    Parameters
    ----------
    n_samples : int, default 1000
        Number of rows to generate.
    seed : int or None, default 42
        Seed for the private random generator. With the defaults the
        returned frame is identical on every call. Passing ``None`` lets
        NumPy pick a fresh, non-reproducible seed.

    Returns
    -------
    pandas.DataFrame
        ``n_samples`` rows with the columns ``Income``, ``Age``,
        ``LoanAmount``, ``CreditHistory``, ``Education``, ``MaritalStatus``
        and ``Creditworthiness`` (in that order).
    """
    # A private legacy RandomState produces the same stream as calling
    # np.random.seed(seed) and then the module-level np.random.* functions,
    # but it leaves NumPy's global random state untouched for other code.
    rng = np.random.RandomState(seed)

    # The draw order below determines the values; keep it stable.
    data = {
        "Income": rng.randint(20000, 100000, n_samples),
        "Age": rng.randint(21, 70, n_samples),
        "LoanAmount": rng.randint(5000, 50000, n_samples),
        "CreditHistory": rng.choice([0, 1], size=n_samples, p=[0.2, 0.8]),
        "Education": rng.choice(["Graduate", "Not Graduate"], size=n_samples),
        "MaritalStatus": rng.choice(["Married", "Single"], size=n_samples),
        # Target is drawn independently of all features above.
        "Creditworthiness": rng.choice([0, 1], size=n_samples, p=[0.3, 0.7]),
    }
    return pd.DataFrame(data)

# Step 2: Build the synthetic dataset
data = generate_synthetic_data()

# Sanity check: show the leading rows
print("Preview of the synthetic dataset:")
print(data.head())

# Step 3: Separate the target from the predictors; one-hot encode the
# categorical predictors (drop_first avoids a redundant indicator column)
y = data['Creditworthiness']
X = pd.get_dummies(data.drop(columns='Creditworthiness'), drop_first=True)

# Step 4: Hold out 20% of the rows for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 5: Fit a Random Forest classifier on the training portion
# (fit() returns the estimator itself, so the call can be chained)
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Step 6: Predict labels for the held-out rows
y_pred = model.predict(X_test)

# Step 7: Report accuracy, per-class metrics, and the confusion matrix
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the model: {accuracy * 100:.2f}%")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Optional: rank the encoded features by the forest's importance scores
feature_importances = (
    pd.Series(model.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)
print("\nFeature Importances:")
print(feature_importances)


Preview of the synthetic dataset:
   Income  Age  LoanAmount  CreditHistory     Education MaritalStatus  \
0   35795   46       36116              0  Not Graduate        Single   
1   20860   31       38768              1      Graduate        Single   
2   96820   58       30426              1  Not Graduate       Married   
3   74886   22       22772              1      Graduate       Married   
4   26265   27       43218              1      Graduate       Married   

   Creditworthiness  
0                 1  
1                 0  
2                 1  
3                 1  
4                 1  
Accuracy of the model: 59.50%

Classification Report:
              precision    recall  f1-score   support

           0       0.13      0.07      0.09        58
           1       0.68      0.81      0.74       142

    accuracy                           0.59       200
   macro avg       0.40      0.44      0.41       200
weighted avg       0.52      0.59      0.55       200


Confusion Mat