In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Plot styling: reset to matplotlib's default style, then make the figure,
# axes and saved-file backgrounds transparent ('none' = no fill colour).
plt.style.use('default')
plt.rcParams.update({
    'figure.facecolor': 'none',
    'axes.facecolor': 'none',
    'savefig.facecolor': 'none',
})

# Input file and the column labels attached to it. The CSV is read with
# header=None, so these names are applied positionally to its 25 columns.
file = "data_sample.csv"
varnames = [
    "Cash",
    "Inventories",
    "Current Assets",
    "Tangible Assets",
    "Intangible Assets",
    "Total Assets",
    "Accounts Receivable",
    "Lands and Buildings",
    "Equity",
    "Accrual for Pension Liabilities",
    "Total Current Liabilities",
    "Total Longterm Liabilities",
    "Bank Debt",
    "Accounts Payable",
    "Sales",
    "Amortization Depreciation",
    "Interest Expenses",
    "EBIT",
    "Operating Income",
    "Net Income",
    "Increase Inventories",
    "Increase Liabilities",
    "Increase Cash",
    "Number Employees",
    "Solvency",
]

# Load -> coerce -> filter, as one pipeline:
#   1. read the raw table with the labels above,
#   2. convert every column to a numeric dtype (unparseable cells become NaN),
#   3. keep only the rows that have no NaN in any column.
credit = (
    pd.read_csv(file, header=None, names=varnames)
    .apply(pd.to_numeric, errors='coerce')
    .dropna()
)

In [9]:
# The target is a class label, so store it with a categorical dtype.
credit['Solvency'] = credit['Solvency'].astype('category')

# Hold out 30% of the rows for testing. Stratifying on the target keeps the
# class proportions aligned between the two parts, and the fixed seed makes
# the partition repeatable.
train_df, test_df = train_test_split(
    credit,
    test_size=0.3,
    random_state=42,
    stratify=credit['Solvency'],
)

# Report (rows, columns) of each part.
print(f"Training set size: {train_df.shape}")
print(f"Test set size: {test_df.shape}")

Training set size: (3500, 25)
Test set size: (1500, 25)


**CART**

In [28]:
# Separate the predictors from the target in both partitions.
X_train, y_train = train_df.drop(columns='Solvency'), train_df['Solvency']
X_test, y_test = test_df.drop(columns='Solvency'), test_df['Solvency']

# Single classification tree, depth capped at 100 levels, fixed seed.
tree_model = DecisionTreeClassifier(max_depth=100, random_state=42)
tree_model.fit(X_train, y_train)

# Hold-out scores: column 1 of predict_proba, i.e. the probability assigned
# to the second entry of the model's class list.
fit_tree = tree_model.predict_proba(X_test)[:, 1]

# ROC points for the tree and the area under them.
fpr_tree, tpr_tree, _ = roc_curve(y_test, fit_tree)
roc_auc_tree = auc(fpr_tree, tpr_tree)

# Draw the curve on a 6x6 inch canvas, save it with a transparent background,
# and close the figure.
fig_tree, ax_tree = plt.subplots(figsize=(6, 6))
ax_tree.plot(fpr_tree, tpr_tree, label=f'(AUC = {roc_auc_tree:.3f})')
ax_tree.set_xlabel('False Positive Rate')
ax_tree.set_ylabel('True Positive Rate')
ax_tree.legend(loc='lower right')
ax_tree.grid(False)
fig_tree.savefig('ROC_CART.png', transparent=True)
plt.close(fig_tree)

print(f"CART AUC: {roc_auc_tree:.3f}")

CART AUC: 0.907


**Random Forest**

In [29]:
# Second model: a random forest with library-default settings and a fixed
# seed, trained on the same X_train / y_train as the tree.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Hold-out scores: column 1 of predict_proba.
fit_rf = rf_model.predict_proba(X_test)[:, 1]

# ROC points for the forest and the area under them.
fpr_rf, tpr_rf, _ = roc_curve(y_test, fit_rf)
roc_auc_rf = auc(fpr_rf, tpr_rf)

# Draw the curve on a 6x6 inch canvas, save it with a transparent background,
# and close the figure.
fig_rf, ax_rf = plt.subplots(figsize=(6, 6))
ax_rf.plot(fpr_rf, tpr_rf, label=f'(AUC = {roc_auc_rf:.3f})')
ax_rf.set_xlabel('False Positive Rate')
ax_rf.set_ylabel('True Positive Rate')
ax_rf.legend(loc='lower right')
ax_rf.grid(False)
fig_rf.savefig('ROC_RF.png', transparent=True)
plt.close(fig_rf)

print(f"Random Forest AUC: {roc_auc_rf:.3f}")

Random Forest AUC: 0.986


In [None]:
# Resampling function (equivalent to AUC function in R)
def calculate_aucs(i, data=None, target='Solvency', test_size=0.3):
    """Fit a decision tree and a random forest on one resample and score both.

    The frame is split once (stratified on ``target``), both models are fitted
    on the training part, and each is scored on the held-out part with ROC AUC
    computed from column 1 of ``predict_proba``.

    Parameters
    ----------
    i : int
        Seed for this resample. It is passed as ``random_state`` to the split
        and to both estimators, so one value of ``i`` fully determines the
        result.
    data : pandas.DataFrame, optional
        Frame containing the predictors and the ``target`` column. When
        omitted, the notebook-level ``credit`` frame is used, which keeps
        ``calculate_aucs(i)`` working exactly as before.
    target : str, default 'Solvency'
        Name of the label column in ``data``.
    test_size : float, default 0.3
        Share of rows held out for scoring.

    Returns
    -------
    tuple of (float, float)
        ``(auc_tree, auc_rf)`` for the decision tree and the random forest.
    """
    if data is None:
        data = credit

    # No global np.random.seed(i) here: every stochastic step below receives
    # random_state=i directly, so reseeding the process-wide RNG added nothing
    # except a hidden side effect on other cells.
    train_part, test_part = train_test_split(
        data, test_size=test_size, random_state=i, stratify=data[target]
    )
    X_fit, y_fit = train_part.drop(columns=target), train_part[target]
    X_eval, y_eval = test_part.drop(columns=target), test_part[target]

    def _holdout_auc(model):
        """Fit ``model`` on the training part and return its hold-out ROC AUC."""
        model.fit(X_fit, y_fit)
        return roc_auc_score(y_eval, model.predict_proba(X_eval)[:, 1])

    auc_tree = _holdout_auc(DecisionTreeClassifier(random_state=i))
    auc_rf = _holdout_auc(RandomForestClassifier(random_state=i))

    return auc_tree, auc_rf

# Number of independent train/test resamples. Defined once so the loop bound
# and every statistic below stay in agreement when it is changed.
N_RESAMPLES = 200

# One (CART AUC, RF AUC) pair per seed 0 .. N_RESAMPLES - 1.
results = [calculate_aucs(seed) for seed in range(N_RESAMPLES)]

# Convert to DataFrame: one row per resample, one column per model.
results_df = pd.DataFrame(results, columns=['CART_AUC', 'RF_AUC'])

# Share of resamples in which the forest beat the tree. Taking the mean of the
# boolean comparison divides by the actual number of rows instead of a
# separately hard-coded count.
rf_win_pct = (results_df['RF_AUC'] > results_df['CART_AUC']).mean() * 100

# Summary statistics
print("\nResampling Results Summary:")
print(f"Mean CART AUC: {results_df['CART_AUC'].mean():.3f}")
print(f"Mean RF AUC: {results_df['RF_AUC'].mean():.3f}")
print(f"CART AUC Std: {results_df['CART_AUC'].std():.3f}")
print(f"RF AUC Std: {results_df['RF_AUC'].std():.3f}")
print(f"RF outperforms CART in {rf_win_pct:.1f}% of cases")


Resampling Results Summary:
Mean CART AUC: 0.908
Mean RF AUC: 0.983
CART AUC Std: 0.009
RF AUC Std: 0.003
RF outperforms CART in 100.0% of cases


In [34]:
# Scatter of per-resample AUCs: CART on x, random forest on y, no grid.
# Both axes share one window, which is also the span of the dashed y = x
# reference line; points above that line are resamples where RF scored higher.
auc_window = (0.7, 1.0)

fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(results_df['CART_AUC'], results_df['RF_AUC'], alpha=0.6)
ax.plot(auc_window, auc_window, 'r--', alpha=0.8)
ax.set(
    xlabel='CART (AUC)',
    ylabel='Random Forest (AUC)',
    xlim=auc_window,
    ylim=auc_window,
)
ax.grid(False)

# Force a 1:1 box so the plotting area is square regardless of labels.
ax.set_box_aspect(1)

fig.tight_layout()
fig.savefig('Resampling_RF_vs_CART.png', transparent=True, dpi=300, bbox_inches='tight')
plt.close(fig)