## 违约率计算实现代码

实现目标：自变量——进项每月平均交易金额、进项负数发票率、进项发票作废率、进项每月平均交易次数、销项每月平均交易金额、销项负数发票率、销项发票作废率、销项每月平均交易次数、信誉评级数值；因变量——是否违约布尔值。


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score

In [None]:
# Read the merged dataset from the Excel workbook into a DataFrame.
data_path = './Data&Question/final_merged_data.xlsx'
df = pd.read_excel(data_path, engine='openpyxl')

# Nine model inputs (eight invoice statistics plus the numeric credit rating)
# and the boolean default flag used as the label.
feature_columns = [
    '进项每月平均交易金额',
    '进项负数发票率',
    '进项发票作废率',
    '进项每月平均交易次数',
    '销项每月平均交易金额',
    '销项负数发票率',
    '销项发票作废率',
    '销项每月平均交易次数',
    '信誉评级数值',
]
target_column = '是否违约布尔值'
X = df[feature_columns]
y = df[target_column]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the XGBoost classifier on the training split.
# NOTE(review): `use_label_encoder` is reported as deprecated by recent
# XGBoost releases; confirm against the installed version.
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)

# Positive-class (column 1) probabilities for the held-out rows.
test_probabilities = model.predict_proba(X_test)
y_pred = test_probabilities[:, 1]

# Ranking quality on the held-out rows.
auc_score = roc_auc_score(y_test, y_pred)
print(f"AUC Score: {auc_score:.2f}")

# Accuracy after thresholding the probabilities at 0.5.
y_pred_labels = (y_pred > 0.5).astype(int)
accuracy = accuracy_score(y_test, y_pred_labels)
print(f"Accuracy Score: {accuracy:.2f}")

# Score every row of the dataset (training and test rows alike) and store the
# positive-class probability as the predicted default rate.
default_probabilities = model.predict_proba(X)[:, 1]
df['违约率'] = default_probabilities

# Show the company identifier next to its predicted default rate.
report_columns = ['企业代号', '违约率']
print(df[report_columns])

In [None]:
# Logistic regression baseline, fitted on the same training split.
# NOTE(review): the features are passed in unscaled; logistic regression with
# its default regularisation is scale-sensitive, so standardising the inputs
# may be worth trying. Confirm against the data.
logistic_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Positive-class (column 1) probabilities for the held-out rows.
logistic_test_probabilities = logistic_model.predict_proba(X_test)
y_pred_logistic = logistic_test_probabilities[:, 1]

# Ranking quality on the held-out rows.
auc_score_logistic = roc_auc_score(y_test, y_pred_logistic)
print(f"Logistic Regression AUC Score: {auc_score_logistic:.2f}")

# Accuracy after thresholding the probabilities at 0.5.
y_pred_logistic_labels = (y_pred_logistic > 0.5).astype(int)
accuracy_logistic = accuracy_score(y_test, y_pred_logistic_labels)
print(f"Logistic Regression Accuracy Score: {accuracy_logistic:.2f}")

# Score every row of the dataset and store the result as a new column.
default_probabilities_logistic = logistic_model.predict_proba(X)[:, 1]
df['逻辑回归违约率'] = default_probabilities_logistic
logistic_report_columns = ['企业代号', '逻辑回归违约率']
print(df[logistic_report_columns])


In [None]:
# Random forest with 100 trees; the fixed seed makes the fit reproducible.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Positive-class (column 1) probabilities for the held-out rows.
rf_test_probabilities = rf_model.predict_proba(X_test)
y_pred_rf = rf_test_probabilities[:, 1]

# Ranking quality on the held-out rows.
auc_score_rf = roc_auc_score(y_test, y_pred_rf)
print(f"Random Forest AUC Score: {auc_score_rf:.2f}")

# Accuracy after thresholding the probabilities at 0.5.
y_pred_rf_labels = (y_pred_rf > 0.5).astype(int)
accuracy_rf = accuracy_score(y_test, y_pred_rf_labels)
print(f"Random Forest Accuracy Score: {accuracy_rf:.2f}")

# Score every row of the dataset and store the result as a new column.
default_probabilities_rf = rf_model.predict_proba(X)[:, 1]
df['随机森林违约率'] = default_probabilities_rf

# Show the company identifier next to its random-forest default rate.
rf_report_columns = ['企业代号', '随机森林违约率']
print(df[rf_report_columns])

In [None]:
import numpy as np
# `minimize` is no longer used in this cell but stays imported so that any
# later cell relying on it keeps working.
from scipy.optimize import OptimizeResult, minimize
from sklearn.model_selection import cross_val_predict


def objective(weights, predictions=None, y_true=None):
    """Return the negative ROC AUC of a weighted blend of the three models.

    Args:
        weights: Three blend weights, ordered logistic regression, random
            forest, XGBoost.
        predictions: Optional triple of positive-class probability arrays in
            the same order as ``weights``. Defaults to the test-set
            predictions ``(y_pred_logistic, y_pred_rf, y_pred)``, which is
            what the one-argument form has always scored.
        y_true: Labels aligned with ``predictions``. Defaults to ``y_test``.

    Returns:
        The negated AUC, so that minimising this value maximises AUC.
    """
    if predictions is None:
        predictions = (y_pred_logistic, y_pred_rf, y_pred)
    if y_true is None:
        y_true = y_test
    lr_part, rf_part, xgb_part = (w * p for w, p in zip(weights, predictions))
    return -roc_auc_score(y_true, lr_part + rf_part + xgb_part)


# Tune the blend on out-of-fold predictions from the TRAINING split only.
# Tuning on the test predictions and then reporting the score on that same
# test set would make the reported metric optimistically biased.
# cross_val_predict clones each estimator, so the fitted models are untouched.
minority_count = int(y_train.value_counts().min())
n_folds = max(2, min(5, minority_count))  # keep every fold able to hold both classes
oof_predictions = tuple(
    cross_val_predict(estimator, X_train, y_train, cv=n_folds, method='predict_proba')[:, 1]
    for estimator in (logistic_model, rf_model, model)
)

# Equal weights are the baseline and win any tie.
initial_weights = [1/3, 1/3, 1/3]

# AUC depends only on the ranking of the blended scores, so it is a step
# function of the weights: a finite-difference gradient is zero almost
# everywhere and a gradient-based solver such as SLSQP stops at its starting
# point. Search the weight simplex directly instead. Each candidate is
# non-negative and sums to 1 by construction, which enforces the original
# bounds and equality constraint.
grid_steps = 20  # 0.05 resolution
candidate_weights = [tuple(initial_weights)] + [
    (i / grid_steps, j / grid_steps, (grid_steps - i - j) / grid_steps)
    for i in range(grid_steps + 1)
    for j in range(grid_steps + 1 - i)
]
candidate_scores = [objective(w, oof_predictions, y_train) for w in candidate_weights]
best_index = min(range(len(candidate_scores)), key=candidate_scores.__getitem__)

# Get the optimized weights; `result` mirrors the object `minimize` returned.
optimized_weights = np.asarray(candidate_weights[best_index], dtype=float)
result = OptimizeResult(
    x=optimized_weights,
    fun=candidate_scores[best_index],
    success=True,
    message='Grid search over the weight simplex completed.',
    nfev=len(candidate_weights),
)
print(f"Optimized Weights: {optimized_weights}")

# Blend the held-out test predictions with the tuned weights.
y_pred_ensemble_optimized = (optimized_weights[0] * y_pred_logistic) + (optimized_weights[1] * y_pred_rf) + (optimized_weights[2] * y_pred)

# Report AUC and accuracy on the test set, which played no part in the tuning.
auc_score_ensemble_optimized = roc_auc_score(y_test, y_pred_ensemble_optimized)
print(f"Optimized Ensemble Model AUC Score: {auc_score_ensemble_optimized:.2f}")

accuracy_ensemble_optimized = accuracy_score(y_test, (y_pred_ensemble_optimized > 0.5).astype(int))
print(f"Optimized Ensemble Model Accuracy Score: {accuracy_ensemble_optimized:.2f}")

# Blend the three per-company default probabilities for every row of the dataset.
default_probabilities_ensemble_optimized = (optimized_weights[0] * default_probabilities_logistic) + (optimized_weights[1] * default_probabilities_rf) + (optimized_weights[2] * default_probabilities)
df['优化融合模型违约率'] = default_probabilities_ensemble_optimized

# Display the DataFrame with predicted default rates
print(df[['企业代号', '优化融合模型违约率']])

## 添加客户流动损失与最高贷款额度项

In [12]:
# Write the DataFrame, including the prediction columns added in the cells
# above, to a new Excel workbook without the index column.
predictions_path = './Data&Question/final_merged_data_with_predictions.xlsx'
df.to_excel(predictions_path, index=False)