In [76]:
! python train.py


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=list(importance_dict.values()),
Figure(1000x600)
Figure(600x600)
Training accuracy: 1.0000
Testing accuracy: 0.9934
Training Macro-F1: 1.0000
Testing Macro-F1: 0.9929
Model saved to random_forest_model.pkl


In [48]:
! python predict.py

Predictions saved to submission.csv


In [64]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import os

def save_feature_stats(df, features, filename):
    """保存特征的统计信息到CSV文件"""
    stats = pd.DataFrame({
        'mean': df[features].mean(),
        'min': df[features].min(),
        'max': df[features].max(),
        'std': df[features].std()
    })
    stats.to_csv(filename)
    print(f"特征统计信息已保存到 {filename}")

def plot_feature_heatmap(df, features, title, filename):
    """绘制特征热力图并保存"""
    plt.figure(figsize=(12, 10))
    corr = df[features].corr()
    sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', 
                center=0, linewidths=.5, cbar_kws={"shrink": .8})
    plt.title(title)
    plt.tight_layout()
    plt.savefig(filename)
    print(f"热力图已保存到 {filename}")
    plt.close()

def preprocess_data(filepath):
    # 创建输出目录
    os.makedirs('feature_analysis', exist_ok=True)
    
    # 1. 加载数据
    df = pd.read_csv(filepath)
    
    # 2. 空值处理
    df['bank_asset_value'] = df['bank_asset_value'].replace(-100000, np.nan)
    for col in df.select_dtypes(include=[np.number]).columns:
        if df[col].isnull().sum() > 0:
            print("有空值")
            df[col].fillna(df[col].median(), inplace=True)
    
    # 3. 第一次特征工程
    df['education'] = df['education'].map({'Graduate': 1, 'Not Graduate': 0})
    df['self_employed'] = df['self_employed'].map({'Yes': 1, 'No': 0})
    df['debt_to_income'] = df['loan_amount'] / df['income_annum']
    df['total_assets'] = df[['residential_assets_value', 'commercial_assets_value', 
                            'luxury_assets_value', 'bank_asset_value']].sum(axis=1)
    
    # 第一次特征工程后的特征
    features_1 = ['no_of_dependents', 'education', 'self_employed', 'income_annum',
                 'loan_amount', 'loan_term', 'cibil_score', 'total_assets',
                 'debt_to_income']
    plot_feature_heatmap(df, features_1, '第一次特征工程后相关性', 'heatmap_before_scale.png')    
    # 输出第一次特征工程后的热力图和统计信息
    plot_feature_heatmap(df, features_1, '第一次特征工程后特征相关性', 
                        'feature_analysis/heatmap_after_first_fe.png')
    save_feature_stats(df, features_1, 'feature_analysis/stats_after_first_fe.csv')
    
    # 4. 第二次特征工程（添加更多特征）
    df['loan_to_assets'] = df['loan_amount'] / (df['total_assets'] + 1)  # 加1防止除零
    df['income_per_dependent'] = df['income_annum'] / (df['no_of_dependents'] + 1)
    df['asset_to_income'] = df['total_assets'] / df['income_annum']
    df['loan_term_x_loan_amount'] = df['loan_term'] * df['loan_amount']
    
    # 第二次特征工程后的特征
    features_2 = features_1 + ['loan_to_assets', 'income_per_dependent', 
                              'asset_to_income', 'loan_term_x_loan_amount']
    
    # 输出第二次特征工程后的热力图和统计信息
    plot_feature_heatmap(df, features_2, '第二次特征工程后特征相关性', 
                        'feature_analysis/heatmap_after_second_fe.png')
    save_feature_stats(df, features_2, 'feature_analysis/stats_after_second_fe.csv')
    
    # 5. 特征缩放
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df[features_2])
    X_scaled = pd.DataFrame(X_scaled, columns=features_2)
    
    # 6. 划分训练集和测试集
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, df['label'], test_size=0.2, random_state=42)
    
    return X_train, X_test, y_train, y_test, scaler

# 其余代码保持不变...
preprocess_data('train.csv')

  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename)
  plt.savefig(filename)
  plt.savefig(filename)
  plt.savefig(filename)
  plt.savefig(filename)
  plt.savefig(filename)
  plt.savefig(filename)
  plt.savefig(filename)
  plt.savefig(filename)
  plt.savefig(filename)
  plt.savefig(filename)


热力图已保存到 heatmap_before_scale.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename)
  plt.savefig(filename)
  plt.savefig(filename)
  plt.savefig(filename)
  plt.savefig(filename)
  plt.savefig(filename)
  plt.savefig(filename)
  plt.savefig(filename)
  plt.savefig(filename)
  plt.savefig(filename)
  plt.savefig(filename)


热力图已保存到 feature_analysis/heatmap_after_first_fe.png
特征统计信息已保存到 feature_analysis/stats_after_first_fe.csv


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename)
  plt.savefig(filename)
  plt.savefig(filename)
  plt.savefig(filename)
  plt.savefig(filename)
  plt.savefig(filename)
  plt.savefig(filename)
  plt.savefig(filename)
  plt.savefig(filename)
  plt.savefig(filename)
  plt.savefig(filename)


热力图已保存到 feature_analysis/heatmap_after_second_fe.png
特征统计信息已保存到 feature_analysis/stats_after_second_fe.csv


(      no_of_dependents  education  self_employed  income_annum  loan_amount  \
 1398             -1.48      -1.01           1.00         -1.10        -0.93   
 693              -0.89      -1.01          -1.00          0.04        -0.53   
 1089              0.88      -1.01           1.00          0.22        -0.15   
 1837              0.29       0.99           1.00          1.05         1.77   
 2874             -0.89      -1.01           1.00         -0.10        -0.40   
 ...                ...        ...            ...           ...          ...   
 1638              1.47      -1.01          -1.00         -0.67        -0.57   
 1095             -0.89       0.99          -1.00          1.01         1.16   
 1130              1.47      -1.01           1.00         -0.17        -0.08   
 1294              1.47       0.99           1.00         -0.03         0.13   
 860              -0.30      -1.01          -1.00         -1.14        -0.97   
 
       loan_term  cibil_score  total_a

In [67]:
# 新增可视化库导入
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve, auc

def train_and_save_model(filepath='train.csv', model_path='random_forest_model.pkl'):
    X_train, X_test, y_train, y_test, scaler = preprocess_data(filepath)
    
    rf = RandomForest(n_trees=100, max_depth=10, min_samples_split=2)
    rf.fit(X_train.values, y_train.values)
    
    # 1. 特征重要性可视化（新增）
    def plot_feature_importance(model, feature_names):
        """自定义特征重要性计算（基于信息增益总和）"""
        importance = {name: 0 for name in feature_names}
        for tree in model.trees:
            _accumulate_importance(tree.root, importance, feature_names)
        
        # 归一化
        total = sum(importance.values())
        importance = {k: v/total for k,v in importance.items()}
        
        plt.figure(figsize=(10,6))
        sns.barplot(x=list(importance.values()), 
                    y=list(importance.keys()),
                    palette='viridis')
        plt.title('Feature Importance (Custom Calculation)')
        plt.tight_layout()
        plt.savefig('feature_importance.png')
        plt.show()
    
    def _accumulate_importance(node, importance_dict, feature_names):
        if node.is_leaf():
            return
        feature_name = feature_names[node.feature]
        importance_dict[feature_name] += 1  # 简单计数分裂次数
        _accumulate_importance(node.left, importance_dict, feature_names)
        _accumulate_importance(node.right, importance_dict, feature_names)
    
    plot_feature_importance(rf, X_train.columns.tolist())
    
    # 2. 混淆矩阵可视化（新增）
    def plot_confusion_matrix(y_true, y_pred):
        cm = confusion_matrix(y_true, y_pred)
        plt.figure(figsize=(6,6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                   xticklabels=['Reject', 'Approve'],
                   yticklabels=['Reject', 'Approve'])
        plt.title('Confusion Matrix')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.savefig('confusion_matrix.png')
        plt.show()
    
    y_test_pred = rf.predict(X_test.values)
    plot_confusion_matrix(y_test, y_test_pred)
    
    # 3. 模型性能对比（新增）
    def plot_metrics(train_acc, test_acc, train_f1, test_f1):
        metrics = ['Accuracy', 'F1-Score']
        train_scores = [train_acc, train_f1]
        test_scores = [test_acc, test_f1]
        
        x = np.arange(len(metrics))
        width = 0.35
        
        fig, ax = plt.subplots(figsize=(8,5))
        rects1 = ax.bar(x - width/2, train_scores, width, label='Train', color='#3498db')
        rects2 = ax.bar(x + width/2, test_scores, width, label='Test', color='#2ecc71')
        
        ax.set_ylabel('Scores')
        ax.set_title('Model Performance Comparison')
        ax.set_xticks(x)
        ax.set_xticklabels(metrics)
        ax.legend()
        ax.set_ylim(0, 1.05)
        
        def autolabel(rects):
            for rect in rects:
                height = rect.get_height()
                ax.annotate(f'{height:.2f}',
                            xy=(rect.get_x() + rect.get_width() / 2, height),
                            xytext=(0, 3),
                            textcoords="offset points",
                            ha='center', va='bottom')
        
        autolabel(rects1)
        autolabel(rects2)
        
        plt.tight_layout()
        plt.savefig('performance_comparison.png')
        plt.show()
    
    # 计算预测结果
    y_train_pred = rf.predict(X_train.values)
    y_test_pred = rf.predict(X_test.values)
    
    train_acc = np.mean(y_train_pred == y_train.values)
    test_acc = np.mean(y_test_pred == y_test.values)
    
    train_f1 = f1_score(y_train.values, y_train_pred, average='macro')
    test_f1 = f1_score(y_test.values, y_test_pred, average='macro')
    
    plot_metrics(train_acc, test_acc, train_f1, test_f1)
    
    # 保存模型（原功能保持不变）
    joblib.dump({'model': rf, 'scaler': scaler}, model_path)
    print(f"Model saved to {model_path}")
    
    return rf, scaler

In [None]:
! pyth