In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import shap
import matplotlib.pyplot as plt
import seaborn as sns
import os
# Global matplotlib configuration: set the default font family for figure text.
plt.rcParams.update({'font.family': 'Times New Roman'})

In [3]:
# ---------------------------------------------------------------------------
# House-price analysis pipeline.
#
# Reads 'shanghai_house_price_9_with_macro_3mavg.csv', writes a plain-text
# report (analysis_results.txt) plus PNG figures into `output_dir`, and trains
# one XGBoost regressor per feature scenario (physical only / + neighborhood /
# + macro) to predict `unit_price`. For every scenario it records learning
# curves, test-set RMSE/MAE/R², gain-based feature importance and SHAP plots,
# then finishes with an ablation table comparing the scenarios.
#
# Raises FileNotFoundError (after logging it to the report) if the CSV is
# missing.
# ---------------------------------------------------------------------------

# Ensure output directory exists
output_dir = r"E:\jupyternoteBookWorkPath\erp\house_eco_poi1"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "analysis_results.txt")

# Open text file to save results; it stays open for the whole run so that every
# section below can append to the same report.
with open(output_file, 'w', encoding='utf-8') as f:
    try:
        df = pd.read_csv('shanghai_house_price_9_with_macro_3mavg.csv')
    except FileNotFoundError:
        # Leave a trace in the report before propagating the error.
        f.write("Error: CSV file 'shanghai_house_price_9_with_macro_3mavg.csv' not found. Please ensure the file is in the correct directory.\n")
        raise

    # Feature groups
    physical_features = ['bedroom_num', 'livingroom_num', 'area', 'finish_level', 'age']
    neighborhood_features = ['floor_area_ratio', 'green_space_ratio',
                             'dist_to_nearest_subway_station_m', 'dist_to_bank_m',
                             'dist_to_primary_school_m', 'dist_to_middle_school_m',
                             'dist_to_shopping_center_m', 'dist_to_top_tier_hospital_m',
                             'dist_to_scenic_spot_m']
    macro_features = ['gdp_growth_yoy_3m_avg', 'gdp_growth_qoq_3m_avg',
                      'cpi_yoy_3m_avg', 'cpi_mom_3m_avg',
                      'ppi_yoy_3m_avg', 'ppi_mom_3m_avg']

    # Every modelling column plus the target; shared by the summary statistics
    # and the correlation heatmap.
    all_columns = physical_features + neighborhood_features + macro_features + ['unit_price']

    # Target variable
    y = df['unit_price']

    # 4.1 EDA
    f.write("\n=== 4.1 Exploratory Data Analysis ===\n")
    f.write("Summary Statistics:\n")
    f.write(str(df[all_columns].describe()))

    # Histogram
    plt.figure(figsize=(10, 6))
    sns.histplot(df['unit_price'], bins=30)
    plt.title('Distribution of Unit Price')
    plt.savefig(os.path.join(output_dir, 'unit_price_histogram.png'))
    plt.close()
    f.write("\nHistogram of unit price saved as 'unit_price_histogram.png'")

    # Correlation heatmap
    plt.figure(figsize=(27, 21))
    correlation_matrix = df[all_columns].corr()
    sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm')
    plt.title('Correlation Heatmap', fontsize=28)
    plt.savefig(os.path.join(output_dir, 'correlation_heatmap.png'), dpi=300)
    plt.close()
    f.write("\nCorrelation heatmap saved as 'correlation_heatmap.png'")

    # 3. Methodology: three scenarios, each adding one feature group to the previous one
    scenarios = {
        'Physical Only': physical_features,
        'Physical + Neighborhood': physical_features + neighborhood_features,
        'Physical + Neighborhood + Macro': physical_features + neighborhood_features + macro_features
    }

    # scenario name -> {'RMSE', 'MAE', 'R2'} measured on the held-out test split
    results = {}

    # 3.2 Method: model configuration (identical for every scenario)
    params = {
        'n_estimators': 100,
        'max_depth': 6,
        'learning_rate': 0.1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'random_state': 42,
        # Set on the estimator rather than in fit(); evals_result() is read
        # under the 'rmse' key below.
        'eval_metric': 'rmse'
    }

    f.write("\n=== 3.2 Model Configuration ===\n")
    f.write("Hyperparameters used:\n")
    f.write(f"n_estimators: {params['n_estimators']} - Number of boosting rounds\n")
    f.write(f"max_depth: {params['max_depth']} - Maximum tree depth\n")
    f.write(f"learning_rate: {params['learning_rate']} - Step size shrinkage\n")
    f.write(f"subsample: {params['subsample']} - Fraction of samples used per tree\n")
    f.write(f"colsample_bytree: {params['colsample_bytree']} - Fraction of features used per tree\n")
    f.write(f"eval_metric: {params['eval_metric']} - Evaluation metric for validation\n")

    # Training and evaluation for each scenario
    for scenario_name, features in scenarios.items():
        f.write(f"\n=== 4.2-4.6 Results for {scenario_name} ===\n")

        # File-name friendly form of the scenario name, reused for every figure
        # saved in this iteration.
        scenario_slug = scenario_name.replace(" ", "_").lower()

        # Prepare data; the fixed random_state keeps the split identical across scenarios
        X = df[features]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Train model. The eval_set is used for monitoring only: 'validation_0'
        # is the training split and 'validation_1' the test split.
        model = xgb.XGBRegressor(**params)
        model.fit(X_train, y_train,
                  eval_set=[(X_train, y_train), (X_test, y_test)],
                  verbose=False)

        # 4.2 Model Training Outcomes
        evals_result = model.evals_result()
        n_iterations = len(evals_result['validation_0']['rmse'])
        final_train_rmse = evals_result['validation_0']['rmse'][-1]
        final_test_rmse = evals_result['validation_1']['rmse'][-1]

        f.write(f"Number of iterations: {n_iterations}\n")
        f.write(f"Final training RMSE: {final_train_rmse:.2f}\n")
        f.write(f"Final test RMSE: {final_test_rmse:.2f}\n")

        # Plot learning curves
        plt.figure(figsize=(10, 6))
        plt.plot(evals_result['validation_0']['rmse'], label='Training RMSE')
        plt.plot(evals_result['validation_1']['rmse'], label='Test RMSE')
        plt.title(f'Learning Curves - {scenario_name}')
        plt.xlabel('Iteration')
        plt.ylabel('RMSE')
        plt.legend()
        plt.savefig(os.path.join(output_dir, f'learning_curve_{scenario_slug}.png'))
        plt.close()
        f.write(f"Learning curve saved as 'learning_curve_{scenario_slug}.png'\n")

        # 4.3 Prediction Results
        y_pred = model.predict(X_test)

        # 4.4 Error Analysis
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        f.write("\nEvaluation Metrics:\n")
        f.write(f"RMSE: {rmse:.2f}\n")
        f.write(f"MAE: {mae:.2f}\n")
        f.write(f"R²: {r2:.4f}\n")

        results[scenario_name] = {'RMSE': rmse, 'MAE': mae, 'R2': r2}

        # 4.5 Feature Importance and Interpretability
        # plot_importance() creates its own figure when no `ax` is passed, so a
        # plt.figure(...) issued beforehand is never drawn on and is left open
        # as an empty figure. Work with the Axes it returns instead, and close
        # exactly that figure. (To change the size, create the Axes with
        # plt.subplots(figsize=...) and pass it via `ax=`.)
        importance_ax = xgb.plot_importance(model, importance_type='gain', max_num_features=10)
        importance_fig = importance_ax.figure
        importance_ax.set_title(f'Feature Importance (Gain) - {scenario_name}')
        importance_fig.tight_layout()
        importance_fig.savefig(os.path.join(output_dir, f'feature_importance_gain_{scenario_slug}.png'))
        plt.close(importance_fig)
        f.write(f"Feature importance (gain) plot saved as 'feature_importance_gain_{scenario_slug}.png'\n")

        # SHAP values
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_test)
        plt.figure(figsize=(10, 6))
        shap.summary_plot(shap_values, X_test, show=False)
        plt.savefig(os.path.join(output_dir, f'shap_summary_{scenario_slug}.png'))
        plt.close()
        f.write(f"SHAP summary plot saved as 'shap_summary_{scenario_slug}.png'\n")

        # SHAP interaction effects
        # No plt.figure(...) here: the interaction summary plot was observed to
        # build its own figure, which left the pre-created one empty and open
        # (it showed up as "<Figure ... with 0 Axes>" in the cell output).
        shap_interaction_values = explainer.shap_interaction_values(X_test)
        shap.summary_plot(shap_interaction_values, X_test, show=False)
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, f'shap_interaction_{scenario_slug}.png'), dpi=300, bbox_inches='tight')
        plt.close()
        f.write(f"SHAP interaction plot saved as 'shap_interaction_{scenario_slug}.png'\n")

        # Safety net: make sure no figure created by a plotting helper survives
        # into the next scenario or gets rendered at the end of the cell.
        plt.close('all')

    # 4.6 Ablation study: one row per scenario, tab-separated
    f.write("\n=== 4.6 Ablation Study Results ===\n")
    f.write("Scenario Comparison Table:\n")
    f.write("Scenario\tRMSE\tMAE\tR²\n")
    for scenario_name, metrics in results.items():
        f.write(f"{scenario_name}\t{metrics['RMSE']:.2f}\t{metrics['MAE']:.2f}\t{metrics['R2']:.4f}\n")

print(f"Analysis complete. Results saved to {output_file}")


Analysis complete. Results saved to E:\jupyternoteBookWorkPath\erp\house_eco_poi1\analysis_results.txt


<Figure size 3000x1800 with 0 Axes>

<Figure size 8000x2400 with 0 Axes>

<Figure size 3000x1800 with 0 Axes>

<Figure size 8000x2400 with 0 Axes>

<Figure size 3000x1800 with 0 Axes>

<Figure size 8000x2400 with 0 Axes>