# Kaggle房价预测比赛

本 notebook 给出 Kaggle 房价预测比赛的一个完整解决方案，依次包括：数据加载、数据预处理、模型训练与评估、模型融合，以及生成提交文件。

In [2]:
# 导入必要的库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'seaborn'

## 1. 数据加载

In [None]:
# Load the competition training and test CSV files.
train_path = 'd:/vscode_program/data/kaggle_house_pred_train.csv'
test_path = 'd:/vscode_program/data/kaggle_house_pred_test.csv'
train_data, test_data = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Report the (rows, columns) shape of each frame, then preview the first
# training rows (the trailing expression is what the notebook displays).
print(f"训练数据形状: {train_data.shape}")
print(f"测试数据形状: {test_data.shape}")
train_data.head()

NameError: name 'pd' is not defined

In [None]:
# Show basic summary statistics for the training data.
train_data.describe()

In [None]:
# Count missing values per column in each dataset.
train_missing = train_data.isnull().sum()
test_missing = test_data.isnull().sum()

# Print only the columns that actually contain missing values.
for heading, counts in (("训练数据缺失值:", train_missing), ("\n测试数据缺失值:", test_missing)):
    print(heading)
    print(counts[counts > 0])

## 2. 数据预处理

In [None]:
# Stack train and test so imputation and encoding are applied to both in
# one pass; n_train records where the training rows end.
n_train = train_data.shape[0]
all_data = pd.concat([train_data, test_data], axis=0).reset_index(drop=True)

# Drop the row identifier and the target from the feature frame.
# SalePrice must not stay in all_data: the feature matrices are sliced
# straight out of this frame, so keeping it would hand the target to the
# models as an input (target leakage) and would leave a filled-in SalePrice
# column in the test features. The labels are read from train_data, which
# is left untouched. errors='ignore' tolerates either column being absent.
all_data = all_data.drop(columns=['Id', 'SalePrice'], errors='ignore')

In [None]:
# Impute missing values on the combined frame.
#
# Each filled column is assigned back to all_data. Calling
# fillna(..., inplace=True) on all_data[feature] operates on an intermediate
# Series; that pattern is deprecated in pandas and does not update all_data
# when copy-on-write is enabled, so the imputation would silently be lost.

# Numeric features: fill with the column median.
numeric_features = all_data.select_dtypes(include=['int64', 'float64']).columns
for feature in numeric_features:
    if all_data[feature].isnull().any():
        all_data[feature] = all_data[feature].fillna(all_data[feature].median())

# Categorical (object) features: fill with the most frequent value.
categorical_features = all_data.select_dtypes(include=['object']).columns
for feature in categorical_features:
    if all_data[feature].isnull().any():
        all_data[feature] = all_data[feature].fillna(all_data[feature].mode()[0])

In [None]:
# Encode categorical features with one-hot (dummy) columns.
# drop_first=True leaves out the first level of each encoded column.
all_data = pd.get_dummies(all_data, drop_first=True)

In [None]:
# Recover the train/test partitions from the combined frame by row
# position: the first n_train rows came from train_data, the rest from
# test_data.
train_features = all_data.iloc[:n_train]
test_features = all_data.iloc[n_train:]

# Target variable: log1p of SalePrice (the log transform is intended to
# bring the target's distribution closer to normal).
train_labels = np.log1p(train_data['SalePrice'])

## 3. 模型训练与评估

In [None]:
# Hold out 20% of the training rows as a validation set; random_state is
# fixed so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(train_features, train_labels, test_size=0.2, random_state=42)

In [None]:
# Evaluation helper.
def rmse_cv(model, X, y, cv=5):
    """Return the per-fold RMSE of ``model`` under cross-validation.

    Args:
        model: Estimator handed to ``cross_val_score``.
        X: Feature matrix.
        y: Target values.
        cv: Cross-validation setting forwarded to ``cross_val_score``.
            Defaults to 5, the value that used to be hard-coded.

    Returns:
        Array holding one RMSE value per fold.
    """
    # The 'neg_mean_squared_error' scorer yields negated MSE values, so the
    # sign is flipped before taking the square root.
    neg_mse = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=cv)
    return np.sqrt(-neg_mse)

In [None]:
# Candidate regressors, registered under their display names.
models = {}
models['Linear Regression'] = LinearRegression()
models['Ridge'] = Ridge(alpha=10)
models['Lasso'] = Lasso(alpha=0.001)
models['Random Forest'] = RandomForestRegressor(n_estimators=100, random_state=42)
models['Gradient Boosting'] = GradientBoostingRegressor(n_estimators=100, random_state=42)
models['XGBoost'] = xgb.XGBRegressor(n_estimators=100, learning_rate=0.05, random_state=42)
models['LightGBM'] = lgb.LGBMRegressor(n_estimators=100, learning_rate=0.05, random_state=42)

# For every candidate: fit on the training split, score on the held-out
# split, then compute the mean cross-validated RMSE over the full training
# set. Both numbers are stored in `results` and printed.
results = {}
for name in models:
    model = models[name]
    model.fit(X_train, y_train)
    val_rmse = np.sqrt(mean_squared_error(y_val, model.predict(X_val)))
    cv_rmse = rmse_cv(model, train_features, train_labels).mean()
    results[name] = dict([('Validation RMSE', val_rmse), ('CV RMSE', cv_rmse)])
    print(f"{name}: Validation RMSE = {val_rmse:.4f}, CV RMSE = {cv_rmse:.4f}")

In [None]:
# Bar chart comparing the models by mean cross-validated RMSE.
cv_scores = {name: result['CV RMSE'] for name, result in results.items()}
# One row per model, single 'CV RMSE' column.
cv_results = pd.Series(cv_scores, name='CV RMSE').to_frame()

ranked = cv_results.sort_values('CV RMSE')
ranked.plot(kind='bar', figsize=(12, 6))
plt.title('模型性能比较 (CV RMSE)')
plt.ylabel('RMSE')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 4. 模型融合

In [None]:
# Models selected for the ensemble (the ones the author judged to perform
# best), configured with more boosting rounds than in the comparison run.
best_models = {}
best_models['XGBoost'] = xgb.XGBRegressor(n_estimators=1000, learning_rate=0.05, random_state=42)
best_models['LightGBM'] = lgb.LGBMRegressor(n_estimators=1000, learning_rate=0.05, random_state=42)
best_models['Gradient Boosting'] = GradientBoostingRegressor(n_estimators=500, learning_rate=0.05, random_state=42)

# Fit each ensemble member on the full training set.
for name in best_models:
    best_models[name].fit(train_features, train_labels)
    print(f"{name} 模型训练完成")

In [None]:
# Predict the test set with every fitted ensemble member.
predictions = {name: model.predict(test_features) for name, model in best_models.items()}

# Blend by taking the unweighted element-wise mean of the members'
# predictions.
ensemble_pred = np.mean(list(predictions.values()), axis=0)

## 5. 生成提交文件

In [None]:
# Undo the log1p transform applied to the target so predictions are back
# on the original price scale.
final_predictions = np.expm1(ensemble_pred)

# Use the test set's own Id column when present; otherwise number the rows
# from 1.
if 'Id' in test_data.columns:
    submission_ids = test_data['Id']
else:
    submission_ids = range(1, len(test_data) + 1)

# Assemble the submission table.
submission = pd.DataFrame({'Id': submission_ids, 'SalePrice': final_predictions})

# Write the submission file and preview its first rows.
submission_path = 'd:/vscode_program/House_price/submission.csv'
submission.to_csv(submission_path, index=False)
print(f"提交文件已保存至: {submission_path}")
submission.head()