In [3]:
import os
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

def process_data(file_path):
    df = pd.read_csv(file_path)
    
    if '价格' in df.columns:
        Y = df['价格']
    else:
        Y = None
    
    feature_cols = ['地理位置评分', '建筑面积', '层高', '层型', '建筑结构', '装修情况', '房屋用途', '房屋年限', '梯户比']
    X = df.filter(items=feature_cols)
    
    categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
    numerical_cols = X.select_dtypes(include=['number']).columns.tolist()
    
    X_categorical = pd.get_dummies(X[categorical_cols], drop_first=True)
    X_numerical = X[numerical_cols].fillna(X[numerical_cols].mean())
    
    scaler = StandardScaler()
    X_numerical_scaled = scaler.fit_transform(X_numerical)
    X_numerical_scaled = pd.DataFrame(X_numerical_scaled, columns=numerical_cols)
    
    X_processed = pd.concat([X_numerical_scaled, X_categorical], axis=1)
    
    return X_processed, Y, scaler, categorical_cols

def train_and_predict(train_file, test_file, output_file):
    X, Y, scaler, categorical_cols = process_data(train_file)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=111)
    
    X_train = sm.add_constant(X_train)
    X_test = sm.add_constant(X_test)
    
    model = sm.OLS(Y_train, X_train).fit()
    
    Y_train_pred = model.predict(X_train)
    in_sample_mse = mean_squared_error(Y_train, Y_train_pred)
    
    Y_test_pred = model.predict(X_test)
    out_sample_mse = mean_squared_error(Y_test, Y_test_pred)
    
    print(f"OLS 回归评估:")
    print(f"  In-sample MSE: {in_sample_mse:.4f}")
    print(f"  Out-of-sample MSE: {out_sample_mse:.4f}")
    print(model.summary())
    
    # 处理测试集并预测
    X_test_data, _, _, _ = process_data(test_file)
    X_test_data = X_test_data.reindex(columns=X.columns, fill_value=0)
    X_test_data = sm.add_constant(X_test_data)
    predictions = model.predict(X_test_data)
    
    df_test = pd.read_csv(test_file)
    df_test['预测价格'] = predictions
    df_test.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"预测结果已保存: {output_file}")

# 直接处理一个训练集和一个测试集
train_file = "一次分类数据/merged_results.csv"
test_file = "一次测试数据/merged_results.csv"
output_file = "D:/RUCer/大三下/ai_python/2022202777/期中/预测结果/预测结果OLS.csv"

train_and_predict(train_file, test_file, output_file)


ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).