In [17]:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request
import matplotlib.pyplot as plt
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA

In [18]:
#读入数据
url = "/home/mw/input/quant4533"

df_housing = pd.read_csv(url+"/ruc_Class25Q1_details.csv")
df_rent = pd.read_csv(url+"/ruc_Class25Q1_rent.csv")
df_train = pd.read_csv(url+"/ruc_Class25Q1_train.csv")
df_test = pd.read_csv(url+"/ruc_Class25Q1_test.csv")

In [3]:
missing_values = df_train.isnull().sum()
missing_columns = missing_values[missing_values > 0]
print(missing_columns)

环线      41407
房屋户型      605
套内面积    58987
建筑结构      605
装修情况      605
梯户比例     1695
配备电梯     8315
别墅类型    83384
上次交易    28953
房屋用途        2
房屋年限    29782
抵押信息    84133
房屋优势    16064
核心卖点    16366
户型介绍    63671
周边配套    34027
交通出行    32437
dtype: int64


In [75]:
df_train.columns

Index(['城市', '区域', '板块', '环线', '小区名称', '价格', '房屋户型', '所在楼层', '建筑面积', '套内面积',
       '房屋朝向', '建筑结构', '装修情况', '梯户比例', '配备电梯', '别墅类型', '交易时间', '交易权属', '上次交易',
       '房屋用途', '房屋年限', '产权所属', '抵押信息', '房屋优势', '核心卖点', '户型介绍', '周边配套', '交通出行',
       'lon', 'lat', '年份'],
      dtype='object')

In [206]:
df_test["区域"].value_counts()

45.0    493
93.0    483
62.0    399
5.0     373
25.0    339
       ... 
33.0      3
10.0      2
8.0       2
16.0      2
63.0      1
Name: 区域, Length: 93, dtype: int64

In [33]:
df_train['城市'].unique()

array([0, 1, 2, 3, 4, 5, 6])

In [19]:
#中文转数字函数
def chinese_to_number(chinese_num):
    if pd.isna(chinese_num) == True: return 0
    digit_map = {
        '零':0, '一':1, '二':2, '三':3, '四':4,
        '五':5, '六':6, '七':7, '八':8, '九':9,
        '十':10, '百':100, '两':2
    }
    if chinese_num in digit_map:
        return digit_map[chinese_num]
    if '十' in chinese_num:
        parts = re.split(r'十', chinese_num)
        if len(parts) == 1:  
            return 10
        elif parts[0] == '': 
            return 10 + digit_map.get(parts[1], 0)
        else:  
            ten = digit_map.get(parts[0], 0) * 10
            unit = digit_map.get(parts[1], 0) if parts[1] else 0
            return ten + unit
    return 0  


In [20]:
#封装中文处理函数
def trans(df):
    df["城市信息"] = df["城市"]
    df = pd.get_dummies(df, columns=['城市'], drop_first=True)
    df["周边配套"] = df["周边配套"].fillna("")
    df["周边配套"] = df["周边配套"].apply(lambda x: re.sub(r"[，、。；：]", ",", x))  # 统一用逗号分隔
    df["周边配套"] = df["周边配套"].apply(lambda x: re.sub(r"\s+", "", x))  # 去除多余空格
    categories = ["医院", "公园", "超市", "商场", "银行", "学校", "地铁", "公交"]
    for cat in categories:
        df[cat] = df["周边配套"].apply(lambda x: 1 if cat in x else 0)
    df["交通出行"] = df["交通出行"].fillna("")
    df["交通出行"] = df["交通出行"].apply(lambda x: re.sub(r"[，、。；：]", ",", x))  # 统一用逗号分隔
    df["交通出行"] = df["交通出行"].apply(lambda x: re.sub(r"\s+", "", x))  # 去除多余空格
    categories = ["地铁", "公交", "高速", "高铁", "机场"]
    for cat in categories:
        df[cat] = df["交通出行"].apply(lambda x: 1 if cat in x else 0)
    df['上次交易'] = pd.to_datetime(df['上次交易'], errors='coerce')
    df['上次年份'] = df['上次交易'].dt.year
    df['交易时间'] = pd.to_datetime(df['交易时间'], errors='coerce')
    df['交易年份'] = df['交易时间'].dt.year
    df['交易时长'] = df['交易年份'] - df['上次年份']
    df["交易频率"] = df["交易时间"].map(df["交易时间"].value_counts())
    df[['梯',"户"]] = df["梯户比例"].str.extract(r'([\u4e00-\u9fa5]+)梯([\u4e00-\u9fa5]+)户')
    df['梯'] = df['梯'].apply(chinese_to_number)
    df['户'] = df['户'].apply(chinese_to_number)
    df['梯户比例'] = df['梯'] / df['户']
    df["朝南"] = df["房屋朝向"].str.contains("南", na=False)
    df["建筑面积"] = df["建筑面积"].str.extract(r'(\d+)㎡').astype(float)
    df["套内面积"] = df["套内面积"].str.extract(r'(\d+)').fillna(0).astype(float)
    df[["楼层分类", "总层数"]] = df["所在楼层"].str.extract(r'^(.*?)\s+\(共(\d+)层\)$')
    df["总层数"] = df["总层数"].astype(int)
    df[["室", "厅", "厨","卫"]] = df["房屋户型"].str.extract(r'(\d+)室?(\d+)厅?(\d+)厨?(\d+)卫?').fillna(0).astype(int)
    mappings = {
        '产权所属': {
            '共有': 1, '非共有': 0
        },
        '房屋年限': {
            '满五年': 0, '满两年': 2, '未满两年': 7
        },
        '房屋用途': {
            '车库': 0, '商业': 1, '商业办公类': 1, '写字楼': 1,
            '底商': 2, '商住两用': 2,
            '老公寓': 3, '平房': 3,
            '酒店式公寓': 4, '住宅式公寓': 4, '公寓/住宅': 4, 
            '公寓': 4, '公寓/公寓': 4, '公寓（住宅）': 4,
            '普通住宅': 5,
            '别墅': 6, '四合院': 6, '新式里弄': 6, '花园洋房': 6
        },
        '交易权属': {
            '使用权': 0, '集资房': 1, '拆迁还建房': 2,'动迁安置房': 2, '定向安置房': 2, '售后公房': 2,'安置房': 2,
            '经济适用房': 3, '限价商品房': 3, '自住型商品房': 3,'一类经济适用房': 3,'二类经济适用房': 3,
            '已购公房': 4, '房改房': 4, '央产房': 4,
            '自住型商品房': 5, '限价商品房': 5,
            '私产': 6, '商品房': 7
        },
        '别墅类型': {
            None: 0, '联排': 1, '叠拼': 2, '双拼': 3, '独栋': 4
        },
        '配备电梯': {
            '有': 1, '无': 0
        },
        '装修情况': {
            '精装': 3, '简装': 2, '毛坯': 1, '其他': 0
        },
        '建筑结构': {
            '混合结构': 3, '钢混结构': 6, '砖混结构': 2, '钢结构': 5,
            '未知结构': 0, '砖木结构': 1, '框架结构': 4
        },
        '楼层分类': {
            '高楼层': 4, '中楼层': 3, '低楼层': 2, '顶层': 5,
            '底层': 1, '地下室': 0
        },
        '环线': {
            '内环内': 1, '一环内': 1, '二环内': 2, '外环外': 5,
            '四环外': 5, '三环外': 4, '六环外': 7,
            '一至二环': 1.5, '二至三环': 2.5, '三至四环': 3.5,
            '四至五环': 4.5, '五至六环': 5.5,
            '内环至外环': 3, '内环至中环': 2, '中环至外环': 4
        }
    }
    df['建筑面积正常'] = (df['建筑面积'] >= 100) & (df['建筑面积'] <= 600)  
    df['建筑面积交叉'] = df['建筑面积正常'] * df['建筑面积']
    df['lat2'] = df['lat'] ** 2    
    df['lon2'] = df['lon'] ** 2
    df['time2'] = (df['交易年份'] - 2000) ** 2
    for column, mapping in mappings.items():
        if column in df.columns:
            df[column] = df[column].replace(mapping) 
    return df


In [21]:
df_test = trans(df_test)
df_train = trans(df_train)

In [22]:
df_train["price1"] = np.log(df_train["价格"])

In [23]:
df_filtered = df_train[(df_train["建筑面积"] > 100) & (df_train["建筑面积"] < 500)]
X = df_filtered[["建筑面积"]]
y = df_filtered["price1"]
model = LinearRegression()
model.fit(X, y)
slope = model.coef_[0]
intercept = model.intercept_
print(f"回归方程: 建筑面积 = {slope:.2f} * 房价对数 + {intercept:.2f}")
print(f"R²值: {model.score(X, y):.3f}")

df_filtered.plot(kind='scatter', grid=True,x="建筑面积", y="price1")

回归方程: 建筑面积 = 0.01 * 房价对数 + 13.15
R²值: 0.305


<AxesSubplot:xlabel='建筑面积', ylabel='price1'>

In [21]:
df_train['价格'].hist(bins=30)  
plt.title('数值分布直方图')
plt.xlabel('值区间')
plt.ylabel('频数')
plt.show()

In [24]:
# 使用 quantile(0.75) 计算 75 分位数
huanxian = []
for i in range(7):
    huanxian.append(df_train.loc[df_train["城市信息"].astype(int) == i, "环线"].quantile(0.75))
huanxian[1] = 0
huanxian[5] = 0
ti_25 = df_train["梯"].quantile(0.25)
hu_75 = df_train["户"].quantile(0.75)
furni_median = df_train["装修情况"].median()
transtime_median = df_train["交易时长"].median()
lift_mean = df_train["配备电梯"].mean()
years_mean = df_train["房屋年限"].mean()
# 获取众数
shi_mode = df_train["室"].mode()[0]
ting_mode = df_train["厅"].mode()[0]
chu_mode = df_train["厨"].mode()[0]
wei_mode = df_train["卫"].mode()[0]
struc_mode = df_train["建筑结构"].mode()[0]
use_mode = df_train["房屋用途"].mode()[0]
print(use_mode)


5.0


In [25]:
#补充缺失值
def zym_fillna(df):
    for i in range(7):
        df.loc[(df["城市信息"] == i) & (df["环线"].isna()), "环线"] = huanxian[i]
    df.loc[df["房屋户型"].isna(), "室"] = shi_mode
    df.loc[df["房屋户型"].isna(), "厅"] = ting_mode
    df.loc[df["房屋户型"].isna(), "厨"] = chu_mode
    df.loc[df["房屋户型"].isna(), "卫"] = wei_mode
    df.loc[df["梯户比例"].isna(), "梯"] = ti_25
    df.loc[df["梯户比例"].isna(), "户"] = hu_75
    df.loc[df["梯户比例"].isna(), "梯户比例"] = ti_25 / hu_75
    
    df.loc[df["套内面积"] == 0, "套内面积"] = df.apply(
        lambda row: 0.8 * row["建筑面积"] + 0.38 if row["建筑面积"] > 100 else 80.65,
        axis=1
    )
    df['建筑结构'] = df['建筑结构'].fillna(struc_mode)
    df['房屋用途'] = df['房屋用途'].fillna(use_mode)
    df['装修情况'] = df['装修情况'].fillna(furni_median)
    df['交易时长'] = df['交易时长'].fillna(transtime_median)
    df['上次年份'] = df['上次年份'].fillna(df["交易年份"] - transtime_median)
    df['配备电梯'] = df['配备电梯'].fillna(lift_mean)
    df['房屋年限'] = df['房屋年限'].fillna(years_mean)
    df['别墅类型'] = df['别墅类型'].fillna(0)
    df['城市1交叉'] = df['城市_1'] * df['环线']
    df['城市2交叉'] = df['城市_2'] * df['环线']
    df['城市3交叉'] = df['城市_3'] * df['环线']
    df['城市4交叉'] = df['城市_4'] * df['环线']  
    df['城市5交叉'] = df['城市_5'] * df['环线']  
    df['城市6交叉'] = df['城市_6'] * df['环线']
    df['城市1交叉1'] = df['城市_1'] * df['lon2']
    df['城市2交叉1'] = df['城市_2'] * df['lon2']
    df['城市3交叉1'] = df['城市_3'] * df['lon2']
    df['城市4交叉1'] = df['城市_4'] * df['lon2']  
    df['城市5交叉1'] = df['城市_5'] * df['lon2']  
    df['城市6交叉1'] = df['城市_6'] * df['lon2']
    df['城市1交叉2'] = df['城市_1'] * df['lat2']
    df['城市2交叉2'] = df['城市_2'] * df['lat2']
    df['城市3交叉2'] = df['城市_3'] * df['lat2']
    df['城市4交叉2'] = df['城市_4'] * df['lat2']  
    df['城市5交叉2'] = df['城市_5'] * df['lat2']  
    df['城市6交叉2'] = df['城市_6'] * df['lat2']
    return df


In [26]:
df_train = zym_fillna(df_train)
df_test = zym_fillna(df_test)

In [27]:
cols_to_drop = ["小区名称","抵押信息", "房屋优势", "核心卖点", "户型介绍", "周边配套", "交通出行","房屋朝向","所在楼层","房屋户型","上次交易","交易时间"]
df_train.drop(columns=cols_to_drop, inplace=True)
df_test.drop(columns=cols_to_drop, inplace=True)

In [28]:
# 特征列
feature_columns = [
    '区域', '板块', '环线', '建筑面积', '套内面积', '建筑结构', '装修情况', '梯户比例',
    '配备电梯', '别墅类型', '交易权属', '房屋用途', '房屋年限', '产权所属', 'lon', 'lat', '年份',
    '上次年份', '交易年份', '交易时长', '交易频率', '梯', '户', '朝南', '楼层分类', '总层数', '室', '厅',
    '厨', '卫','建筑面积正常','建筑面积交叉',"医院", "公园", "超市", "商场", "银行", "学校", "地铁", "公交",
    "地铁", "公交", "高速", "高铁", "机场",'城市_1','城市_2', '城市_3', '城市_4', '城市_5', '城市_6',
    '城市1交叉','城市2交叉','城市3交叉','城市4交叉','城市5交叉','城市6交叉','lat2','lon2','time2',
    '城市1交叉1','城市2交叉1','城市3交叉1','城市4交叉1','城市5交叉1','城市6交叉1',
    '城市1交叉2','城市2交叉2','城市3交叉2','城市4交叉2','城市5交叉2','城市6交叉2'
]

target_column = 'price1'

In [29]:
X = df_train[feature_columns]
y = df_train[target_column]

In [30]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42
)

In [48]:
print("训练集特征形状:", X_train.shape)
print("测试集特征形状:", X_test.shape)
print("训练集目标形状:", y_train.shape)
print("测试集目标形状:", y_test.shape)

训练集特征形状: (67306, 60)
测试集特征形状: (16827, 60)
训练集目标形状: (67306,)
测试集目标形状: (16827,)


In [31]:
def exp_mae(y_true, y_pred):
    """计算还原后的 MAE"""
    return mean_absolute_error(np.exp(y_true), np.exp(y_pred))
def evaluate_model(model, X_train, y_train, X_test, y_test, model_name):
    # 样本内性能
    y_train_pred = model.predict(X_train)
    train_mae = mean_absolute_error(np.exp(y_train), np.exp(y_train_pred))
    # 样本外性能
    y_test_pred = model.predict(X_test)
    test_mae = mean_absolute_error(np.exp(y_test), np.exp(y_test_pred))
    # 6折交叉验证
    kf = KFold(n_splits=6, shuffle=True, random_state=42)
    scorer = make_scorer(exp_mae, greater_is_better=False)  # 自定义评分器
    cv_scores = -cross_val_score(model, X_train, y_train, cv=kf, scoring=scorer)  # 取负号转换回正的 MAE
    cv_mae = np.mean(cv_scores)
    #kf = KFold(n_splits=6, shuffle=True, random_state=42)
    #cv_scores = -cross_val_score(model, X_train, y_train, cv=kf, scoring='neg_mean_absolute_error')
    #cv_mae = np.mean(cv_scores)
    print("OK!")
    return {
        'Model': model_name, 
        'In-Sample MAE': train_mae,
        'Out-Sample MAE': test_mae,
        '6-Fold CV MAE': cv_mae
    }

In [32]:
# 初始化模型
models = {
    'OLS': LinearRegression(),
    #'LASSO': Lasso(alpha=0.1, tol=1e-4, max_iter=100000),
    'Ridge': Ridge(alpha=1),
    #'ElasticNet': ElasticNet(alpha=0.1, l1_ratio=0.5, max_iter=100000)
}

# 训练并评估
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    results.append(evaluate_model(model, X_train, y_train, X_test, y_test, name))

OK!
OK!


In [34]:
results_df = pd.DataFrame(results)

best_model_name = results_df.loc[results_df['6-Fold CV MAE'].idxmin(), 'Model']
best_model = models[best_model_name]

In [35]:
print(results_df)

   Model  In-Sample MAE  Out-Sample MAE  6-Fold CV MAE
0    OLS  515362.735139    7.849970e+06   9.247713e+05
1  Ridge  566483.226060    1.825192e+07   1.436380e+06


In [127]:
X_real_test = df_test[feature_columns]
test_predictions = models["Ridge"].predict(X_real_test)
test_results = pd.DataFrame({
    "ID": df_test["ID"].values,
    "Price": np.exp(test_predictions)
})

In [37]:
X_real_test = df_test[feature_columns]
test_predictions = best_model.predict(X_real_test)
test_results = pd.DataFrame({
    "ID": df_test["ID"].values,
    "Price": np.exp(test_predictions)
})
test_results.to_csv("test_predictions.csv", index=False)