In [29]:
# 查看个人持久化工作区文件
!ls /home/mw/project/

In [30]:
# 查看当前挂载的数据集目录
!ls /home/mw/input/

# 导包+数据简单处理

In [2]:
pip install category_encoders

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np
import re
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import SelectFromModel, RFE
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import KFold
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
train = pd.read_csv('/home/mw/input/quant4533/ruc_Class25Q1_train.csv')
predict = pd.read_csv('/home/mw/input/quant4533/ruc_Class25Q1_test.csv')

# 合并处理后的detail数据

In [5]:
fullData = pd.concat([train,predict],axis=0)
xiaoqu = pd.read_csv("/home/mw/project/xiaoqu_rent.csv")

In [6]:
is_matching = fullData['小区名称'].isin(xiaoqu['名称'])

# 获取匹配的索引
matching_indexes = fullData.index[is_matching]

not_matching = ~fullData['小区名称'].isin(xiaoqu['名称'])
# 获取不匹配的索引
unmatching_indexes = fullData.index[not_matching]

# 获取不匹配的小区名称
unmatched_names = fullData.loc[not_matching, '小区名称']

In [7]:
#按照最近的经纬度匹配小区
def haversine(lon1, lat1, lon2, lat2):
    """
    计算两个经纬度点之间的距离（单位：千米）
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    c = 2 * np.arcsin(np.sqrt(a))
    km = 6367 * c
    return km

In [8]:
# 对于匹配的索引的行，直接添加匹配的小区名称
fullData.loc[matching_indexes, '匹配小区'] = fullData.loc[matching_indexes, '小区名称']

# 对于不匹配的索引的行，根据经纬度匹配最近的 xiaoqu 的名称
for index, row in fullData.loc[unmatching_indexes].iterrows():
    distances = haversine(row['lon'], row['lat'], xiaoqu['coord_x'], xiaoqu['coord_y'])
    nearest_index = np.argmin(distances)
    fullData.loc[index, '匹配小区'] = xiaoqu.loc[nearest_index, '名称']

In [10]:
columns_to_merge = ['建筑年代','房屋总数','楼栋总数','绿 化 率', '容 积 率','停车位','停车费用','环线位置','平均每平米月租金']
fullData = fullData.merge(xiaoqu[['名称'] + columns_to_merge],
                          left_on='匹配小区', right_on='名称', how='left')

#补充环线信息
fullData['环线'] = fullData['环线'].combine_first(fullData['环线'])
fullData['环线'].fillna("无", inplace=True)

# 删除多余的“名称”列
fullData = fullData.drop(columns=['名称', '环线位置'])
fullData.head()

Unnamed: 0,城市,区域,板块,环线,小区名称,价格,房屋户型,所在楼层,建筑面积,套内面积,...,ID,匹配小区,建筑年代,房屋总数,楼栋总数,绿 化 率,容 积 率,停车位,停车费用,平均每平米月租金
0,0,79.0,111.0,二至三环,人定湖西里,6564200.0,2室1厅1厨1卫,中楼层 (共5层),52.3㎡,,...,,人定湖西里,1982.0,1317,19,30.0,3.0,300.0,0.0,143.621275
1,0,43.0,231.0,五至六环,龙跃苑四区,4174000.0,3室1厅1厨1卫,顶层 (共6层),127.44㎡,123.7㎡,...,,龙跃苑四区,2005.0,2317,40,30.0,1.73,1550.0,150.0,65.334173
2,0,97.0,54.0,五至六环,名都园,16310000.0,4室2厅1厨4卫,底层 (共3层),228.54㎡,,...,,名都园,2002.0,1249,565,30.1,0.64,0.0,0.0,123.502865
3,0,62.0,568.0,三至四环,保利海德公园,2834600.0,2房间2卫,低楼层 (共10层),43.6㎡,29.39㎡,...,,保利海德公园,2015.0,577,12,40.0,2.6,950.0,1150.0,234.292104
4,0,62.0,226.0,三至四环,京投银泰琨御府,1954000.0,1房间1卫,中楼层 (共10层),39.85㎡,29.94㎡,...,,京投银泰琨御府,2010.0,1685,19,60.0,1.58,1800.0,1200.0,172.492516


In [11]:
df = fullData.loc[:84132]
predict_data = fullData.loc[84133:]

In [12]:
print("合并后行数:", len(predict_data))
train_df, test_df = train_test_split(df, test_size=0.2, random_state=111)

合并后行数: 14786


In [42]:
xiaoqu.info()

In [13]:
def preprocess_data(df, is_training=True,encoders=None):
    """
    对数据进行清洗和特征工程处理，返回处理后的 DataFrame。
    
    参数:
    df - 输入的DataFrame
    is_training - 布尔值，表示是否为训练集
    encoders - 字典，包含已训练好的编码器（如 OneHotEncoder）
    scalers - 字典，包含已训练好的scaler对象（用于测试集）
    median_area_by_room - Series，包含每个室数的建筑面积中位数（用于测试集）
    
    返回:
    处理后的DataFrame，以及训练过程中学习的转换器（如果是训练集）
    """
    df = df.copy()
    
    # 1. 删除重复值和空列
    df = df.drop_duplicates().dropna(axis=1, how='all')
    
    # 2. 删除缺失率高或不用于建模的列
    NA_cols = ['套内面积', '别墅类型']
    drop_cols = ['交易权属', 'lon', 'lat','年份', '环线位置', '小区地址', '物业类别', '上次交易', '房屋用途', '房屋年限', '产权所属']
    df.drop(columns=[col for col in NA_cols + drop_cols if col in df.columns], inplace=True)
    
    # 3. 处理交易时间
    if '交易时间' in df.columns:
        df['交易时间'] = pd.to_datetime(df['交易时间'], errors='coerce')
        df['交易年份'] = df['交易时间'].dt.year
        df.drop(columns=['交易时间'], inplace=True)
    
    # 4. 填充缺失值
    df['装修情况'] = df['装修情况'].fillna('其他')
    df['配备电梯'] = df['配备电梯'].fillna('无').map({'有': 1, '无': 0})
    df['建筑结构'] = df['建筑结构'].fillna('钢混结构')
    
    # 5. 分类变量编码
    if '建筑结构' in df.columns:
        df['建筑结构编码'] = df['建筑结构'].map({'混合结构': '混合结构', '钢混结构': '钢混结构'}).fillna('其他')
    
    if '装修情况' in df.columns:
        df['装修编码'] = df['装修情况'].map({'简装': '简装', '精装': '精装', '毛坯': '毛坯'}).fillna('其他')
    
    # 6. OneHot 编码
    onehot_cols = ['建筑结构编码', '装修编码']
    existing_cols = [col for col in onehot_cols if col in df.columns]
    
    if is_training:
        # 训练阶段：初始化并训练编码器
        encoders = {} if encoders is None else encoders
        encoders['onehot_encoder'] = OneHotEncoder(handle_unknown="ignore", drop="first", sparse=False)
        
        if existing_cols:
            onehot_encoded = encoders['onehot_encoder'].fit_transform(df[existing_cols])
            onehot_cols_names = encoders['onehot_encoder'].get_feature_names_out(existing_cols)
            onehot_df = pd.DataFrame(onehot_encoded, columns=onehot_cols_names, index=df.index)
            df = pd.concat([df.drop(existing_cols, axis=1), onehot_df], axis=1)
    else:
        # 测试阶段：使用训练好的编码器
        if existing_cols and encoders and 'onehot_encoder' in encoders:
            onehot_encoded = encoders['onehot_encoder'].transform(df[existing_cols])
            onehot_cols_names = encoders['onehot_encoder'].get_feature_names_out(existing_cols)
            onehot_df = pd.DataFrame(onehot_encoded, columns=onehot_cols_names, index=df.index)
            df = pd.concat([df.drop(existing_cols, axis=1), onehot_df], axis=1)
    
    # 7. 删除原始列
    df.drop(columns=['建筑结构', '装修情况'], errors='ignore', inplace=True)
    
    if is_training:
        return df, encoders  # 返回处理后的数据和编码器
    else:
        return df  # 测试阶段只返回数据

In [14]:
train_df, encoders = preprocess_data(train_df, is_training=True)

# 4. 使用训练集学习到的转换器处理测试集
test_df = preprocess_data(test_df, is_training=False,encoders=encoders)

# 5. 当处理预测集时，也使用相同的转换器
predict_data = preprocess_data(predict_data, is_training=False, encoders=encoders)

# 建筑面积

In [15]:
def clean_area_column(df):
    """清洗‘建筑面积’列的非数字字符并转换为浮点数"""
    df['建筑面积'] = (
        df['建筑面积']
        .astype(str)
        .str.replace(r'[^0-9.]', '', regex=True)
        .astype(float)
    )
    return df

# 分别应用到每个数据集
train_df = clean_area_column(train_df)
test_df = clean_area_column(test_df)
predict_data = clean_area_column(predict_data)

# Step 2. 数据验证
# 检查转换结果
print("清洗后示例：\n", train_df['建筑面积'].sample(5))
print("\n异常值统计：")
print("零或负面积:", train_df[train_df['建筑面积'] <=0].shape[0])
print("缺失值比例:", train_df['建筑面积'].isnull().mean())
print("建筑面积描述统计：\n", train_df['建筑面积'].describe())

清洗后示例：
 17224    101.00
68541     60.13
18175     61.76
66255     66.00
49095     61.37
Name: 建筑面积, dtype: float64

异常值统计：
零或负面积: 0
缺失值比例: 0.0
建筑面积描述统计：
 count    67259.000000
mean        96.654295
std         66.199895
min         10.000000
25%         66.000000
50%         88.860000
75%        115.900000
max      10337.000000
Name: 建筑面积, dtype: float64


In [16]:
#计算99%分位数作为截断点
area_99_percentile = train_df['建筑面积'].quantile(0.99)

#截断异常值
train_df.loc[train_df['建筑面积'] > area_99_percentile, '建筑面积'] = area_99_percentile
test_df.loc[test_df['建筑面积'] > area_99_percentile, '建筑面积'] = area_99_percentile
predict_data.loc[predict_data['建筑面积'] > area_99_percentile, '建筑面积'] = area_99_percentile

# 创建管道：先标准化，再添加多项式特征
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False))
])
    
# 仅选择建筑面积列
area_train = train_df[["建筑面积"]]
area_test = test_df[["建筑面积"]]
area_predict=predict_data[["建筑面积"]]
    
# 转换训练集
area_poly_train = pipeline.fit_transform(area_train)
# 使用训练集的均值和标准差转换测试集
area_poly_test = pipeline.transform(area_test)
area_poly_predict = pipeline.transform(area_predict)
    
# 将转换后的特征添加回原始数据框
train_df = train_df.copy()
train_df["area_scaled"] = area_poly_train[:, 0]
train_df["area_squared"] = area_poly_train[:, 1]
    
test_df = test_df.copy()
test_df["area_scaled"] = area_poly_test[:, 0]
test_df["area_squared"] = area_poly_test[:, 1]
    
predict_data = predict_data.copy()
predict_data["area_scaled"] = area_poly_predict[:, 0]
predict_data["area_squared"] = area_poly_predict[:, 1]

In [17]:
# 检查数据集
print("当前所有列名:", train_df.columns.tolist())

当前所有列名: ['城市', '区域', '板块', '环线', '小区名称', '价格', '房屋户型', '所在楼层', '建筑面积', '房屋朝向', '梯户比例', '配备电梯', '房屋优势', '核心卖点', '户型介绍', '周边配套', '交通出行', '匹配小区', '建筑年代', '房屋总数', '楼栋总数', '绿 化 率', '容 积 率', '停车位', '停车费用', '平均每平米月租金', '交易年份', '建筑结构编码_混合结构', '建筑结构编码_钢混结构', '装修编码_毛坯', '装修编码_简装', '装修编码_精装', 'area_scaled', 'area_squared']


# 房屋朝向

In [18]:
#房屋朝向处理
def extract_main_direction(orientation):
    """
    规则优先级：南 > 北 > 东 > 西
    参数：
        orientation: 字符串（如"东南北"）或NaN
    返回：
        主要朝向（"南","北","东","西","其他"）
    """
    if pd.isna(orientation):
        return "其他"
    orientation = str(orientation).replace(" ", "")  # 清洗空格
    
    # 按优先级判断
    if "南" in orientation:
        return "南"
    elif "北" in orientation:
        return "北"
    elif "东" in orientation:
        return "东"
    elif "西" in orientation:
        return "西"
    else:
        return "其他"

# 训练集处理
train_df["主要朝向"] = train_df["房屋朝向"].apply(extract_main_direction)

# 验证集处理（必须使用相同处理方式）
test_df["主要朝向"] = test_df["房屋朝向"].apply(extract_main_direction)

# 预测集处理
predict_data["主要朝向"] = predict_data["房屋朝向"].apply(extract_main_direction)
        
# 初始化编码器（自动处理未知类别）
encoder = OneHotEncoder(handle_unknown="ignore", drop="first", sparse=False)
encoder.fit(train_df[["主要朝向"]])  # 只在训练集上fit

# 训练集
train_encoded = encoder.transform(train_df[["主要朝向"]])
train_df = pd.concat([
    train_df.drop(["房屋朝向", "主要朝向"], axis=1),
    pd.DataFrame(train_encoded, columns=encoder.get_feature_names_out(["主要朝向"]), index=train_df.index)
], axis=1)

# 验证集
test_encoded = encoder.transform(test_df[["主要朝向"]])
test_df = pd.concat([
    test_df.drop(["房屋朝向", "主要朝向"], axis=1),
    pd.DataFrame(test_encoded, columns=encoder.get_feature_names_out(["主要朝向"]), index=test_df.index)
], axis=1)

# 预测集
pred_encoded = encoder.transform(predict_data[["主要朝向"]])
predict_data = pd.concat([
    predict_data.drop(["房屋朝向", "主要朝向"], axis=1),
    pd.DataFrame(pred_encoded, columns=encoder.get_feature_names_out(["主要朝向"]), index=predict_data.index)
], axis=1)

# 城市_环线

In [19]:
print(train_df['环线'].isnull().sum())  # 缺失数量
print(train_df.groupby('城市')['环线'].apply(lambda x: x.isnull().mean()))  # 每个城市的缺失比例
city_region_ring_counts = train_df.groupby(['城市', '板块'])['环线'].nunique().reset_index()
city_region_ring_counts.rename(columns={'环线': '环线类别数'}, inplace=True)
# 筛选环线类别数 > 2 的 (城市, 板块)
city_region_unstable = city_region_ring_counts[city_region_ring_counts['环线类别数'] > 2]

# 显示结果
print(city_region_unstable)

# 统计这些 (城市, 板块) 组合的个数
num_unstable_regions = city_region_unstable.shape[0]
print(f"环线类别数大于 2 的 (城市, 板块) 组合个数: {num_unstable_regions}")

0
城市
0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
5    0.0
6    0.0
Name: 环线, dtype: float64
     城市     板块  环线类别数
0     0    1.0      3
56    0  226.0      3
175   0  686.0      3
186   0  749.0      3
227   2   70.0      3
233   2  122.0      3
252   2  246.0      3
255   2  269.0      3
266   2  376.0      3
321   2  804.0      3
323   2  809.0      3
327   3   18.0      3
335   3   56.0      3
352   3  152.0      3
464   3  699.0      3
474   3  732.0      3
487   4    6.0      3
489   4   22.0      3
492   4   50.0      3
494   4   74.0      3
495   4  102.0      3
505   4  192.0      3
506   4  195.0      3
515   4  273.0      3
517   4  314.0      3
522   4  366.0      4
529   4  462.0      3
532   4  466.0      3
536   4  476.0      3
538   4  486.0      3
539   4  489.0      3
540   4  496.0      3
541   4  538.0      3
543   4  555.0      3
553   4  649.0      3
554   4  700.0      3
556   4  704.0      3
559   4  726.0      3
560   4  738.0      3
561   4  741.0      3
563  

In [20]:
def train_fill_missing_环线(train_df):
    """仅在训练集计算填补规则"""
   
    # 规则1: 城市1和5的环线固定为"无环线"
    city_special_rules = {1 : '无环线', 5 : '无环线'}
   
    # 规则2: 按板块→区域→城市层级计算环线众数
    板块_环线_map = train_df.groupby("板块")["环线"].apply(lambda x: x.mode()[0] if not x.mode().empty else np.nan).to_dict()
    区域_环线_map = train_df.groupby("区域")["环线"].apply(lambda x: x.mode()[0] if not x.mode().empty else np.nan).to_dict()
    城市_环线_map = train_df.groupby("城市")["环线"].apply(lambda x: x.mode()[0] if not x.mode().empty else np.nan).to_dict()
   
    return {
        'city_special_rules': city_special_rules,
        '板块_环线_map': 板块_环线_map,
        '区域_环线_map': 区域_环线_map,
        '城市_环线_map': 城市_环线_map
    }

def apply_fill_missing_环线(df, fill_rules):
    """应用填补规则到任意数据集"""
   
    # 深拷贝避免修改原数据
    df = df.copy()
   
    # 应用特殊城市规则
    special_cities = fill_rules['city_special_rules'].keys()
    df.loc[df["城市"].astype(int).isin(special_cities), "环线"] = df["城市"].map(fill_rules['city_special_rules'])
   
    # 按层级填补
    df["环线_填补"] = (
        df["环线"]
        .fillna(df["板块"].map(fill_rules['板块_环线_map']))
        .fillna(df["区域"].map(fill_rules['区域_环线_map']))
        .fillna(df["城市"].map(fill_rules['城市_环线_map']))
    )
   
    return df

# 只在训练集计算填补规则
fill_rules = train_fill_missing_环线(train_df)

# 应用规则到所有数据集
train_df = apply_fill_missing_环线(train_df, fill_rules)
test_df = apply_fill_missing_环线(test_df, fill_rules)
predict_data = apply_fill_missing_环线(predict_data, fill_rules)

# 创建城市_环线变量
for df in [train_df, test_df, predict_data]:
    df["城市_环线"] = df["城市"].astype(str) + "_" + df["环线_填补"].astype(str)

In [21]:
#绘图-城市_环线与建筑面积
plt.figure(figsize=(12, 8))
sns.lmplot(
    x='建筑面积',
    y='价格',
    hue='城市_环线',
    data=train_df,
    height=6,
    aspect=1.5,
    scatter_kws={'alpha': 0.3},  # 设置点透明度
    line_kws={'linewidth': 2}    # 设置回归线粗细
)
plt.title('建筑面积 vs 价格（按城市环线分组）', fontsize=14)
plt.xlabel('建筑面积 (㎡)', fontsize=12)
plt.ylabel('价格', fontsize=12)
plt.show()

<Figure size 864x576 with 0 Axes>

In [22]:
# 初始化编码器（参数不变）
encoder = OneHotEncoder(
    sparse=True,
    handle_unknown="ignore",
    drop="first"
)

# 只在训练集拟合
encoder.fit(train_df[["城市_环线"]])

# 转换所有数据集
def encode_city_ring(df, encoder):
    encoded = encoder.transform(df[["城市_环线"]])
    
    # 修改为兼容新版pandas的代码
    try:
        # 尝试使用老方法
        encoded_df = pd.DataFrame.sparse.from_spmatrix(
            encoded,
            columns=encoder.get_feature_names_out(["城市_环线"]),
            index=df.index
        )
    except AttributeError:
        # 如果不支持，使用新方法
        encoded_df = pd.DataFrame(
            encoded.toarray(),
            columns=encoder.get_feature_names_out(["城市_环线"]),
            index=df.index
        )
    return pd.concat([df.drop("城市_环线", axis=1), encoded_df], axis=1)

train_df = encode_city_ring(train_df, encoder)
test_df = encode_city_ring(test_df, encoder)
predict_data = encode_city_ring(predict_data, encoder)

# 检查填补后缺失值
for name, df in zip(["Train", "Test", "Predict"], [train_df, test_df, predict_data]):
    print(f"{name}环线缺失数:", df["环线_填补"].isna().sum())

# 查看编码特征维度（应全部相同）
print("编码特征:", encoder.get_feature_names_out())

Train环线缺失数: 0
Test环线缺失数: 0
Predict环线缺失数: 0
编码特征: ['城市_环线_0_二环内' '城市_环线_0_二至三环' '城市_环线_0_五至六环' '城市_环线_0_六环外' '城市_环线_0_四至五环'
 '城市_环线_0_无' '城市_环线_1_无环线' '城市_环线_2_内环内' '城市_环线_2_内环至外环' '城市_环线_2_外环外'
 '城市_环线_2_无' '城市_环线_3_中环至外环' '城市_环线_3_内环内' '城市_环线_3_内环至中环' '城市_环线_3_外环外'
 '城市_环线_3_无' '城市_环线_4_一环内' '城市_环线_4_一至二环' '城市_环线_4_三至四环' '城市_环线_4_二至三环'
 '城市_环线_4_四环外' '城市_环线_4_无' '城市_环线_5_无环线' '城市_环线_6_一环内' '城市_环线_6_一至二环'
 '城市_环线_6_三环外' '城市_环线_6_二至三环' '城市_环线_6_无']


In [23]:
#创建交互项
city_ring_columns = [col for col in train_df.columns if col.startswith("城市_环线_")]
def add_interaction_terms(df, city_ring_cols):
    for col in city_ring_cols:
        df[f"{col}_建筑面积"] = df[col] * df["建筑面积"]
    return df

train_df = add_interaction_terms(train_df, city_ring_columns)
test_df = add_interaction_terms(test_df, city_ring_columns)
predict_data = add_interaction_terms(predict_data, city_ring_columns)

from sklearn.preprocessing import StandardScaler
import numpy as np

# 仅标准化交互项
interaction_columns = [col for col in train_df.columns if "城市_环线_" in col and "建筑面积" in col]

# 为每个交互项分别处理
for col in interaction_columns:
    # 检查是否是稀疏数据并适当处理
    scaler = StandardScaler(with_mean=False)
    
    # 训练集
    if hasattr(train_df[col], 'sparse'):
        # 如果是SparseArray类型
        train_df[col] = scaler.fit_transform(np.array(train_df[col]).reshape(-1, 1))
    elif isinstance(train_df[col].array, pd.arrays.SparseArray):
        # 新版pandas中的稀疏数组处理
        train_df[col] = scaler.fit_transform(train_df[col].array.to_numpy().reshape(-1, 1))
    else:
        # 常规数组处理
        train_df[col] = scaler.fit_transform(np.array(train_df[col]).reshape(-1, 1))
    
    # 测试集 (同样的处理逻辑)
    if hasattr(test_df[col], 'sparse'):
        test_df[col] = scaler.transform(np.array(test_df[col]).reshape(-1, 1))
    elif isinstance(test_df[col].array, pd.arrays.SparseArray):
        test_df[col] = scaler.transform(test_df[col].array.to_numpy().reshape(-1, 1))
    else:
        test_df[col] = scaler.transform(np.array(test_df[col]).reshape(-1, 1))
    
    # 预测集 (同样的处理逻辑)
    if hasattr(predict_data[col], 'sparse'):
        predict_data[col] = scaler.transform(np.array(predict_data[col]).reshape(-1, 1))
    elif isinstance(predict_data[col].array, pd.arrays.SparseArray):
        predict_data[col] = scaler.transform(predict_data[col].array.to_numpy().reshape(-1, 1))
    else:
        predict_data[col] = scaler.transform(np.array(predict_data[col]).reshape(-1, 1))

# 梯户比例

In [52]:
def process_elevator_household_ratio(df):
    """处理梯户比例字段"""
    # 创建副本避免修改原始数据
    df_copy = df.copy()
    
    # 填充缺失值
    df_copy['梯户比例'] = df_copy['梯户比例'].fillna('无')
    
    # 提取梯户比例中的数值
    def extract_ratio(ratio_str):
        if not isinstance(ratio_str, str) or ratio_str == '无':
            return np.nan
        
        # 匹配形如"2梯4户"的格式
        match = re.search(r'(\d+)梯(\d+)户', ratio_str)
        if match:
            elevators = int(match.group(1))
            households = int(match.group(2))
            return households / elevators if elevators > 0 else np.nan
        return np.nan
    
    # 计算每个梯的户数
    df_copy['户梯比'] = df_copy['梯户比例'].apply(extract_ratio)
    
    # 先检查是否有有效值，如果全部为NaN则无法计算中位数
    if df_copy['户梯比'].notna().any():
        # 按小区分组填充，仅在小区有足够有效值时有效
        valid_communities = df_copy.groupby('小区名称')['户梯比'].transform(lambda x: x.notna().sum() > 0)
        
        # 对有效小区进行分组填充
        for community in df_copy.loc[valid_communities, '小区名称'].unique():
            community_data = df_copy.loc[df_copy['小区名称'] == community, '户梯比']
            if community_data.notna().any():  # 确保小区有有效值
                median_value = community_data.median()
                df_copy.loc[df_copy['小区名称'] == community, '户梯比'] = community_data.fillna(median_value)
        
        # 使用全局中位数填充剩余的NaN值
        global_median = df_copy['户梯比'].median()
        if not np.isnan(global_median):  # 确保全局中位数不是NaN
            df_copy['户梯比'] = df_copy['户梯比'].fillna(global_median)
        else:
            # 如果全局中位数是NaN，则使用一个默认值
            df_copy['户梯比'] = df_copy['户梯比'].fillna(2.0)  # 假设默认值是2.0（每梯2户）
    else:
        # 如果所有值都是NaN，则使用默认值
        df_copy['户梯比'] = 2.0  # 默认值
    
    return df_copy

# 处理数据
train_df = process_elevator_household_ratio(train_df)
test_df = process_elevator_household_ratio(test_df)
predict_data = process_elevator_household_ratio(predict_data)

In [53]:
print("处理后 train_df 的缺失值统计:")
print(train_df['户梯比'].isna().sum())

print("\n处理后 test_df 的缺失值统计:")
print(test_df['户梯比'].isna().sum())

print("\n处理后 predict_data 的缺失值统计:")
print(predict_data['户梯比'].isna().sum())

处理后 train_df 的缺失值统计:
0

处理后 test_df 的缺失值统计:
0

处理后 predict_data 的缺失值统计:
0


# 厅室数量

In [25]:
# 处理房屋户型（室厅厨卫）
def extract_room_info(house_type):
    """提取房屋户型信息的函数"""
    # 检查输入是否为字符串，非字符串的情况返回默认值
    if not isinstance(house_type, str):
        return {'室': 0, '厅': 0, '厨': 0, '卫': 0}
    
    # 使用正则表达式提取室、厅、厨、卫的数据
    room_data = re.findall(r'(\d+)室|(\d+)厅|(\d+)厨|(\d+)卫', house_type)
    
    # 默认初始值为0
    room_count = {'室': 0, '厅': 0, '厨': 0, '卫': 0}
    
    # 提取匹配的数据
    for match in room_data:
        for i, category in enumerate(['室', '厅', '厨', '卫']):
            if match[i]:
                room_count[category] = int(match[i])
    
    return room_count

def process_house_type(train_df, test_df, predict_data):
    """处理所有数据集的房屋户型"""
    # 1. 填充缺失值为空字符串，避免正则表达式报错
    for df in [train_df, test_df, predict_data]:
        df['房屋户型'] = df['房屋户型'].fillna('')
    
    # 2. 提取户型信息到新列
    for df in [train_df, test_df, predict_data]:
        for room_type in ['室', '厅', '厨', '卫']:
            df[room_type] = df['房屋户型'].apply(lambda x: extract_room_info(x)[room_type])
    
    # 3. 计算训练集的中位数，用于填充缺失值
    room_medians = {
        '室': train_df['室'].median(),
        '厅': train_df['厅'].median()
    }
    
    # 4. 使用中位数和常见值填充缺失值
    for df in [train_df, test_df, predict_data]:
        df['室'] = df['室'].fillna(room_medians['室'])
        df['厅'] = df['厅'].fillna(room_medians['厅'])
        df['厨'] = df['厨'].fillna(1)  # 使用常见值1填充
        df['卫'] = df['卫'].fillna(1)  # 使用常见值1填充
    
    return train_df, test_df, predict_data

# 处理所有数据集的房屋户型
train_df, test_df, predict_data = process_house_type(train_df, test_df, predict_data)

#  区域、板块

In [26]:
train_df['log_价格'] = np.log1p(train_df['价格'])
test_df['log_价格'] = np.log1p(test_df['价格'])
# 对区域、板块进行 Target Encoding，区域板块类别多，使用独热编码维度过高
#训练集：fit_transform，需要使用目标变量 log_价格
target_enc = TargetEncoder(cols=['区域', '板块'])
train_df[['区域', '板块']] = target_enc.fit_transform(train_df[['区域', '板块']], train_df['log_价格'])
#验证集
test_df[['区域', '板块']] = target_enc.transform(test_df[['区域', '板块']])
#测试集：直接 transform（测试集没有价格信息）
predict_data[['区域', '板块']] = target_enc.transform(predict_data[['区域', '板块']])

#对 Target Encoding 后的区域和板块进行归一化到 [-1,1]，有利于后续正则化
minmax_scaler = MinMaxScaler(feature_range=(-1, 1))
train_df[['区域', '板块']] = minmax_scaler.fit_transform(train_df[['区域', '板块']])
test_df[['区域', '板块']] = minmax_scaler.transform(test_df[['区域', '板块']])
predict_data[['区域', '板块']] = minmax_scaler.transform(predict_data[['区域', '板块']])

# 自然语言处理

## 高频词

In [27]:
from collections import Counter

In [40]:
#因为核心卖点与户型介绍有很多重复的部分，所以合并起来
#train_df['卖点-户型'] = train_df['核心卖点'] + train_df['户型介绍']
#test_df['卖点-户型'] = test_df['核心卖点'] + test_df['户型介绍']
#predict_data['卖点-户型'] = predict_data['核心卖点'] + predict_data['户型介绍']

In [28]:
#寻找高频词函数
def extract_high_frequency_words(df, column_name, top_n=10):
    # 将指定列的空值替换为空字符串
    df[column_name] = df[column_name].fillna('')

    # 定义一个函数用于分词
    def tokenize(text):
        # 使用正则表达式匹配中文词语
        words = re.findall(r'[\u4e00-\u9fa5]+', text)
        return words

    # 对指定列进行分词
    all_words = []
    for description in df[column_name]:
        words = tokenize(description)
        all_words.extend(words)

    # 统计词频
    word_counts = Counter(all_words)

    # 显示前 top_n 个高频词
    return word_counts.most_common(top_n)

In [29]:
print("周边配套高频词:",extract_high_frequency_words(train_df,'周边配套', top_n=33))
print("交通出行高频词:",extract_high_frequency_words(train_df,'交通出行', top_n=20))
print("房屋优势高频词:",extract_high_frequency_words(train_df,'房屋优势', top_n=20))

周边配套高频词: [('米', 7920), ('医院', 5225), ('市', 4255), ('公里', 3388), ('建设银行', 3210), ('银行', 3201), ('工商银行', 2820), ('商场', 2589), ('路', 2431), ('公园', 2377), ('永辉', 2155), ('中国银行', 2047), ('农业银行', 1917), ('号线', 1898), ('华润万家', 1283), ('家乐福', 1275), ('沃尔玛', 1230), ('地铁', 1189), ('购物', 1100), ('医疗', 1086), ('配套齐全', 957), ('交通银行', 939), ('人人乐', 900), ('电影院', 841), ('小区', 834), ('出行方便', 827), ('生活便利', 781), ('物美', 777), ('距离', 761), ('距离小区', 737), ('交通便利', 734), ('招商银行', 703), ('万达广场', 696)]
交通出行高频词: [('路', 71557), ('米', 11689), ('号线', 10907), ('地铁', 5212), ('公里', 4534), ('出行方便', 3986), ('有', 3103), ('公交', 3072), ('交通便利', 2959), ('轻轨', 1864), ('等', 1825), ('距离', 1619), ('出行便利', 1406), ('路等', 1397), ('米左右', 1276), ('公交线路', 1204), ('线', 1134), ('路公交车', 1106), ('距离地铁', 1022), ('公交站', 924)]
房屋优势高频词: [('装修', 27847), ('地铁', 24922), ('房本满五年', 24120), ('房本满两年', 13072)]


In [32]:
# 关键词字典
keywords = {
    "交通": ["地铁", "公交", "轻轨","主干道","站点","交通","号线","路","米"],
    "配套": ["医院", "诊所","诊所","超市", "商场", "商圈","购物", "百货", "广场", "公园", "银行", "图书馆","绿地","电影院",
             "家乐福","永辉","沃尔玛","华润万家","物美","人人乐"],
    "学区": ["学校", "学区","重点","幼儿园","小学","中学","大学"],
    "税费": ["满五唯一","税费","免税","满两年","满五年"],
}

# 处理文本特征的函数
def extract_keywords(text, keyword_list):
    if pd.isna(text):
        return 0  # 缺失值填充为0
    for word in keyword_list:
        if word in text:
            return 1  # 只要出现关键词，就记为1
    return 0

# 需要处理的字段
text_columns = ["房屋优势", "核心卖点", "周边配套", "交通出行"]

#处理所有数据集
def process_text_features(df):
    for category, words in keywords.items():
        df[f"{category}_关键词"] = 0
        for col in text_columns:
            df[f"{category}_关键词"] |= df[col].apply(lambda x: extract_keywords(x, words))
    # 处理户型介绍变量
    df["户型介绍_是否填写"] = df["户型介绍"].notna().astype(int)
    return df

# 处理所有数据集
train_df = process_text_features(train_df)
test_df = process_text_features(test_df)
predict_data = process_text_features(predict_data)

# 按小区填补缺失值（针对“交通_关键词”和“配套_关键词”）
def fill_missing_by_group(df, group_col, fill_cols):
    for col in fill_cols:
        df[col] = df.groupby(group_col)[col].transform(lambda x: x.fillna(x.max()))
    return df

fill_columns = ["交通_关键词", "配套_关键词","学区_关键词"]

# 对所有数据集按小区填补
train_df = fill_missing_by_group(train_df, "小区名称", fill_columns)
test_df = fill_missing_by_group(test_df, "小区名称", fill_columns)
predict_data = fill_missing_by_group(predict_data, "小区名称", fill_columns)

# 计算匹配的关键词类别数量
def count_keyword_categories(df):
    df["关键词匹配数量"] = (
        df["交通_关键词"] + 
        df["配套_关键词"] + 
        df["税费_关键词"] +
        df["户型介绍_是否填写"]
    )
    return df

# 处理所有数据集
train_df = count_keyword_categories(train_df)
test_df = count_keyword_categories(test_df)
predict_data = count_keyword_categories(predict_data)

In [30]:
print("train_df 类型:", type(train_df))

train_df 类型: <class 'pandas.core.frame.DataFrame'>


# 楼层处理

In [33]:
def extract_floor_info(s):
    try:
        
        # 优先检查是否包含地下室
        if "地下室" in s:
            # 地下室逻辑：总楼层设为1，当前楼层设为0或-1（根据业务需求）
            return -1, 1  # 当前楼层=0（表示地下室），总楼层=1
        total = re.search(r'共(\d+)层', s).group(1)
        level = re.search(r'(\d+)层', s.split('(')[0])

        if level:
            current = level.group(1)
        else:
            desc = s.split('(')[0].strip()
            total = int(total)
            if '低' in desc:
                current = total * 0.25
            elif '中' in desc:
                current = total * 0.5
            elif '高' in desc:
                current = total * 0.75
            elif '底' in desc:
                current = 1
            else:
                current = total * 1
        return int(float(current)), int(total)
    except:
        return np.nan, np.nan

for df in [train_df,test_df,predict_data]:
    df[['当前楼层', '总楼层']] = df['所在楼层'].apply(
        lambda x: pd.Series(extract_floor_info(x) if isinstance(x, str) else (np.nan, np.nan)))

# 检查 数据集 里是否有 '总楼层'
print(train_df.columns)  
print(predict_data.columns)

def add_floor_features(*dfs):
    """
    批量添加楼层相关特征到多个DataFrame（直接修改原DataFrame）
    参数:
        *dfs: 一个或多个包含"总楼层"和"当前楼层"的DataFrame
    """
    for df in dfs:
        # 1. 基本特征
        df["多层住宅"] = (df["总楼层"] <= 6).astype(int)
        
        # 2. 黄金楼层特征
        df["黄金楼层"] = (
            ((df["多层住宅"] == 1) & (df["当前楼层"].between(2, 4))) |
            ((df["多层住宅"] == 0) & (df["当前楼层"] >= (df["总楼层"] * 0.5)))
        ).astype(int)
        
        # 3. 楼层占比特征
        df["楼层占比"] = df["当前楼层"] / df["总楼层"]
        df["楼层占比_平方"] = df["楼层占比"] ** 2
        
        # 4. 交互特征
        df["楼层_电梯交互"] = df["楼层占比"] * df["多层住宅"]
        df["楼层_高层交互"] = df["楼层占比"] * (1 - df["多层住宅"])

# 处理所有数据集
add_floor_features(train_df, test_df, predict_data)

Index(['城市', '区域', '板块', '环线', '小区名称', '价格', '房屋户型', '所在楼层', '建筑面积', '梯户比例',
       ...
       '卫', 'log_价格', '交通_关键词', '配套_关键词', '学区_关键词', '税费_关键词', '户型介绍_是否填写',
       '关键词匹配数量', '当前楼层', '总楼层'],
      dtype='object', length=107)
Index(['城市', '区域', '板块', '环线', '小区名称', '房屋户型', '所在楼层', '建筑面积', '梯户比例', '配备电梯',
       ...
       '厨', '卫', '交通_关键词', '配套_关键词', '学区_关键词', '税费_关键词', '户型介绍_是否填写',
       '关键词匹配数量', '当前楼层', '总楼层'],
      dtype='object', length=106)


In [34]:
def extract_floor_info(s):
    try:
        if not isinstance(s, str):
            return np.nan, np.nan
            
        # 处理地下室情况
        if "地下室" in s:
            return -1, 1
            
        # 匹配总楼层：从括号内提取"共XX层"
        total_match = re.search(r'共(\d+)层', s)
        if not total_match:
            return np.nan, np.nan
        total = int(total_match.group(1))
        
        # 匹配当前楼层描述（括号前部分）
        desc = s.split('(')[0].strip()
        
        # 处理明确楼层数字的情况（如"3层 (共26层)"）
        level_match = re.search(r'(\d+)层', desc)
        if level_match:
            return int(level_match.group(1)), total
            
        # 处理文字描述（低/中/高/底层）
        if '低' in desc:
            if total <= 2:  # 总楼层为1或2时，当前楼层取1
                current = 1
            else:
                current = round(total * 0.25)
        elif '中' in desc:
            current = round(total * 0.5)
        elif '高' in desc:
            current = round(total * 0.75)
        elif '底' in desc:
            current = 1
        else:  # 顶层
            current = total
            
        return min(int(current), total), total  # 确保不超过总楼层
    except:
        return np.nan, np.nan

for df in [train_df,test_df,predict_data]:
    df[['当前楼层', '总楼层']] = df['所在楼层'].apply(
        lambda x: pd.Series(extract_floor_info(x) if isinstance(x, str) else (np.nan, np.nan)))

In [25]:
# 检查训练集
print("训练集中当前楼层为0的记录数量:", train_df[train_df["当前楼层"] == 0].shape[0])

nan_count = train_df['当前楼层'].isna().sum()
print("当前楼层中NaN值的个数:", nan_count)

In [35]:
def add_floor_features(*dfs):
    """
    批量添加楼层相关特征到多个DataFrame（直接修改原DataFrame）
    参数:
        *dfs: 一个或多个包含"总楼层"和"当前楼层"的DataFrame
    """
    for df in dfs:
        # 1. 基本特征
        df["多层住宅"] = (df["总楼层"] <= 6).astype(int)
        
        # 2. 黄金楼层特征
        df["黄金楼层"] = (
            ((df["多层住宅"] == 1) & (df["当前楼层"].between(2, 4))) |
            ((df["多层住宅"] == 0) & (df["当前楼层"] >= (df["总楼层"] * 0.5)))
        ).astype(int)
        
        # 3. 楼层占比特征
        df["楼层占比"] = df["当前楼层"] / df["总楼层"]
        df["楼层占比_平方"] = df["楼层占比"] ** 2
        
        # 4. 交互特征
        df["楼层_电梯交互"] = df["楼层占比"] * df["多层住宅"]
        df["楼层_高层交互"] = df["楼层占比"] * (1 - df["多层住宅"])

# 使用示例：一次性处理所有数据集
add_floor_features(train_df, test_df, predict_data)

In [36]:
# 检查训练集
print("训练集中楼层占比为0的记录数量:", train_df[train_df["楼层占比"] == 0].shape[0])

nan_count = train_df['楼层占比'].isna().sum()
print("楼层占比中NaN值的个数:", nan_count)

训练集中楼层占比为0的记录数量: 0
楼层占比中NaN值的个数: 0


# 房龄处理

In [38]:
for df in [train_df, test_df, predict_data]:
    df['房龄'] = df['交易年份'] - df['建筑年代']

# 检查房龄范围
print(train_df['房龄'].describe())

count    64438.000000
mean        17.078773
std         10.267413
min          1.000000
25%          9.000000
50%         14.000000
75%         23.000000
max         69.000000
Name: 房龄, dtype: float64


In [39]:
print(train_df.groupby("多层住宅")["房龄"].describe())
# 计算相关性矩阵
corr_matrix = train_df[['房龄','多层住宅','建筑面积',"总楼层","容 积 率"]].corr()
print(corr_matrix['房龄'])  # 观察房龄与面积、总楼层的相关性
import seaborn as sns
# 绘制分组箱线图
import matplotlib.pyplot as plt
# 设置图形风格
sns.set_style("whitegrid")
plt.figure(figsize=(8, 6))

# 绘制箱线图
ax = sns.boxplot(x="多层住宅", y="房龄", data=train_df, palette="Set2")

# 添加标题和坐标轴标签
ax.set_title("Boxplot of Building Age by Multi-Story Residential", fontsize=14, fontweight='bold')
ax.set_xlabel("Multi-Story Residential", fontsize=12)
ax.set_ylabel("Building Age", fontsize=12)

# 调整刻度字体大小
ax.tick_params(axis='both', labelsize=10)

# 显示图形
plt.show()
# 检查缺失比例
missing_rate_by_group = train_df.groupby("多层住宅")["房龄"].apply(lambda x: x.isna().mean())
print(missing_rate_by_group)

        count       mean        std  min   25%   50%   75%   max
多层住宅                                                            
0     52240.0  15.456432   9.244435  1.0   9.0  13.0  20.0  54.0
1     12198.0  24.026726  11.461494  1.0  15.0  23.0  33.0  69.0
房龄       1.000000
多层住宅     0.326996
建筑面积    -0.174725
总楼层     -0.413368
容 积 率   -0.128771
Name: 房龄, dtype: float64


多层住宅
0    0.044798
1    0.029517
Name: 房龄, dtype: float64


In [40]:
#1. 在训练集上计算各组的房龄中位数
median_age_multilayer = train_df[train_df["多层住宅"] == 1]["房龄"].median()  # 多层住宅（1）的中位数
median_age_non_multilayer = train_df[train_df["多层住宅"] == 0]["房龄"].median()  # 非多层住宅（0）的中位数

# 2. 定义填充函数
def fill_age_by_building_type(df, median_multilayer, median_non_multilayer):
    # 处理极端值
    df.loc[df["房龄"] > 100, "房龄"] = 100
    df.loc[df["房龄"] < 0, "房龄"] = np.nan
    
    # 分组填充缺失值
    df["房龄"] = df.apply(
        lambda row: median_multilayer if (pd.isna(row["房龄"]) and row["多层住宅"] == 1)
                   else (median_non_multilayer if (pd.isna(row["房龄"]) and row["多层住宅"] == 0)
                   else row["房龄"]),
        axis=1
    )
    return df

# 3. 应用填充
train_df = fill_age_by_building_type(train_df, median_age_multilayer, median_age_non_multilayer)
test_df = fill_age_by_building_type(test_df, median_age_multilayer, median_age_non_multilayer)
predict_data = fill_age_by_building_type(predict_data, median_age_multilayer, median_age_non_multilayer)

 #检查房龄列中的NaN数量
nan_count = train_df['房龄'].isna().sum()
print("房龄中NaN值的个数:", nan_count)

房龄中NaN值的个数: 0


In [41]:
train_df['面积_房龄'] = train_df['area_scaled'] * train_df['房龄']
test_df['面积_房龄'] = test_df['area_scaled'] * test_df['房龄']
predict_data['面积_房龄']=predict_data['area_scaled'] * predict_data['房龄']

# 对房龄做非线性转换
train_df['房龄_平方'] = train_df['房龄'] ** 2
test_df['房龄_平方'] = test_df['房龄'] ** 2
predict_data['房龄_平方'] = predict_data['房龄'] ** 2

# 对新特征进行标准化
new_features = ['面积_房龄', '房龄_平方']
scaler_new = StandardScaler()
train_df[new_features] = scaler_new.fit_transform(train_df[new_features])
test_df[new_features] = scaler_new.transform(test_df[new_features])
predict_data[new_features] = scaler_new.transform(predict_data[new_features])

# 需要标准化的特征
standard_features = ['房龄', '楼层占比', '关键词匹配数量','楼层占比_平方', '楼层_电梯交互', '楼层_高层交互']

# 对连续特征进行标准化
scaler = StandardScaler()
train_df[standard_features] = scaler.fit_transform(train_df[standard_features])
test_df[standard_features] = scaler.transform(test_df[standard_features])
predict_data[standard_features] = scaler.transform(predict_data[standard_features])

In [42]:
# 检查数据集中的列
print("当前所有列名:", train_df.columns.tolist())

当前所有列名: ['城市', '区域', '板块', '环线', '小区名称', '价格', '房屋户型', '所在楼层', '建筑面积', '梯户比例', '配备电梯', '房屋优势', '核心卖点', '户型介绍', '周边配套', '交通出行', '匹配小区', '建筑年代', '房屋总数', '楼栋总数', '绿 化 率', '容 积 率', '停车位', '停车费用', '平均每平米月租金', '交易年份', '建筑结构编码_混合结构', '建筑结构编码_钢混结构', '装修编码_毛坯', '装修编码_简装', '装修编码_精装', 'area_scaled', 'area_squared', '主要朝向_北', '主要朝向_南', '主要朝向_西', '环线_填补', '城市_环线_0_二环内', '城市_环线_0_二至三环', '城市_环线_0_五至六环', '城市_环线_0_六环外', '城市_环线_0_四至五环', '城市_环线_0_无', '城市_环线_1_无环线', '城市_环线_2_内环内', '城市_环线_2_内环至外环', '城市_环线_2_外环外', '城市_环线_2_无', '城市_环线_3_中环至外环', '城市_环线_3_内环内', '城市_环线_3_内环至中环', '城市_环线_3_外环外', '城市_环线_3_无', '城市_环线_4_一环内', '城市_环线_4_一至二环', '城市_环线_4_三至四环', '城市_环线_4_二至三环', '城市_环线_4_四环外', '城市_环线_4_无', '城市_环线_5_无环线', '城市_环线_6_一环内', '城市_环线_6_一至二环', '城市_环线_6_三环外', '城市_环线_6_二至三环', '城市_环线_6_无', '城市_环线_0_二环内_建筑面积', '城市_环线_0_二至三环_建筑面积', '城市_环线_0_五至六环_建筑面积', '城市_环线_0_六环外_建筑面积', '城市_环线_0_四至五环_建筑面积', '城市_环线_0_无_建筑面积', '城市_环线_1_无环线_建筑面积', '城市_环线_2_内环内_建筑面积', '城市_环线_2_内环至外环_建筑面积', '城市_环线_2_外环外_建筑面积', '城市_环线_2_无_建筑面积', '城市_环线_3_中

# 训练模型

In [43]:
# 检查 DataFrame 是否包含稀疏列
if any(pd.api.types.is_sparse(train_df[col]) for col in train_df.columns):
    print("DataFrame 包含稀疏列！")
else:
    print("DataFrame 不含稀疏列，可能是普通数值数据。")

# 如果数据是 pandas.DataFrame 且含稀疏列
for col in train_df.columns:
    if pd.api.types.is_sparse(train_df[col]):
        train_df[col] = train_df[col].sparse.to_dense()  # 逐列转换
for col in test_df.columns:
    if pd.api.types.is_sparse(test_df[col]):
        test_df[col] = test_df[col].sparse.to_dense()
for col in predict_data.columns:
    if pd.api.types.is_sparse(predict_data[col]):
        predict_data[col] = predict_data[col].sparse.to_dense()

DataFrame 包含稀疏列！


In [46]:
#lasso alpha与特征选择
# 特征选择
selected_features =['区域','板块','area_scaled', 'area_squared','配备电梯', '平均每平米月租金',
               '绿 化 率', '容 积 率', '停车位', '停车费用','室', '厅', '厨', '卫','户梯比',
               '房龄', '楼层占比','面积_房龄','楼层_电梯交互', '楼层_高层交互','关键词匹配数量','学区_关键词',
               '建筑结构编码_混合结构', '建筑结构编码_钢混结构', '装修编码_简装', '装修编码_精装','装修编码_毛坯',
                *city_ring_columns,*[f"{col}_建筑面积" for col in city_ring_columns],
               '主要朝向_北', '主要朝向_南', '主要朝向_西']

# 准备数据
y_train = train_df['log_价格']
y_test = test_df['log_价格']
X_train = train_df[selected_features]
X_test = test_df[selected_features]

# Lasso参数网格搜索
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
lasso = Lasso(max_iter=10000)  # 增加迭代次数确保收敛
grid_search = GridSearchCV(lasso, param_grid, cv=6, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# 输出最佳参数
print("Best alpha for Lasso:", grid_search.best_params_['alpha'])

# 获取最佳模型
best_lasso = grid_search.best_estimator_  

# 查看特征系数
lasso_coefs = pd.DataFrame({
    'Feature': X_train.columns, 
    'Coefficient': best_lasso.coef_
}).sort_values('Coefficient', key=abs, ascending=False)

# 获取非零系数特征
important_features = lasso_coefs[lasso_coefs['Coefficient'] != 0]['Feature'].tolist()

print("\n重要特征（按系数绝对值排序）：")
print(lasso_coefs)

print("\n最终选择的非零特征：")
print(important_features)

In [57]:
from sklearn.linear_model import ElasticNetCV

selected_features =['区域','板块','area_scaled', 'area_squared','配备电梯', '平均每平米月租金',
               '绿 化 率', '容 积 率', '停车位', '停车费用','室', '厅', '厨', '卫','户梯比',
               '房龄', '楼层占比','面积_房龄','楼层_电梯交互', '楼层_高层交互','关键词匹配数量','学区_关键词',
               '建筑结构编码_混合结构', '建筑结构编码_钢混结构', '装修编码_简装', '装修编码_精装','装修编码_毛坯',
                *city_ring_columns,*[f"{col}_建筑面积" for col in city_ring_columns],
               '主要朝向_北', '主要朝向_南', '主要朝向_西']
# 因变量
y_train = train_df['log_价格']
y_test = test_df['log_价格']

X_train = train_df[selected_features]
X_test = test_df[selected_features]

# 定义一组 alpha 和 l1_ratio 值
alphas = np.logspace(-4, 2, 20)
l1_ratios = np.linspace(0.1, 0.9, 9)

# 使用 ElasticNetCV
elastic_cv = ElasticNetCV(
    alphas=alphas, l1_ratio=l1_ratios, 
    cv=5, max_iter=10000, random_state=42
)
elastic_cv.fit(X_train, y_train)

# 获取最佳参数和系数
best_alpha = elastic_cv.alpha_
best_l1_ratio = elastic_cv.l1_ratio_
coefficients = elastic_cv.coef_

# 查看非零系数对应的特征
features = X_train.columns
important_features = [(feature, coef) for feature, coef in zip(features, coefficients) if coef != 0]
important_features.sort(key=lambda x: abs(x[1]), reverse=True)

# 打印重要特征及其系数
for feature, coef in important_features:
    print(f"{feature}: {coef:.6f}")

In [60]:
print(best_alpha)
print(best_l1_ratio)
print(coefficients)

In [33]:
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

selected_features =['区域','板块','area_scaled', 'area_squared','配备电梯', '平均每平米月租金',
               '绿 化 率', '容 积 率', '停车位', '停车费用','室', '厅', '厨', '卫', '户梯比',
               '房龄', '楼层占比','面积_房龄','楼层_电梯交互', '楼层_高层交互','关键词匹配数量','学区_关键词',
               '建筑结构编码_混合结构', '建筑结构编码_钢混结构', '装修编码_简装', '装修编码_精装','装修编码_毛坯',
                *city_ring_columns,*[f"{col}_建筑面积" for col in city_ring_columns],
               '主要朝向_北', '主要朝向_南', '主要朝向_西']

# 准备数据
X = train_df[selected_features]
y = train_df['log_价格']

# 添加常数项
X = sm.add_constant(X)

# 初始模型（包含所有特征）
model = sm.OLS(y, X).fit()

# 逐步向后剔除
p_threshold = 0.05
while True:
    p_values = model.pvalues.drop('const', errors='ignore')
    max_p_value = p_values.max()
    
    if max_p_value > p_threshold:
        excluded_feature = p_values.idxmax()
        X = X.drop(excluded_feature, axis=1)
        model = sm.OLS(y, X).fit()
        print(f"移除特征 {excluded_feature} (p-value: {max_p_value:.4f})")
    else:
        break

# 最终模型的特征
final_features = list(X.columns)
if 'const' in final_features:
    final_features.remove('const')
print("最终保留的特征:", final_features)

In [49]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Lasso, Ridge,ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
from sklearn.model_selection import cross_val_score, KFold

# 显示使用数据量
total_samples = len(train_df) + len(test_df)
print("共使用数据量：", total_samples, "条（训练集：", len(train_df), "，测试集：", len(test_df), "）")

selected_features =['区域','板块','area_scaled', 'area_squared','配备电梯', '平均每平米月租金',
               '绿 化 率', '容 积 率', '停车位', '停车费用','室', '厅', '厨', '卫', '户梯比',
               '房龄', '楼层占比','面积_房龄','楼层_电梯交互', '楼层_高层交互','关键词匹配数量','学区_关键词',
               '建筑结构编码_混合结构', '建筑结构编码_钢混结构', '装修编码_简装', '装修编码_精装','装修编码_毛坯',
                *city_ring_columns,*[f"{col}_建筑面积" for col in city_ring_columns],
               '主要朝向_北', '主要朝向_南', '主要朝向_西']
# 因变量
y_train = train_df['log_价格']
y_test = test_df['log_价格']

X_train = train_df[selected_features]
X_test = test_df[selected_features]

# 定义评估函数同时计算MAE和RMSE
def evaluate_model(model, X_train, y_train, X_test, y_test):
    # 训练模型
    model.fit(X_train, y_train)
    
    # 预测
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # 计算原始价格的评估指标 (将log转换回原始价格)
    y_train_original = np.exp(y_train)
    y_train_pred_original = np.exp(y_train_pred)
    y_test_original = np.exp(y_test)
    y_test_pred_original = np.exp(y_test_pred)
    
    # 训练集指标
    train_mae = mean_absolute_error(y_train_original, y_train_pred_original)
    train_rmse = np.sqrt(mean_squared_error(y_train_original, y_train_pred_original))
    
    # 测试集指标
    test_mae = mean_absolute_error(y_test_original, y_test_pred_original)
    test_rmse = np.sqrt(mean_squared_error(y_test_original, y_test_pred_original))
    
    # 6-fold 交叉验证
    cv = KFold(n_splits=6, shuffle=True, random_state=111)
    
    # 定义MAE scorer
    def calculate_mae_on_original(y_true, y_pred):
        return mean_absolute_error(np.exp(y_true), np.exp(y_pred))
    
    # 定义RMSE scorer
    def calculate_rmse_on_original(y_true, y_pred):
        return np.sqrt(mean_squared_error(np.exp(y_true), np.exp(y_pred)))
    
    # 计算交叉验证指标
    mae_scorer = make_scorer(calculate_mae_on_original, greater_is_better=False)
    rmse_scorer = make_scorer(calculate_rmse_on_original, greater_is_better=False)
    
    cv_mae_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring=mae_scorer)
    cv_rmse_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring=rmse_scorer)
    
    cv_mae = -np.mean(cv_mae_scores)  # 取负号还原正数
    cv_rmse = -np.mean(cv_rmse_scores)  # 取负号还原正数
    
    return {
        'train_mae': train_mae,
        'train_rmse': train_rmse,
        'test_mae': test_mae,
        'test_rmse': test_rmse,
        'cv_mae': cv_mae,
        'cv_rmse': cv_rmse
    }

# 使用最佳alpha值初始化模型
ols_model = LinearRegression()
lasso_model = Lasso(alpha=0.001)
ridge_model = Ridge(alpha=0.001)
Elastic_model = ElasticNet(alpha=0.001, l1_ratio=0.1, max_iter=10000, tol=1e-4)  # 调整迭代次数和容差

# 评估所有模型
ols_metrics = evaluate_model(ols_model, X_train, y_train, X_test, y_test)
lasso_metrics = evaluate_model(lasso_model, X_train, y_train, X_test, y_test)
ridge_metrics = evaluate_model(ridge_model, X_train, y_train, X_test, y_test)
elasticnet_metrics = evaluate_model(Elastic_model, X_train, y_train, X_test, y_test)

# 假设Ridge是最佳模型
best_model_metrics = ridge_metrics

# 创建MAE结果表格
mae_results = {
    'Metrics': ['OLS', 'LASSO', 'Ridge', 'Elastic Net','Best Model'],
    'In sample': [
        round(ols_metrics['train_mae'], 2),
        round(lasso_metrics['train_mae'], 2),
        round(ridge_metrics['train_mae'], 2),
        round(elasticnet_metrics['train_mae'], 2),
        round(best_model_metrics['train_mae'], 2)
    ],
    'out of sample': [
        round(ols_metrics['test_mae'], 2),
        round(lasso_metrics['test_mae'], 2),
        round(ridge_metrics['test_mae'], 2),
        round(elasticnet_metrics['test_mae'], 2),
        round(best_model_metrics['test_mae'], 2)
    ],
    'Cross-validation': [
        round(ols_metrics['cv_mae'], 2),
        round(lasso_metrics['cv_mae'], 2),
        round(ridge_metrics['cv_mae'], 2),
        round(elasticnet_metrics['cv_mae'], 2),
        round(best_model_metrics['cv_mae'], 2)
    ],
}

# 创建RMSE结果表格
rmse_results = {
    'Metrics': ['OLS', 'LASSO', 'Ridge', 'Elastic_Net','Best Model'],
    'In sample': [
        round(ols_metrics['train_rmse'], 2),
        round(lasso_metrics['train_rmse'], 2),
        round(ridge_metrics['train_rmse'], 2),
        round(elasticnet_metrics['train_rmse'], 2),
        round(best_model_metrics['train_rmse'], 2)
    ],
    'out of sample': [
        round(ols_metrics['test_rmse'], 2),
        round(lasso_metrics['test_rmse'], 2),
        round(ridge_metrics['test_rmse'], 2),
        round(elasticnet_metrics['test_rmse'], 2),
        round(best_model_metrics['test_rmse'], 2)
    ],
    'Cross-validation': [
        round(ols_metrics['cv_rmse'], 2),
        round(lasso_metrics['cv_rmse'], 2),
        round(ridge_metrics['cv_rmse'], 2),
        round(elasticnet_metrics['cv_rmse'], 2),
        round(best_model_metrics['cv_rmse'], 2)
    ],
}

# 创建DataFrame
mae_df = pd.DataFrame(mae_results)
mae_df = mae_df.set_index('Metrics')

rmse_df = pd.DataFrame(rmse_results)
rmse_df = rmse_df.set_index('Metrics')

# 显示结果表格
print("\n模型性能评估结果 (MAE):")
print(mae_df)
print("\n模型性能评估结果 (RMSE):")
print(rmse_df)
print("\n注意: 指标是原始房价水平的MAE和RMSE")

共使用数据量： 84081 条（训练集： 67259 ，测试集： 16822 ）

模型性能评估结果 (MAE):
             In sample  out of sample  Cross-validation
Metrics                                                
OLS          381926.69      382751.24         383052.33
LASSO        406057.75      406974.28         407124.28
Ridge        381927.16      382751.76         383052.89
Elastic Net  393707.73      393686.77         394769.69
Best Model   381927.16      382751.76         383052.89

模型性能评估结果 (RMSE):
              In sample  out of sample  Cross-validation
Metrics                                                 
OLS          1041018.43     1059860.57        1038458.58
LASSO        1114216.44     1155082.93        1115091.49
Ridge        1041020.04     1059860.41        1038460.75
Elastic_Net  1085341.62     1099357.98        1084997.00
Best Model   1041020.04     1059860.41        1038460.75

注意: 指标是原始房价水平的MAE和RMSE


In [51]:
# Ridge alpha
# 定义交叉验证方法
cv = KFold(n_splits=6, shuffle=True, random_state=111)

# 定义一系列alpha值进行测试
alphas = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

# 使用RidgeCV自动选择最佳alpha
ridge_cv = RidgeCV(alphas=alphas, cv=cv, scoring='neg_mean_absolute_error')
ridge_cv.fit(X_train, y_train)

# 获取最佳alpha值
best_alpha = ridge_cv.alpha_
print(f"最佳alpha值: {best_alpha}")

最佳alpha值: 0.001


In [50]:
import numpy as np
import pandas as pd

# 1. 选取特征
id_col = predict_data['ID'].copy()
predict_features = predict_data.drop(columns=['ID'])
predict_features=predict_features[selected_features]
# 2. 使用训练好的模型进行预测（模型预测的是 log_价格）
pred_log_price = ridge_model.predict(predict_features)

# 3. 将预测的 log_价格 还原为原始价格
pred_price = np.exp(pred_log_price)

# 4. 将预测价格转换为整数
pred_price_int = np.round(pred_price).astype(int)

# 5. 构造结果 DataFrame
submission = pd.DataFrame({
    'ID': id_col,
    'Price': pred_price_int
})

# 6. 保存为 CSV 文件
submission.to_csv("prediction_OLS.csv", index=False)
print("预测结果已保存到 prediction_OLS.csv")

预测结果已保存到 prediction_OLS.csv
