In [1]:
! pip install cn2an -U

Collecting cn2an
  Downloading cn2an-0.5.23-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.9/224.9 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting proces>=0.1.7
  Downloading proces-0.1.7-py3-none-any.whl (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.7/137.7 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: proces, cn2an
Successfully installed cn2an-0.5.23 proces-0.1.7


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy.spatial import KDTree

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, SGDRegressor, HuberRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, r2_score, confusion_matrix
from sklearn.decomposition import PCA

from statsmodels.tools.tools import add_constant
from statsmodels.regression.linear_model import WLS

from cn2an import cn2an
import re
from typing import Literal

from tqdm import tqdm
from collections import Counter
from itertools import product
from functools import partial
import multiprocessing as mlp

In [3]:
pd.set_option('display.max_columns', None)

<font face='华文中宋' size=4>  
一、数据集准备  
</font>

In [4]:
data_tr = pd.read_csv("/home/mw/input/quant4533/ruc_Class25Q1_train.csv")

data_tr['num_index'] = data_tr.index

data_te = pd.read_csv("/home/mw/input/quant4533/ruc_Class25Q1_test.csv").drop("ID", axis='columns')

data_te['num_index'] = data_te.index
data_te['价格'] = np.nan

data = pd.concat([data_tr, data_te], axis=0, ignore_index=True)

<font face='华文中宋' size=4>  
二、量化字符串数据  
</font>

In [5]:
data.drop(labels=['环线','别墅类型', '抵押信息'], axis='columns', inplace=True)  # 不考虑环线、别墅类型、抵押信息

In [6]:
# 房屋户型
def HuXingTrans(s):
    if not isinstance(s, str):  # 检查输入是否为字符串
        return 4*[np.nan]  # 如果不是返回四个NaN值
    strs = re.split('室|厅|厨|卫|房间', s)[:-1]  # 分隔符拆分
    if len(strs) == 2:
        return [eval(strs[0]), 0, 0, eval(strs[1])]
    return [eval(i) for i in strs]

ls = list(data['房屋户型'].apply(HuXingTrans))

data[['室', '厅', '厨', '卫']] = ls

data[['室', '厅', '厨', '卫']].astype('float')  # 包含室厅厨卫四个float值的列表

data.drop(labels='房屋户型', axis='columns', inplace=True)

In [7]:
# 所在楼层
def LouCengTrans(s):
    if not isinstance(s, str):
        return 2*[np.nan]
    strs = re.split(' \(共|层\)', s)[:-1]
    return [strs[0], eval(strs[1])]

ls = list(data['所在楼层'].apply(LouCengTrans)) 

data[['所在层数', '最高层数']] = ls

data['最高层数'].astype('float')  # 包含所在楼层（底层低楼层中楼层高楼层顶层）以及最高层数（float值）两个值的列表

data.drop(labels='所在楼层', axis='columns', inplace=True)

In [8]:
# 合并套内面积和建筑面积
data['建筑面积'] = data['建筑面积'].str.replace('㎡', '').astype('float64')
data['套内面积'] = data['套内面积'].str.replace('㎡', '').astype('float64')
data['建筑面积'] = data['建筑面积'].where(np.isnan(data['套内面积']), data['套内面积'])  # 判断套内面积是否为空，否则用套内面积代替建筑面积

data.drop(labels='套内面积', axis='columns', inplace=True)  # 删除套内面积列保留建筑面积

In [9]:
# 房屋朝向
def ChaoXiangTrans(s):
    ls = 8 * [0.]
    if not isinstance(s, str):
        return 8 * [np.nan]
    vals_dict = {'东':0, '西':1, '南':2, '北':3, '东北':4, '东南':5, '西北':6, '西南':7}
    s_vals = set(re.split(' ', s))
    for v in s_vals:
        ls[vals_dict[v]] += 1.
    return ls

ls = list(data['房屋朝向'].apply(ChaoXiangTrans))

data[['朝东', '朝西', '朝南', '朝北', '朝东北', '朝东南', '朝西北', '朝西南']] = ls

data.drop(labels='房屋朝向', axis='columns', inplace=True)

In [10]:
# 梯户比例
def TiHuTrans(s):
    if not isinstance(s, str):
        return 2*[np.nan]
    sp_s = re.split('梯|户', s)[:-1]
    return [cn2an(i, 'smart') for i in sp_s]  # 中文数字转阿拉伯数字

ls = list(data['梯户比例'].apply(TiHuTrans))

data[['梯数', '户数']] = ls

data.drop(labels='梯户比例', axis='columns', inplace=True)

In [11]:
# 房屋用途
def YongTuTrans(s):
    if not isinstance(s, str):
        return 2*[np.nan]
    ls = [0, 0]
    if s in ['住宅式公寓', '公寓', '公寓/住宅', '公寓/公寓', '公寓（住宅）', '别墅', '商住两用', '四合院', '平房', '新式里弄', '普通住宅', '老公寓', '花园洋房', '酒店式公寓']:
        ls[0] = 1
    elif s in ['写字楼', '商业', '商业办公类', '商住两用', '底商', '车库',]:
        ls[1] = 1
    return ls

ls = list(data['房屋用途'].apply(YongTuTrans))

data[['住宅用', '商业用']] = ls

data.drop(labels='房屋用途', axis='columns', inplace=True)

In [12]:
# 房屋年限
age_dict = {'满五年': 5.0, '满两年': 2.0, '未满两年': 0.0}

data['房屋年限'] = [age_dict[data.loc[idx, '房屋年限']] 
                    if isinstance(data.loc[idx, '房屋年限'], str) 
                        else np.nan
                        for idx in data.index]   

In [13]:
# 房屋优势
def YouShiTrans(s):
    if not isinstance(s, str):
        return 2*[np.nan]  # 不考虑装修与否是因为有更详细的装修情况
    ls = 2 * [0]
    sp_s = re.split('、', s)
    if '地铁' in sp_s:
        ls[0] = 1
    if '房本满两年' in sp_s:
        ls[1] = 2
    elif '房本满五年' in sp_s:
        ls[1] = 5
    return ls

ls = list(data['房屋优势'].apply(YouShiTrans))

data[['地铁', '房本年限']] = ls

data.drop(labels='房屋优势', axis='columns', inplace=True)

In [14]:
# 配备电梯和产权所属
OE = OrdinalEncoder(
    categories=[
        ['无', '有'],
        ['非共有', '共有']
    ], 
    handle_unknown='use_encoded_value', 
    unknown_value=np.nan)  # 序数编码器对配备电梯和产权所属进行数值分类，缺失值NaN

data[['配备电梯', '产权所属']] = OE.fit_transform(data[['配备电梯', '产权所属']])

In [15]:
# 所在层数、建筑结构、装修情况和交易权属
OHE = OneHotEncoder(
    categories=[
        ['地下室', '底层', '低楼层', '中楼层', '高楼层', '顶层'],  # 所在层数
        ['框架结构', '混合结构', '砖木结构', '砖混结构', '钢混结构', '钢结构'],  # 建筑结构
        ['毛坯', '简装', '精装'],  # 装修情况
        ['一类经济适用房', '二类经济适用房', '使用权', '动迁安置房', '售后公房', '商品房', '央产房', '定向安置房', '已购公房', '房改房', '拆迁还建房', '私产', '经济适用房', '自住型商品房',
 '限价商品房', '集资房']],  # 交易权属
    sparse_output=False,
    handle_unknown='ignore')

data[['地下室', '底层', '低楼层', '中楼层', '高楼层', '顶层', 
      '框架结构', '混合结构', '砖木结构', '砖混结构', '钢混结构', '钢结构', 
      '毛坯', '简装', '精装',
      '一类经济适用房', '二类经济适用房', '使用权', '动迁安置房', '售后公房', '商品房', '央产房', '定向安置房', '已购公房', '房改房', '拆迁还建房', '私产', '经济适用房', '自住型商品房',
 '限价商品房', '集资房']] = OHE.fit_transform(data[['所在层数', '建筑结构', '装修情况', '交易权属']])

data.loc[data['建筑结构'].isnull(), '建筑结构'] = '未知结构'

data.loc[data['装修情况'].isnull(), '装修情况'] = '其他'

data.drop(labels=['所在层数', '建筑结构', '装修情况', '交易权属'], axis='columns', inplace=True)

In [16]:
data.describe()

Unnamed: 0,城市,区域,板块,价格,建筑面积,配备电梯,房屋年限,产权所属,lon,lat,年份,num_index,室,厅,厨,卫,朝东,朝西,朝南,朝北,朝东北,朝东南,朝西北,朝西南,梯数,户数,住宅用,商业用,地铁,房本年限,地下室,底层,低楼层,中楼层,高楼层,顶层,框架结构,混合结构,砖木结构,砖混结构,钢混结构,钢结构,毛坯,简装,精装,一类经济适用房,二类经济适用房,使用权,动迁安置房,售后公房,商品房,央产房,定向安置房,已购公房,房改房,拆迁还建房,私产,经济适用房,自住型商品房,限价商品房,集资房
count,98919.0,98919.0,98919.0,84133.0,98919.0,89306.0,67393.0,98919.0,98919.0,98919.0,98919.0,98919.0,98312.0,98312.0,98312.0,98312.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,96942.0,96942.0,98917.0,98917.0,81843.0,81843.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0
mean,2.862595,54.101083,433.179854,1971953.0,90.675039,0.768168,3.39059,0.179986,113.678653,35.151817,2021.209778,36883.149678,2.449599,1.405851,0.989615,1.372071,0.095674,0.067581,0.685834,0.398407,0.015407,0.120543,0.019652,0.039477,1.939108,5.863733,0.979367,0.020633,0.446379,2.811603,0.007137,0.023575,0.27939,0.362155,0.301873,0.02587,0.028023,0.114033,0.000617,0.051042,0.768225,0.010665,0.110191,0.216763,0.38022,0.004812,0.007309,0.000526,0.006066,0.000576,0.919217,0.001992,0.000758,0.031561,0.001911,0.010433,0.002628,0.005449,0.000485,0.003609,0.002669
std,2.118844,27.966695,225.81098,2639962.0,59.593821,0.422005,1.924877,0.384177,6.63401,5.508274,0.879562,25637.2518,0.942219,0.595888,0.15383,0.626287,0.294145,0.251026,0.464185,0.489573,0.123164,0.325597,0.138804,0.194727,1.216302,5.711456,0.142155,0.142155,0.49712,2.16565,0.08418,0.151721,0.448702,0.480626,0.459073,0.158747,0.165039,0.317853,0.024825,0.220084,0.421968,0.102721,0.313129,0.412042,0.485443,0.069202,0.08518,0.022922,0.077646,0.023998,0.272503,0.044582,0.027525,0.17483,0.043669,0.101607,0.051201,0.073616,0.022023,0.059967,0.051592
min,0.0,0.0,0.0,78280.0,1.0,0.0,0.0,0.0,106.19742,29.258156,2015.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,34.0,244.0,697400.0,61.745,1.0,2.0,0.0,106.606972,29.717963,2021.0,12364.5,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2.0,59.0,433.0,1146500.0,83.0,1.0,5.0,0.0,114.276243,34.221395,2021.0,34673.0,2.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,5.0,76.0,628.0,2176000.0,109.0,1.0,5.0,0.0,116.678048,39.920243,2022.0,59402.5,3.0,2.0,1.0,2.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,7.0,1.0,0.0,1.0,5.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,6.0,102.0,810.0,79950000.0,10337.0,1.0,5.0,1.0,129.581642,46.340684,2023.0,84132.0,16.0,7.0,5.0,14.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,20.0,84.0,1.0,1.0,1.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


<font face='华文中宋' size=4>  
三、异常值处理（建筑面积）  
</font>

In [17]:
# 通过建筑面积-价格散点图观察异常值
data.plot(kind='scatter', x='建筑面积', y='价格', alpha=0.4, c='purple') 

<AxesSubplot: xlabel='建筑面积', ylabel='价格'>

In [18]:
# 找寻并修改异常值
data_post = data.copy()

gp = set(data['小区名称'])
with tqdm(total=len(gp)) as pbar:
    pbar.set_description('Processing:')
    for base in gp:
        data_gp = data[data['小区名称']==base] # 指定小区的数据集
        if len(data_gp.index) <= 2: # 少于两个样本的，无法辨别哪个是异常值，不做修改
            pbar.update(1)
            continue

        ind = data_gp.index
        for idx in ind:
            tg_val = data_gp.loc[idx, '建筑面积']
            data_gp_d = data_gp.drop(labels=idx, axis='index', inplace=False)
            med = data_gp_d['建筑面积'].median()  # 删去待检测异常值后计算剩下中位数
            
            # 这里小数点出错的概率较高，因此数据容易变成真值的 0.1,0.01,10,100倍。
            # 因此，如果小数点出现异常，对数值应该至少上下变动 ln10 = 2.30，我们把 2 作为一个阈值。
            if tg_val != np.nan and np.fabs(np.log(tg_val) - np.log(med)) > 2.0:
                cand_ls = np.array([tg_val*0.01, tg_val*0.1, tg_val, tg_val*10, tg_val*100]) # 怀疑是小数点错误
                best_id = np.argmin(np.fabs(np.log(cand_ls) - np.log(med))) # 找与中位数最近的数
                data_post.loc[idx, '建筑面积'] = cand_ls[best_id]
        pbar.update(1)

Processing:: 100%|██████████| 3200/3200 [01:04<00:00, 49.58it/s]


In [19]:
# 建筑面积分布直方图检查是否还存在异常
plt.figure(figsize=(12, 6))
sns.histplot(data_post['建筑面积'], bins=30, kde=True, color='skyblue')
plt.title('建筑面积分布直方图', fontsize=15, pad=20)
plt.xlabel('建筑面积(㎡)', fontsize=12)
plt.ylabel('房源数量', fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

<font face='华文中宋' size=4>  
四、缺失值处理  
</font>

<font face='华文中宋' size=3>  
1、找寻存在缺失值的指标  
</font>

In [20]:
data.isnull().sum().to_frame("num").T

Unnamed: 0,城市,区域,板块,小区名称,价格,建筑面积,配备电梯,交易时间,上次交易,房屋年限,产权所属,核心卖点,户型介绍,周边配套,交通出行,lon,lat,年份,num_index,室,厅,厨,卫,最高层数,朝东,朝西,朝南,朝北,朝东北,朝东南,朝西北,朝西南,梯数,户数,住宅用,商业用,地铁,房本年限,地下室,底层,低楼层,中楼层,高楼层,顶层,框架结构,混合结构,砖木结构,砖混结构,钢混结构,钢结构,毛坯,简装,精装,一类经济适用房,二类经济适用房,使用权,动迁安置房,售后公房,商品房,央产房,定向安置房,已购公房,房改房,拆迁还建房,私产,经济适用房,自住型商品房,限价商品房,集资房
num,0,0,0,0,14786,0,9613,0,30607,31526,0,18220,73644,39085,37509,0,0,0,0,607,607,607,607,0,0,0,0,0,0,0,0,0,1977,1977,2,2,17076,17076,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


<font face='华文中宋' size=3>  
2、填补缺失值  
</font>

<font face='华文中宋' size=3>  
1）配备电梯  
</font>

In [21]:
# 用其他指标信息填补
def IsInStr(s:list[str], ss:str):
    return any([(ss in sss) if isinstance(sss, str) else False for sss in s])

data['电梯 in strs'] = data[['核心卖点','户型介绍']].apply(IsInStr, args=('电梯',), axis=1) # 核心卖点和户型介绍种出现“电梯”字眼则赋值为1

data['电梯 in strs'].sum()

4519

In [22]:
# 定义logistic回归插补函数
def ClassifyImputer(df: pd.DataFrame, 
                        y_col: str, 
                        X_col: str, 
                        method, 
                        mode: Literal['predict','check'],
                        score='roc_auc', 
                        cross_cv: int=5):
    """
    使用分类模型对分类变量进行插补,或使用交叉验证检测插补的效果.
    """
    X_smp, y_smp, X_unk = df.loc[~df[y_col].isnull(), X_col], df.loc[~df[y_col].isnull(), y_col], \
                            df.loc[df[y_col].isnull(), X_col]

    if mode=='check':
        scores = cross_val_score(method, X_smp, y_smp, scoring=score, cv=cross_cv, n_jobs=-1)
        print(scores)
        print(f"mean score is {scores.mean()}.\n")
    elif mode=='predict':
        method.fit(X_smp, y_smp.astype(int))
        y_unk = method.predict_proba(X_unk)
        w = np.array([[i for i in range(y_unk.shape[1])]]).T
        y_unk = ((y_unk @ w).T)[0]
        return y_unk

def RegressiveImputer(df: pd.DataFrame, 
                        y_col: str, 
                        X_col: str, 
                        method, 
                        mode: Literal['predict','check'],
                        score='neg_mean_squared_error', 
                        cross_cv: int=5):
    """
    使用回归模型对连续性变量进行插补,或使用交叉验证检测插补的效果.
    """
    X_smp, y_smp, X_unk = df.loc[~df[y_col].isnull(), X_col], df.loc[~df[y_col].isnull(), y_col], \
                            df.loc[df[y_col].isnull(), X_col]

    if mode=='check':
        scores = cross_val_score(method, X_smp, y_smp, scoring=score, cv=cross_cv, n_jobs=-1)
        print(scores)
        print(f"mean score is {scores.mean()}.\n")
    elif mode=='predict':
        method.fit(X_smp, y_smp.astype(int))
        y_unk = method.predict(X_unk)
        return y_unk

In [23]:
# 检验logistic回归效果
ClassifyImputer(df=data, y_col='配备电梯', X_col=['框架结构', '混合结构', '砖木结构', '砖混结构',
       '钢混结构', '钢结构', '毛坯', '简装', '精装', '一类经济适用房', '二类经济适用房', '使用权', '动迁安置房', '售后公房', '商品房', '央产房', '定向安置房', '已购公房', '房改房', '拆迁还建房', '私产', '经济适用房', '自住型商品房',
 '限价商品房', '集资房', '电梯 in strs'], 
       method=LogisticRegression(max_iter=1000, n_jobs=-1), score='roc_auc', mode='check')

[0.95107389 0.90759577 0.81071546 0.70988584 0.85763986]
mean score is 0.8473821649568162.



In [24]:
# logistic回归填补剩下缺失值
data.loc[data['配备电梯'].isnull(), '配备电梯'] = ClassifyImputer(
            df=data, y_col='配备电梯', X_col=['框架结构', '混合结构', '砖木结构', '砖混结构',
            '钢混结构', '钢结构', '毛坯', '简装', '精装', '一类经济适用房', '二类经济适用房', '使用权', '动迁安置房', '售后公房', '商品房', '央产房', '定向安置房', '已购公房', '房改房', '拆迁还建房', '私产', '经济适用房', '自住型商品房',
 '限价商品房', '集资房', '电梯 in strs'], 
            method=LogisticRegression(max_iter=1000, n_jobs=-1), score='roc_auc', mode='predict')

<font face='华文中宋' size=3>  
2）住宅用/商业用  
</font>  


In [25]:
# 缺失值仅两个，直接删除
data = data.loc[~(data['住宅用'].isnull()|data['商业用'].isnull())]

<font face='华文中宋' size=3>  
3）地铁  
</font>  


In [26]:
# 用其他指标信息
data['地铁 in strs'] = data[['核心卖点','户型介绍', '周边配套', '交通出行']].apply(IsInStr, args=('地铁',), axis=1)
data.loc[data['地铁'].isnull() & data['地铁 in strs']==1, '地铁'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['地铁 in strs'] = data[['核心卖点','户型介绍', '周边配套', '交通出行']].apply(IsInStr, args=('地铁',), axis=1)


In [27]:
# 假设同一个小区的地铁便利程度是一样的
with tqdm(total=len(set(data['小区名称']))) as pbar:
    for dst in set(data['小区名称']):
        if (data.loc[data['小区名称']==dst, '地铁']==1).any():
            data.loc[data['小区名称']==dst, '地铁'] = 1
        pbar.update(1)

100%|██████████| 3200/3200 [00:32<00:00, 98.02it/s] 


In [28]:
# 检查是否还有缺失值
data['地铁'].isnull().sum()

4374

In [29]:
# 用KNN插补法填补剩下缺失值
knI = KNNImputer(n_neighbors=10, weights='distance')
data[['地铁', 'lon', 'lat']] = knI.fit_transform(data[['地铁', 'lon', 'lat']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[['地铁', 'lon', 'lat']] = knI.fit_transform(data[['地铁', 'lon', 'lat']])


<font face='华文中宋' size=3>  
4）房本年限  
</font>  

In [30]:
# 房本年限比房屋年限缺失值更少，考虑保留房本年限
# 用房屋年限信息补充
data.loc[(data['房本年限'].isna()) & (data['房屋年限'] == 2.0), '房屋年限'] = 2
data.loc[(data['房本年限'].isna()) & (data['房屋年限'] == 5.0), '房屋年限'] = 5

In [31]:
# 用其他指标信息补充
cond0 = data[['核心卖点','户型介绍']].apply(IsInStr, args=('不满一',), axis=1) \
        | data[['核心卖点','户型介绍']].apply(IsInStr, args=('不满1',), axis=1)

cond1 = data[['核心卖点','户型介绍']].apply(IsInStr, args=('满一',), axis=1) \
        | data[['核心卖点','户型介绍']].apply(IsInStr, args=('满1',), axis=1)

cond2 = data[['核心卖点','户型介绍']].apply(IsInStr, args=('满二',), axis=1) \
        | data[['核心卖点','户型介绍']].apply(IsInStr, args=('满2',), axis=1) \
        | data[['核心卖点','户型介绍']].apply(IsInStr, args=('满两',), axis=1)

cond3 = data[['核心卖点','户型介绍']].apply(IsInStr, args=('满三',), axis=1) \
        | data[['核心卖点','户型介绍']].apply(IsInStr, args=('满3',), axis=1)

cond4 = data[['核心卖点','户型介绍']].apply(IsInStr, args=('满四',), axis=1) \
        | data[['核心卖点','户型介绍']].apply(IsInStr, args=('满4',), axis=1)

cond5 = data[['核心卖点','户型介绍']].apply(IsInStr, args=('满五',), axis=1) \
        | data[['核心卖点','户型介绍']].apply(IsInStr, args=('满5',), axis=1)

data.loc[cond0, '房本年限'] = 0
data.loc[cond1 & (~cond0), '房本年限'] = 1
data.loc[cond2, '房本年限'] = 2
data.loc[cond3, '房本年限'] = 3
data.loc[cond4, '房本年限'] = 4
data.loc[cond5, '房本年限'] = 5

In [32]:
# 检查是否还有缺失值
data['房本年限'].isnull().sum()

15628

In [33]:
# 关于房本还有需要注意的信息
data['满五'] = (data['房本年限'] == 5)
data['唯一'] = data[['核心卖点','户型介绍']].apply(IsInStr, args=('唯一',), axis=1)
data['房本缺失'] = data['房本年限'].isnull()
data.loc[data['房本年限'].isnull(), '房本年限'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['满五'] = (data['房本年限'] == 5)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['唯一'] = data[['核心卖点','户型介绍']].apply(IsInStr, args=('唯一',), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['房本缺失'] = data['房本年限'].isnull()


<font face='华文中宋' size=3>  
5）房屋户型和梯户比例  
</font> 

In [34]:
# 相似度
def SimImputer(df: pd.DataFrame, y_col: list[str], tgcol: list[str], basecol: str):
    """
    选取相似度最高的子样本集，进行插补。
    """
    Sim = lambda row1, row2: sum(row1==row2)
    with tqdm(total=len(set(df[basecol]))) as pbar:
        for ele in set(df[basecol]):
            df_unk = df.loc[(df[basecol]==ele) & df[y_col].isnull().any(axis='columns')]
            if len(df_unk) == 0:
                pbar.update(1)
                continue
            for idx, row in df_unk.iterrows():
                df_smp = df.loc[(df[basecol]==ele) & (~df[y_col].isnull().any(axis='columns'))]
                if len(df_smp) > 0:
                    sim_val = df_smp.apply(Sim, args=(row, ), axis=1)
                    sim_ls = df_smp.loc[sim_val==max(sim_val)]
                    df.loc[idx, y_col] = sim_ls[y_col].median()
            pbar.update(1)

In [35]:
# 对房屋户型
SimImputer(df=data, y_col=['室', '厅', '厨', '卫'], 
            tgcol=['最高层数', '框架结构', '混合结构', '砖木结构', '砖混结构', '钢混结构', '钢结构', 
            '梯数', '户数', '一类经济适用房', '二类经济适用房', '使用权', '动迁安置房', '售后公房', '商品房', '央产房', '定向安置房', '已购公房', '房改房', '拆迁还建房', '私产', '经济适用房', '自住型商品房',
 '限价商品房', '集资房'], basecol='小区名称')

100%|██████████| 3200/3200 [00:41<00:00, 76.91it/s] 


In [36]:
# 对梯户比例
SimImputer(df=data, y_col=['梯数', '户数'], 
            tgcol=['室', '厅', '厨', '卫', '最高层数', '框架结构', '混合结构', '砖木结构', '砖混结构', '钢混结构', '钢结构', 
            '梯数', '户数', '一类经济适用房', '二类经济适用房', '使用权', '动迁安置房', '售后公房', '商品房', '央产房', '定向安置房', '已购公房', '房改房', '拆迁还建房', '私产', '经济适用房', '自住型商品房',
 '限价商品房', '集资房'], basecol='小区名称')

100%|██████████| 3200/3200 [01:02<00:00, 51.42it/s] 


In [37]:
data['户数'].isnull().sum()

235

In [38]:
# 房屋户型已完整，基于建筑面积平均比例填补剩余梯数户数缺失值
Ti_per_area = (data['梯数'] / data['建筑面积']).mean()
Hu_per_area = (data['户数'] / data['建筑面积']).mean()

data_unk = data.loc[data['梯数'].isnull()]

for idx, row in data_unk.iterrows():
    data.loc[idx, '梯数'] = Ti_per_area * row['建筑面积']
    data.loc[idx, '户数'] = Hu_per_area * row['建筑面积']

In [39]:
data.drop(labels=['上次交易', '房屋年限', '核心卖点', '户型介绍', '周边配套', '交通出行'], axis='columns', inplace=True)
data.loc[:, data.isnull().sum()>0].isnull().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(labels=['上次交易', '房屋年限', '核心卖点', '户型介绍', '周边配套', '交通出行'], axis='columns', inplace=True)


价格    14786
dtype: int64

In [40]:
data.to_csv("/home/mw/project/train_test_cleaned.csv", index=False)

<font face='华文中宋' size=4>  
五、线性模型构建  
</font>

In [41]:
numerical_fs = ['建筑面积', '配备电梯', '产权所属', 
                'lon', 'lat', '室', '厅', '厨', '卫', '最高层数', 
                '朝东', '朝西', '朝南', '朝北', '朝东北', '朝东南', '朝西北', '朝西南', 
                '梯数', '户数', '地铁', '房本年限', '住宅用', '商业用', 
                '地下室', '底层', '低楼层', '中楼层', '高楼层', '顶层',
                '框架结构', '混合结构', '砖木结构', '砖混结构', '钢混结构', '钢结构', 
                '毛坯', '简装', '精装', '一类经济适用房', '二类经济适用房', '使用权', '动迁安置房', '售后公房', '商品房', '央产房', '定向安置房', '已购公房', '房改房', '拆迁还建房', '私产', '经济适用房', '自住型商品房',
 '限价商品房', '集资房']

data[numerical_fs] = data[numerical_fs].astype('float64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[numerical_fs] = data[numerical_fs].astype('float64')


In [42]:
# 因子补充
data['lon^2'] = data['lon'] ** 2
data['lon*lat'] = data['lon'] * data['lat']
data['lat^2'] = data['lat'] ** 2
data['log最高层数'] = np.log(1+data['最高层数'])
data['log室'] = np.log(1+data['室'])
data['log厅'] = np.log(1+data['厅'])
data['log厨'] = np.log(1+data['厨'])
data['log卫'] = np.log(1+data['卫'])
data['梯户比例'] = data['户数'] / data['梯数']
data['log建筑面积'] = np.log(data['建筑面积'])
data['满五唯一'] = data['满五']*data['唯一']
data['log梯数'] = np.log(1+data['梯数'])
data['log户数'] = np.log(1+data['户数'])

data['单价'] = data['价格'] / data['建筑面积']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lon^2'] = data['lon'] ** 2
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lon*lat'] = data['lon'] * data['lat']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lat^2'] = data['lat'] ** 2
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,c

<font face='华文中宋' size=3>  
1）交叉验证  
</font>

In [43]:
regression_fs = ['建筑面积', 'log建筑面积', '配备电梯', '产权所属', 
                'lon', 'lat', 'lon^2', 'lat^2', 'lon*lat', '室', 'log室', '厅', 'log厅', '厨', 'log厨', '卫', 'log卫', '最高层数', 'log最高层数', 
                '朝东', '朝西', '朝南', '朝北', '朝东北', '朝东南', '朝西北', '朝西南', 
                '梯数', 'log梯数', '户数', 'log户数', '梯户比例', '地铁', '房本年限', '满五唯一', '住宅用', '商业用', 
                '地下室', '底层', '低楼层', '中楼层', '高楼层', '顶层',
                '框架结构', '混合结构', '砖木结构', '砖混结构', '钢混结构', '钢结构', 
                '毛坯', '简装', '精装', '一类经济适用房', '二类经济适用房', '使用权', '动迁安置房', '售后公房', '商品房', '央产房', '定向安置房', '已购公房', '房改房', '拆迁还建房', '私产', '经济适用房', '自住型商品房',
 '限价商品房', '集资房']

data_tr, data_te = data.loc[~data['单价'].isnull()], data.loc[data['单价'].isnull()]

X = data_tr[regression_fs].copy()
y = data_tr['单价'].copy()

In [44]:
# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=111, test_size=0.2)

In [45]:
# OLS
lm = LinearRegression(n_jobs=-1)
cross_val_score(lm, X_train, y_train, scoring='neg_mean_absolute_error', cv=6, n_jobs=-1).mean()

-9647.454283074958

In [46]:
np.sqrt(-cross_val_score(lm, X_train, y_train, scoring='neg_mean_absolute_error', cv=6, n_jobs=-1).mean())

98.22145530929053

In [47]:
# LASSO
lm = Lasso(alpha=0.1, random_state=111)
cross_val_score(lm, X_train, y_train, scoring='neg_mean_absolute_error', cv=6, n_jobs=-1).mean()

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


-11517.414321559949

In [48]:
np.sqrt(-cross_val_score(lm, X_train, y_train, scoring='neg_mean_absolute_error', cv=6, n_jobs=-1).mean())

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


107.31921692576753

In [49]:
# Ridge
rm = Ridge(alpha=31)
cross_val_score(rm, X_train, y_train, scoring='neg_mean_absolute_error', cv=6, n_jobs=-1).mean()

-9640.654098224642

In [50]:
np.sqrt(-cross_val_score(rm, X_train, y_train, scoring='neg_mean_absolute_error', cv=6, n_jobs=-1).mean())

98.18683261122462

<font face='华文中宋' size=3>  
2）报告表现  
</font>

In [51]:
ols = LinearRegression(n_jobs=-1)
ols.fit(X_train, y_train)
area_train = X_train['建筑面积']
area_test = X_test['建筑面积']

train_preds = ols.predict(X_train)
test_preds = ols.predict(X_test)

print("OLS Model")

print("训练集:")
print(f"R^2: {r2_score(y_train*area_train, train_preds*area_train)}")
print(f"RMSE: {mean_squared_error(y_train*area_train, train_preds*area_train, squared=False)}")
print(f"MAE: {mean_absolute_error(y_train*area_train, train_preds*area_train)}")
print(f"中位数绝对误差: {median_absolute_error(y_train*area_train, train_preds*area_train)}")

print("测试集:")
print(f"R^2: {r2_score(y_test*area_test, test_preds*area_test)}")
print(f"RMSE: {mean_squared_error(y_test*area_test, test_preds*area_test, squared=False)}")
print(f"MAE: {mean_absolute_error(y_test*area_test, test_preds*area_test)}")
print(f"中位数绝对误差: {median_absolute_error(y_test*area_test, test_preds*area_test)}")

OLS Model
训练集:
R^2: -39.16927829178524
RMSE: 16797129.298935886
MAE: 945668.8023225641
中位数绝对误差: 458980.71024594887
测试集:
R^2: 0.5686356593352578
RMSE: 1706540.2443025173
MAE: 868636.2462995559
中位数绝对误差: 454319.53829272673


In [52]:
lasso = Lasso(alpha=0.1, random_state=111)
lasso.fit(X_train, y_train)
area_train = X_train['建筑面积']
area_test = X_test['建筑面积']

train_preds = lasso.predict(X_train)
test_preds = lasso.predict(X_test)

print("Lasso Model")

print("训练集:")
print(f"R^2: {r2_score(y_train*area_train, train_preds*area_train)}")
print(f"RMSE: {mean_squared_error(y_train*area_train, train_preds*area_train, squared=False)}")
print(f"MAE: {mean_absolute_error(y_train*area_train, train_preds*area_train)}")
print(f"中位数绝对误差: {median_absolute_error(y_train*area_train, train_preds*area_train)}")

print("测试集:")
print(f"R^2: {r2_score(y_test*area_test, test_preds*area_test)}")
print(f"RMSE: {mean_squared_error(y_test*area_test, test_preds*area_test, squared=False)}")
print(f"MAE: {mean_absolute_error(y_test*area_test, test_preds*area_test)}")
print(f"中位数绝对误差: {median_absolute_error(y_test*area_test, test_preds*area_test)}")

  model = cd_fast.enet_coordinate_descent(


Lasso Model
训练集:
R^2: -38.32908033471459
RMSE: 16620532.975730622
MAE: 1097125.0653991005
中位数绝对误差: 603077.7831841307
测试集:
R^2: 0.4974624306473734
RMSE: 1841953.5816766515
MAE: 1013855.6139291
中位数绝对误差: 605009.7676612255


In [53]:
ridge = Ridge(alpha=31)
ridge.fit(X_train, y_train)
area_train = X_train['建筑面积']
area_test = X_test['建筑面积']

train_preds = ridge.predict(X_train)
test_preds = ridge.predict(X_test)

print("Ridge Model")

print("训练集:")
print(f"R^2: {r2_score(y_train*area_train, train_preds*area_train)}")
print(f"RMSE: {mean_squared_error(y_train*area_train, train_preds*area_train, squared=False)}")
print(f"MAE: {mean_absolute_error(y_train*area_train, train_preds*area_train)}")
print(f"中位数绝对误差: {median_absolute_error(y_train*area_train, train_preds*area_train)}")

print("测试集:")
print(f"R^2: {r2_score(y_test*area_test, test_preds*area_test)}")
print(f"RMSE: {mean_squared_error(y_test*area_test, test_preds*area_test, squared=False)}")
print(f"MAE: {mean_absolute_error(y_test*area_test, test_preds*area_test)}")
print(f"中位数绝对误差: {median_absolute_error(y_test*area_test, test_preds*area_test)}")

Ridge Model
训练集:
R^2: -37.456596369205265
RMSE: 16435142.45673412
MAE: 942798.5143517233
中位数绝对误差: 458300.7160156738
测试集:
R^2: 0.5758611714766915
RMSE: 1692187.2962630887
MAE: 865368.413239058
中位数绝对误差: 455099.86553829815


#### 表现总结  


***MAE:***  

| Models | In Sample | Out of Sample | Cross-validation | Datahub Score |  
|:----:|:----:|:----:|:----:|:----:|  
|OLS|945668.80|868636.25|9647.45|46.823|  
|LASSO|1097125.07|1013855.61|11517.41|39.907|  
|Ridge|942798.51|865368.41|9640.65|46.784|  
|Best Model|945668.80|868636.25|9647.45|46.823|  

***Best Model:*** **Ridge Regression**

In [54]:
X_final = data_te[regression_fs]
area = data_te['建筑面积']

y_final_ols = ols.predict(X_final) * area

submit_final = pd.DataFrame({
    'ID': data_te['num_index'],
    'Price': y_final_ols
})

submit_final.to_csv("/home/mw/project/prediction_ols.csv", index=False)

In [55]:
X_final = data_te[regression_fs]
area = data_te['建筑面积']

y_final_lasso = lasso.predict(X_final) * area

submit_final = pd.DataFrame({
    'ID': data_te['num_index'],
    'Price': y_final_lasso
})

submit_final.to_csv("/home/mw/project/prediction_lasso.csv", index=False)

In [56]:
X_final = data_te[regression_fs]
area = data_te['建筑面积']

y_final_ridge = ridge.predict(X_final) * area

submit_final = pd.DataFrame({
    'ID': data_te['num_index'],
    'Price': y_final_ridge
})

submit_final.to_csv("/home/mw/project/prediction_ridge.csv", index=False)

In [None]:
print(f"总预测数量: {len(y_final_ridge)} 条")