## 欢迎进入 Notebook  

这里你可以编写代码，文档  

### 关于文件目录  


**project**：project 目录是本项目的工作空间，可以把与项目运行有关的所有文件放在这里，目录中文件的增、删、改操作都会被保留  


**input**：input 目录是数据集的挂载位置，所有挂载进项目的数据集都在这里，未挂载数据集时 input 目录被隐藏  


**temp**：temp 目录是临时磁盘空间，训练或分析过程中产生的不必要文件可以存放在这里，目录中的文件不会保存  


In [1]:
# List the files in the persistent personal workspace directory.
!ls /home/mw/project/

lasso_model.pkl  prediction.csv  selected_features.pkl	word2vec_model.model


In [2]:
# List the dataset directories currently mounted under the input path.
!ls /home/mw/input/

ls: cannot access '/home/mw/input/': No such file or directory


# 读取并查看数据

In [3]:
import subprocess
import sys

# Check whether a module is importable and install it with pip when it is not.
def install_and_import(package, pip_name=None):
    """Make sure ``package`` can be imported, installing it on demand.

    Args:
        package: Name used in the ``import`` statement (e.g. ``"sklearn"``).
        pip_name: Optional distribution name handed to ``pip install`` when it
            differs from the import name (e.g. ``"scikit-learn"``). Defaults
            to ``package``, which matches the previous behaviour.

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits with an error.
        ImportError: If the module still cannot be imported after installing.
    """
    import importlib  # local import keeps this cell self-contained

    try:
        importlib.import_module(package)
        print(f"{package} 已经安装")
    except ImportError:
        print(f"{package} 未安装，正在安装...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", pip_name or package])
        print(f"{package} 安装成功！")
        # A package installed while the interpreter is running may not be
        # visible to the import system until its finder caches are refreshed.
        importlib.invalidate_caches()
        importlib.import_module(package)

# Modules to verify. Each entry is used both as the import name and as the
# name handed to `pip install` by install_and_import.
# NOTE(review): "random", "re", "time", "warnings" and "pickle" are
# standard-library modules, and "sklearn" is an import name whose PyPI
# distribution is presumably "scikit-learn" — confirm before relying on the
# auto-install path for these entries.
modules = [
    "numpy", "pandas", "random", "re", "time", "warnings", "pickle",
    "sklearn", "joblib", "numba", "gensim", "matplotlib", "seaborn",
    "geopy", "openpyxl", "tensorflow","tqdm"
]

# Check (and install if needed) every module in the list, in order.
for module in modules:
    install_and_import(module)

numpy 已经安装
pandas 已经安装
random 已经安装
re 已经安装
time 已经安装
pickle 已经安装
sklearn 已经安装
joblib 已经安装
numba 已经安装
gensim 未安装，正在安装...
Collecting gensim
  Obtaining dependency information for gensim from https://files.pythonhosted.org/packages/e3/43/4feed7d79a69d886197a83389b6728ecaaa8839e51472da1228a818a69a7/gensim-4.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading gensim-4.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.3 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Obtaining dependency information for smart-open>=1.8.1 from https://files.pythonhosted.org/packages/7a/18/9a8d9f01957aa1f8bbc5676d54c2e33102d247e146c1a3679d3bd5cc2e3a/smart_open-7.1.0-py3-none-any.whl.metadata
  Downloading smart_open-7.1.0-py3-none-any.whl.metadata (24 kB)
Downloading gensim-4.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m24.6 MB/s[0m eta [36m0:00

In [4]:
# 基础模块
import numpy as np
import pandas as pd
import random
import re
import time
import warnings
import pickle

# 数据处理与预处理模块
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer, StandardScaler, PowerTransformer, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, cross_val_predict
from sklearn.linear_model import SGDRegressor
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression, SelectFromModel
from sklearn.metrics import mean_absolute_error, mean_squared_error

# 机器学习模型模块
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error,r2_score
from sklearn.neighbors import BallTree

# 并行与加速模块
from joblib import Parallel, delayed
from sklearn.utils import parallel_backend
from numba import njit

# 自然语言处理模块
from gensim.models import Word2Vec

# 数据可视化模块
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.font_manager as fm

# 地理计算模块
from geopy.distance import geodesic

# Excel 处理模块
from openpyxl import Workbook
from openpyxl.utils import get_column_letter
from openpyxl.styles import Alignment

# 深度学习模块
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense
print("GPU Available: ", tf.config.list_physical_devices('GPU'))

# 其他模块
import joblib
from tqdm import tqdm


In [5]:
# Load the training set and the test set (the rows to predict).
df = pd.read_csv('/home/mw/input/quant4533/ruc_Class25Q1_train.csv')
predict = pd.read_csv('/home/mw/input/quant4533/ruc_Class25Q1_test.csv')

# Load the per-community detail data and the rental price data.
details = pd.read_csv('/home/mw/input/quant4533/ruc_Class25Q1_details.csv')
rent = pd.read_csv('/home/mw/input/quant4533/ruc_Class25Q1_rent.csv')

In [6]:
# Preview the first rows of the training data.
df.head()

In [7]:
# Show the column dtypes and non-null counts of the training data.
df.info()

In [8]:
# Count the missing values in every column of the training data.
df.isnull().sum()

In [9]:
from sklearn.model_selection import train_test_split
# X holds the features, y the target variable.
X = df.drop(columns=['价格'])  # feature columns
y = df['价格']  # target column (price)

# Hold out 20% of the labelled data as a local test set (80% train).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=111)

# Sanity check: features and target must have the same number of rows.
print(f"X_train行数: {X_train.shape[0]}, y_train行数: {y_train.shape[0]}")
print(f"X_test行数: {X_test.shape[0]}, y_test行数: {y_test.shape[0]}")

# The split keeps the shuffled original labels; renumber from 0 so features
# and target share a simple positional index.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Report the size of the training and test sets.
print(f"训练集大小: {X_train.shape}")
print(f"测试集大小: {X_test.shape}")

# Prediction set. Work on a copy: later cells add columns to X_predict and
# overwrite its values in place, which would otherwise silently modify the
# raw `predict` frame and make re-running this cell start from altered data.
X_predict = predict.copy()

In [10]:
# IQR-based outlier detection and removal, driven by the target variable.
def handle_outliers(X_data, y_data, threshold=3):
    """Drop the samples whose target value lies outside the IQR fences.

    Args:
        X_data: Feature frame, indexed like ``y_data``.
        y_data: Target series used to compute the quartiles.
        threshold: Multiple of the IQR added beyond Q1/Q3 to form the fences.

    Returns:
        Tuple ``(X_kept, y_kept)`` with only the rows inside
        ``[Q1 - threshold * IQR, Q3 + threshold * IQR]`` (bounds inclusive).
        The original index labels are kept.
    """
    first_quartile, third_quartile = y_data.quantile(0.25), y_data.quantile(0.75)
    spread = third_quartile - first_quartile

    fence_low = first_quartile - threshold * spread
    fence_high = third_quartile + threshold * spread

    # True for rows that stay, False for outliers.
    keep = (y_data >= fence_low) & (y_data <= fence_high)

    n_outliers = (~keep).sum()
    print(f"在{len(y_data)}个样本中识别出{n_outliers}个异常值")

    return X_data[keep], y_data[keep]

# Remove target outliers from the training split only, using a 3.5 x IQR fence.
X_train, y_train = handle_outliers(X_train, y_train, threshold=3.5)

In [11]:
# Preview the first rows of the community details dataset.
details.head()

In [12]:
# Count the missing values in every column of the community details dataset.
details.isnull().sum()

In [13]:
# Preview the first rows of the rental price dataset.
rent.head()

In [14]:
# Count the missing values in every column of the rental price dataset.
rent.isnull().sum()

# 特征工程

In [15]:
# Create empty frames that will collect the engineered features. Each one
# reuses the index of its raw counterpart so columns assigned later align
# row by row.
X_train_cleaned = pd.DataFrame(index=X_train.index)
X_test_cleaned = pd.DataFrame(index=X_test.index)
X_predict_cleaned = pd.DataFrame(index=X_predict.index)

In [16]:
# Copy these columns unchanged from the raw frames into the cleaned frames.
for col in ['区域', '板块', '小区名称', '城市', '交易权属', '房屋用途', '产权所属']:
    X_train_cleaned[col] = X_train[col]
    X_test_cleaned[col] = X_test[col]
    X_predict_cleaned[col] = X_predict[col]

# Fill the gaps of one column and publish the result to the cleaned frame.
def fill_missing_values(df, df_cleaned, column, fill_value='未知'):
    """Replace missing entries of ``df[column]`` and mirror it into ``df_cleaned``.

    Both frames are modified in place; nothing is returned.

    Args:
        df: Raw frame whose column is filled (the column is overwritten).
        df_cleaned: Frame that receives the filled column under the same name.
        column: Name of the column to process.
        fill_value: Value substituted for missing entries.
    """
    filled = df[column].fillna(fill_value)
    df[column] = filled
    # Read the column back from df so both frames hold the same stored data.
    df_cleaned[column] = df[column]

# Fill missing values of these categorical columns with the default
# placeholder of fill_missing_values and copy them into the cleaned frames.
for col in ['建筑结构', '装修情况', '别墅类型']:
    fill_missing_values(X_train, X_train_cleaned, col)
    fill_missing_values(X_test, X_test_cleaned, col)
    fill_missing_values(X_predict, X_predict_cleaned, col)


In [17]:
# 1. Convert latitude/longitude to radians; the haversine metric used by the
#    BallTree below works on radians.
X_train['lat_rad'] = np.radians(X_train['lat'])
X_train['lon_rad'] = np.radians(X_train['lon'])

X_test['lat_rad'] = np.radians(X_test['lat'])
X_test['lon_rad'] = np.radians(X_test['lon'])

X_predict['lat_rad'] = np.radians(X_predict['lat'])
X_predict['lon_rad'] = np.radians(X_predict['lon'])

# 2. Build a BallTree from the training rows that have a ring-road value.
#    Points are stored as (lat_rad, lon_rad) pairs.
train_with_ring = X_train[X_train['环线'].notnull()].copy()
coords_train_with_ring = np.vstack((train_with_ring['lat_rad'], train_with_ring['lon_rad'])).T
tree_train_with_ring = BallTree(coords_train_with_ring, metric='haversine')

# 3. Helper that fills a missing ring-road value from the nearest known row.
def fill_missing_ring_using_balltree(row, df_with_ring, tree_with_ring, lat_col='lat', lon_col='lon', ring_col='环线'):
    """Return the row's ring-road value, borrowing the nearest neighbour's if missing.

    Args:
        row: One record providing ``lat_col``, ``lon_col`` and ``ring_col``.
        df_with_ring: Frame of donor rows; positions must match the points
            the tree was built from.
        tree_with_ring: Object exposing ``query(points, k)`` that returns
            ``(distances, positions)``; queried with one ``[lat, lon]`` point
            in radians.
        lat_col: Name of the latitude column (degrees).
        lon_col: Name of the longitude column (degrees).
        ring_col: Name of the ring-road column.

    Returns:
        The existing value when present, otherwise the donor's value.
    """
    existing = row[ring_col]
    if not pd.isnull(existing):
        return existing

    # Single query point shaped (1, 2), converted from degrees to radians.
    query_point = np.radians([row[lat_col], row[lon_col]]).reshape(1, -1)

    # Only the positions matter here; k=1 asks for the single closest donor.
    _, neighbour_positions = tree_with_ring.query(query_point, k=1)
    donor = df_with_ring.iloc[neighbour_positions[0][0]]

    return donor[ring_col]

# 4. Fill missing ring-road values in the training set from its nearest
#    training neighbour that has one.
X_train_cleaned['环线'] = X_train.apply(
    lambda row: fill_missing_ring_using_balltree(row, train_with_ring, tree_train_with_ring), axis=1
)

# 5. Fill the test and prediction sets from the nearest *training* neighbour,
#    so no information flows from held-out data into the imputation.
X_test_cleaned['环线'] = X_test.apply(
    lambda row: fill_missing_ring_using_balltree(row, train_with_ring, tree_train_with_ring), axis=1
)

X_predict_cleaned['环线'] = X_predict.apply(
    lambda row: fill_missing_ring_using_balltree(row, train_with_ring, tree_train_with_ring), axis=1
)

# 6. Drop the temporary radian columns from every frame they were added to.
#    The prediction frame was previously skipped and kept lat_rad/lon_rad.
X_train.drop(columns=['lat_rad', 'lon_rad'], inplace=True)
X_test.drop(columns=['lat_rad', 'lon_rad'], inplace=True)
X_predict.drop(columns=['lat_rad', 'lon_rad'], inplace=True)

In [18]:
# Turn off pandas' chained-assignment warning for the rest of the session.
pd.options.mode.chained_assignment = None

# Derive the elevator flag from whether an elevator/household ratio is given.
def process_elevator(df, df_cleaned, elevator_col, ratio_col):
    """Overwrite ``df[elevator_col]`` from ``ratio_col`` and copy it to ``df_cleaned``.

    A row is marked '有' when ``ratio_col`` holds a value and '无' when it is
    missing. Any previous content of ``elevator_col`` is replaced. Both
    frames are modified in place.

    Args:
        df: Raw frame holding ``ratio_col``; receives ``elevator_col``.
        df_cleaned: Frame that receives the resulting ``elevator_col``.
        elevator_col: Name of the elevator flag column to write.
        ratio_col: Name of the elevator/household ratio column to inspect.
    """
    has_ratio = df[ratio_col].notna()
    df[elevator_col] = has_ratio.map({True: '有', False: '无'})
    # The mapped flag has no gaps, so this call mainly copies the column into
    # df_cleaned.
    fill_missing_values(df, df_cleaned, elevator_col, '无')

# Derive the elevator flag for the train, test and prediction frames.
process_elevator(X_train, X_train_cleaned, '配备电梯', '梯户比例')
process_elevator(X_test, X_test_cleaned, '配备电梯', '梯户比例')
process_elevator(X_predict, X_predict_cleaned, '配备电梯', '梯户比例')

In [19]:
# The eight canonical compass directions used as orientation features.
standard_directions = ['东', '南', '西', '北', '东南', '东北', '西南', '西北']

# Maps reversed two-character spellings (e.g. "南东") onto the canonical form
# (e.g. "东南"); canonical two-character forms map to themselves.
direction_mapping = {
    '南东': '东南',
    '北东': '东北',
    '南西': '西南',
    '北西': '西北',
    '东南': '东南',
    '东北': '东北',
    '西南': '西南',
    '西北': '西北'
}

# Turn the free-text house orientation into one indicator column per direction.
def clean_directions_optimized(series):
    """Multi-hot encode a whitespace-separated orientation column.

    Each entry is split on whitespace, every token is normalised through
    ``direction_mapping`` and the result is binarised against
    ``standard_directions``. Tokens outside the standard directions are
    dropped by the binarizer (it reports them in a warning).

    Missing or non-string entries produce an all-zero row. They previously
    raised ``TypeError`` because a NaN cannot be iterated.

    Args:
        series: Series of orientation strings such as ``"南 北"``.

    Returns:
        DataFrame with one 0/1 column per entry of ``standard_directions``,
        indexed like ``series``.
    """
    def _normalise(raw):
        # Anything that is not text (NaN, None, numbers) has no directions.
        if not isinstance(raw, str):
            return []
        # Canonical names map to themselves, so one lookup per token gives
        # the same result as replacing each mapping key in turn.
        return [direction_mapping.get(token, token) for token in raw.split()]

    direction_split = series.apply(_normalise)

    # Fixed class list keeps the output columns identical across data sets.
    mlb = MultiLabelBinarizer(classes=standard_directions)
    return pd.DataFrame(mlb.fit_transform(direction_split), columns=mlb.classes_, index=series.index)

# Encode the house-orientation column and append the indicator columns to
# each cleaned frame.
X_train_cleaned = pd.concat([X_train_cleaned, clean_directions_optimized(X_train['房屋朝向'])], axis=1)
X_test_cleaned = pd.concat([X_test_cleaned, clean_directions_optimized(X_test['房屋朝向'])], axis=1)
X_predict_cleaned = pd.concat([X_predict_cleaned, clean_directions_optimized(X_predict['房屋朝向'])], axis=1)

# Turn the "、"-separated house-advantage text into indicator columns.
def process_advantages_optimized(series):
    """Multi-hot encode a column of "、"-separated advantage labels.

    Missing entries are treated as empty text. Labels are stripped of
    surrounding whitespace and empty labels are discarded. The binarizer is
    fitted on ``series`` itself, so the output columns are exactly the labels
    that occur in this particular series.

    Args:
        series: Series of advantage strings (may contain missing values).

    Returns:
        DataFrame with one 0/1 column per distinct label, indexed like
        ``series``.
    """
    def _labels(raw):
        return [piece.strip() for piece in raw.split('、') if piece.strip()]

    label_lists = series.fillna('').apply(_labels)

    binarizer = MultiLabelBinarizer()
    indicator_matrix = binarizer.fit_transform(label_lists)
    return pd.DataFrame(indicator_matrix, columns=binarizer.classes_, index=series.index)

# Encode the house-advantage column and append the indicator columns to each
# cleaned frame. Each call fits its own binarizer, so the three frames can
# end up with different advantage columns at this point.
X_train_cleaned = pd.concat([X_train_cleaned, process_advantages_optimized(X_train['房屋优势'])], axis=1)
X_test_cleaned = pd.concat([X_test_cleaned, process_advantages_optimized(X_test['房屋优势'])], axis=1)
X_predict_cleaned = pd.concat([X_predict_cleaned, process_advantages_optimized(X_predict['房屋优势'])], axis=1)


In [20]:
# 1. Map the ownership-duration label onto a number of years.
def process_house_age(age_info):
    """Convert an ownership-duration label into years.

    Args:
        age_info: One of '满五年', '满两年', '未满两年'; anything else
            (including missing values) is treated as unknown.

    Returns:
        5, 2 or 0 for the three known labels, otherwise ``None``.
    """
    years_by_label = {'满五年': 5, '满两年': 2, '未满两年': 0}
    # Only text can be one of the known labels; NaN and other types fall
    # through to None.
    if isinstance(age_info, str):
        return years_by_label.get(age_info)
    return None

# 2. Convert the ownership-duration column of the training set.
X_train_cleaned['房屋年限'] = X_train['房屋年限'].apply(process_house_age)

# 3. Median of the training values, computed before any filling; it is the
#    single fill value used for all three frames.
train_median_age = X_train_cleaned['房屋年限'].median()

# 4. Convert the same column of the test and prediction sets.
X_test_cleaned['房屋年限'] = X_test['房屋年限'].apply(process_house_age)
X_predict_cleaned['房屋年限'] = X_predict['房屋年限'].apply(process_house_age)


# 5. Fill the gaps in the training set.
X_train_cleaned['房屋年限'] = X_train_cleaned['房屋年限'].fillna(train_median_age)

# 6. Fill the gaps in the test and prediction sets with the training median.
X_test_cleaned['房屋年限'] = X_test_cleaned['房屋年限'].fillna(train_median_age)
X_predict_cleaned['房屋年限'] = X_predict_cleaned['房屋年限'].fillna(train_median_age)


In [21]:
# Replace missing text with a placeholder and force every entry to str so the
# text-cleaning steps can assume string input.
for col in ['核心卖点', '户型介绍', '周边配套', '交通出行']:
    X_train[col] = X_train[col].fillna('未知').astype(str)
    X_test[col] = X_test[col].fillna('未知').astype(str)
    X_predict[col] = X_predict[col].fillna('未知').astype(str)

# Text normalisation used before tokenising the description columns.
def clean_text(text):
    """Strip punctuation from ``text`` and lower-case the result.

    Every character that is neither a word character nor whitespace is
    removed; whitespace itself is kept.

    Args:
        text: Input string.

    Returns:
        The cleaned, lower-cased string.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    return without_punctuation.lower()

# Apply the text-cleaning function to every description column.
for col in ['核心卖点', '户型介绍', '周边配套', '交通出行']:
    X_train[col] = X_train[col].apply(clean_text)
    X_test[col] = X_test[col].apply(clean_text)
    X_predict[col] = X_predict[col].apply(clean_text)

# Turn each text into a list of tokens by splitting on whitespace.
# NOTE(review): punctuation has already been removed above, so a text without
# whitespace becomes a single token — confirm this tokenisation is intended
# for the Chinese descriptions.
for col in ['核心卖点', '户型介绍', '周边配套', '交通出行']:
    X_train[col] = X_train[col].apply(lambda x: x.split())
    X_test[col] = X_test[col].apply(lambda x: x.split())
    X_predict[col] = X_predict[col].apply(lambda x: x.split())

# Training corpus: the token lists of all four columns of the training set.
sentences_train = X_train['核心卖点'].tolist() + X_train['户型介绍'].tolist() + X_train['周边配套'].tolist() + X_train['交通出行'].tolist()

# Train the Word2Vec model on the training set only.
w2v_model = Word2Vec(sentences_train, vector_size=100, window=5, min_count=1, workers=4)

# Save the model to the current working directory.
w2v_model.save("word2vec_model.model")

# Reload it from disk to confirm the saved file is usable.
w2v_model = Word2Vec.load("word2vec_model.model")

# Keyed word vectors used for the lookups below.
word_vectors = w2v_model.wv

# Represent a token list by the mean of the vectors of its known tokens.
def get_average_word2vec(text, model, vector_size):
    """Average the embeddings of the tokens in ``text`` that ``model`` knows.

    Args:
        text: Iterable of tokens.
        model: Mapping-like object supporting ``token in model`` and
            ``model[token]``.
        vector_size: Length of the embedding vectors.

    Returns:
        A Python list of ``vector_size`` floats. It is all zeros when no
        token of ``text`` is present in ``model``.
    """
    known_vectors = [model[token] for token in text if token in model]

    # Accumulate in the original token order, starting from a float32 zero vector.
    total = np.zeros((vector_size,), dtype='float32')
    for vector in known_vectors:
        total = np.add(total, vector)

    if known_vectors:
        total = np.divide(total, len(known_vectors))
    return total.tolist()  # plain list so it can be stored in a DataFrame cell

# Build the averaged word-vector features for every text column.
# Work on copies so the list-valued helper columns are not added to the raw frames.
X_train_backup = X_train.copy()
X_test_backup = X_test.copy()
X_predict_backup = X_predict.copy()

# Each new "<column>_word2vec" cell holds one list of 100 floats.
for col in ['核心卖点', '户型介绍', '周边配套', '交通出行']:
    X_train_backup[f'{col}_word2vec'] = X_train[col].apply(lambda x: get_average_word2vec(x, word_vectors, 100))
    X_test_backup[f'{col}_word2vec'] = X_test[col].apply(lambda x: get_average_word2vec(x, word_vectors, 100))
    X_predict_backup[f'{col}_word2vec'] = X_predict[col].apply(lambda x: get_average_word2vec(x, word_vectors, 100))

# Expand every embedding list into 100 separate numeric columns.
word2vec_features = ['核心卖点_word2vec', '户型介绍_word2vec', '周边配套_word2vec', '交通出行_word2vec']

for feature in word2vec_features:
    # Training set: one column per embedding dimension, named "<feature>_<i>".
    vectors_train = pd.DataFrame(X_train_backup[feature].tolist(), index=X_train_backup.index)
    vectors_train.columns = [f'{feature}_{i}' for i in range(100)]  # name each column
    X_train_cleaned = pd.concat([X_train_cleaned, vectors_train], axis=1)
    
    # Test set: same expansion.
    vectors_test = pd.DataFrame(X_test_backup[feature].tolist(), index=X_test_backup.index)
    vectors_test.columns = [f'{feature}_{i}' for i in range(100)]  # name each column
    X_test_cleaned = pd.concat([X_test_cleaned, vectors_test], axis=1)

    # Prediction set: same expansion.
    vectors_predict = pd.DataFrame(X_predict_backup[feature].tolist(), index=X_predict_backup.index)
    vectors_predict.columns = [f'{feature}_{i}' for i in range(100)]  # name each column
    X_predict_cleaned = pd.concat([X_predict_cleaned, vectors_predict], axis=1)


In [22]:
# Categorical columns that are replaced by dummy (one-hot) columns.
categorical_columns = [
    '环线', '建筑结构', '装修情况', '配备电梯',
    '别墅类型', '交易权属', '房屋用途', '产权所属'
]

# One-hot encode each column with the category list learned from the
# training set. Encoding every frame independently with drop_first=True let
# the dropped baseline differ between frames (whenever a frame lacked the
# training set's first category), which mis-encoded rows after the column
# reindex further down. Sharing one category list keeps the baseline and the
# column set identical for all three frames.
for col in categorical_columns:
    # Same category order get_dummies would infer for the training column,
    # so the training result is unchanged.
    train_categories = pd.Categorical(X_train_cleaned[col]).categories
    shared_dtype = pd.CategoricalDtype(categories=train_categories)

    # Training set: replace the original column by its dummy columns.
    dummies_train = pd.get_dummies(X_train_cleaned[col].astype(shared_dtype), prefix=col, drop_first=True)
    X_train_cleaned = pd.concat([X_train_cleaned.drop(columns=[col]), dummies_train], axis=1)

    # Test set: values never seen in training become missing, i.e. an
    # all-zero row, instead of creating columns the training set lacks.
    dummies_test = pd.get_dummies(X_test_cleaned[col].astype(shared_dtype), prefix=col, drop_first=True)
    X_test_cleaned = pd.concat([X_test_cleaned.drop(columns=[col]), dummies_test], axis=1)

    # Prediction set: same treatment as the test set.
    dummies_predict = pd.get_dummies(X_predict_cleaned[col].astype(shared_dtype), prefix=col, drop_first=True)
    X_predict_cleaned = pd.concat([X_predict_cleaned.drop(columns=[col]), dummies_predict], axis=1)


# Align the test and prediction frames to the training columns. Columns they
# lack are added filled with 0; columns the training set lacks are dropped.
X_test_cleaned = X_test_cleaned.reindex(columns=X_train_cleaned.columns, fill_value=0)
X_predict_cleaned = X_predict_cleaned.reindex(columns=X_train_cleaned.columns, fill_value=0)


In [23]:
# Copy the coordinate and year columns unchanged into the cleaned frames.
for col in ['lon', 'lat', '年份']:
    X_train_cleaned[col] = X_train[col]
    X_test_cleaned[col] = X_test[col]
    X_predict_cleaned[col] = X_predict[col]


In [24]:
# 1. Strip the "㎡" unit from the gross and inner floor-area columns and
#    convert them to float. Missing values become NaN.
for raw_frame, cleaned_frame in (
    (X_train, X_train_cleaned),
    (X_test, X_test_cleaned),
    (X_predict, X_predict_cleaned),
):
    for area_col in ('建筑面积', '套内面积'):
        cleaned_frame[area_col] = (
            raw_frame[area_col].astype(str).str.replace('㎡', '', regex=False).astype(float)
        )

# 2-3. Average inner/gross area ratio, taken from the training set only.
#      Rows without an inner area give NaN and are ignored by mean().
average_ratio = (X_train_cleaned['套内面积'] / X_train_cleaned['建筑面积']).mean()

# 4. Fill every missing inner area with gross area x training average ratio.
#    A vectorised fillna replaces the previous row-by-row apply over the
#    whole (very wide) frame; the filled values are the same.
for cleaned_frame in (X_train_cleaned, X_test_cleaned, X_predict_cleaned):
    cleaned_frame['套内面积'] = cleaned_frame['套内面积'].fillna(
        cleaned_frame['建筑面积'] * average_ratio
    )



In [25]:
# 1. Parse a layout description into room counts.
def extract_room_info(house_type):
    """Extract bedroom/living-room/kitchen/bathroom counts from a layout string.

    Args:
        house_type: Layout text such as ``"3室2厅1厨1卫"``.

    Returns:
        Dict with keys '室', '厅', '厨', '卫'. A unit that does not appear in
        the text (or any non-string input) yields 0. When a unit appears
        more than once, the last occurrence wins.
    """
    counts = dict.fromkeys(['室', '厅', '厨', '卫'], 0)

    # Non-string input (e.g. NaN) carries no layout information.
    if not isinstance(house_type, str):
        return counts

    # Each match is a run of digits immediately followed by one unit character.
    for found in re.finditer(r'(\d+)(室|厅|厨|卫)', house_type):
        counts[found.group(2)] = int(found.group(1))

    return counts

# 2. Process the layout column of the training set.
# Replace missing layouts with an empty string before parsing.
X_train['房屋户型'] = X_train['房屋户型'].fillna('')

# One new column per room type, each parsed from the layout text.
X_train_cleaned['室'] = X_train['房屋户型'].apply(lambda x: extract_room_info(x)['室'])
X_train_cleaned['厅'] = X_train['房屋户型'].apply(lambda x: extract_room_info(x)['厅'])
X_train_cleaned['厨'] = X_train['房屋户型'].apply(lambda x: extract_room_info(x)['厨'])
X_train_cleaned['卫'] = X_train['房屋户型'].apply(lambda x: extract_room_info(x)['卫'])

# 3. Fill missing values in the training set.
# NOTE(review): extract_room_info returns 0 (never NaN) when a unit is not
# found, so the fillna calls in this cell look like no-ops and unparsed
# layouts stay at 0 — confirm whether median filling was the intent.
# Bedrooms and living rooms: training median.
X_train_cleaned['室'] = X_train_cleaned['室'].fillna(X_train_cleaned['室'].median())
X_train_cleaned['厅'] = X_train_cleaned['厅'].fillna(X_train_cleaned['厅'].median())

# Kitchens and bathrooms: a common value (1).
X_train_cleaned['厨'] = X_train_cleaned['厨'].fillna(1)
X_train_cleaned['卫'] = X_train_cleaned['卫'].fillna(1)

# 4. Process the layout column of the test and prediction sets.
# Replace missing layouts with an empty string before parsing.
X_test['房屋户型'] = X_test['房屋户型'].fillna('')
X_predict['房屋户型'] = X_predict['房屋户型'].fillna('')

# One new column per room type, each parsed from the layout text.
for col in ['室','厅','厨','卫']:
    X_test_cleaned[col] = X_test['房屋户型'].apply(lambda x: extract_room_info(x)[col])  
    X_predict_cleaned[col] = X_predict['房屋户型'].apply(lambda x: extract_room_info(x)[col])  

# 5. Fill the test and prediction sets with the training median / common value.
X_test_cleaned['室'] = X_test_cleaned['室'].fillna(X_train_cleaned['室'].median())
X_test_cleaned['厅'] = X_test_cleaned['厅'].fillna(X_train_cleaned['厅'].median())
X_test_cleaned['厨'] = X_test_cleaned['厨'].fillna(1)  # common value 1
X_test_cleaned['卫'] = X_test_cleaned['卫'].fillna(1)  # common value 1

X_predict_cleaned['室'] = X_predict_cleaned['室'].fillna(X_train_cleaned['室'].median())
X_predict_cleaned['厅'] = X_predict_cleaned['厅'].fillna(X_train_cleaned['厅'].median())
X_predict_cleaned['厨'] = X_predict_cleaned['厨'].fillna(1)  # common value 1
X_predict_cleaned['卫'] = X_predict_cleaned['卫'].fillna(1)  # common value 1

# Show the first rows of the result.
print(X_train_cleaned[['室', '厅', '厨', '卫']].head())
print(X_test_cleaned[['室', '厅', '厨', '卫']].head())
print(X_predict_cleaned[['室', '厅', '厨', '卫']].head())


In [26]:
# Print the row counts of the raw and cleaned train/prediction frames.
print(f"X_train行数: {len(X_train)}, X_predict行数: {len(X_predict)}")
print(f"X_train_cleaned行数: {len(X_train_cleaned)}, X_predict_cleaned行数: {len(X_predict_cleaned)}")

In [27]:
# 1. Parse the floor description into a numeric floor estimate and floor count.
def process_floor(floor_info):
    """Parse a floor description such as ``"高楼层 (共18层)"``.

    Args:
        floor_info: Text made of a floor band (低楼层/中楼层/高楼层/地下室/
            顶层/底层), one space, and ``(共N层)`` with the total floor count.

    Returns:
        Tuple ``(floor_value, total_floors)``:
        high band -> 5/6 of the total, middle -> 3/6, low -> 1/6,
        top floor -> the total, ground floor -> 1, basement -> -1.
        ``(None, None)`` when the input is not a string (e.g. NaN) or does
        not match the expected format.
    """
    # re.match raises TypeError on NaN/None; treat those as unparseable.
    if not isinstance(floor_info, str):
        return None, None

    match = re.match(r'(低楼层|中楼层|高楼层|地下室|顶层|底层) \(共(\d+)层\)', floor_info)
    if not match:
        return None, None

    floor_type = match.group(1)  # floor band
    total_floors = int(match.group(2))  # total number of floors

    if floor_type == '高楼层':
        return total_floors * 5 / 6, total_floors
    if floor_type == '中楼层':
        return total_floors * 3 / 6, total_floors
    if floor_type == '低楼层':
        return total_floors * 1 / 6, total_floors
    if floor_type == '顶层':
        return total_floors, total_floors
    if floor_type == '底层':
        return 1, total_floors
    # Remaining alternative of the pattern: basement.
    return -1, total_floors

# 2. Training set: parse the floor column. process_floor returns a pair per
#    row; zip(*...) transposes the pairs into the two target columns.
X_train_cleaned['楼层数值'], X_train_cleaned['总层数'] = zip(*X_train['所在楼层'].apply(process_floor))

# 3. Test and prediction sets: same parsing.
X_test_cleaned['楼层数值'], X_test_cleaned['总层数'] = zip(*X_test['所在楼层'].apply(process_floor))
X_predict_cleaned['楼层数值'], X_predict_cleaned['总层数'] = zip(*X_predict['所在楼层'].apply(process_floor))


In [28]:
# 1. Mapping from Chinese numerals to their numeric value. '两' is the
#    colloquial form of 2 used in counts such as "两梯四户".
chinese_num_map = {
    '零': 0, '一': 1, '二': 2, '两': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
    '十': 10, '百': 100, '千': 1000, '万': 10000
}

# 2. Convert a Chinese numeral string to an integer.
def chinese_to_arabic(chinese_digits):
    """Convert a Chinese numeral such as '一千零二十二' to an int.

    Handles a unit without a leading digit ('十' -> 10, '十二' -> 12), which
    the previous right-to-left scan turned into 0 and 2 respectively.
    Characters that are not in ``chinese_num_map`` are ignored.

    Args:
        chinese_digits: String of Chinese numeral characters.

    Returns:
        The integer value; 0 for an empty string.
    """
    total = 0    # value of the completed 万 sections
    section = 0  # value accumulated below the current 万
    digit = 0    # digit waiting for its unit
    for char in chinese_digits:
        if char not in chinese_num_map:
            continue
        value = chinese_num_map[char]
        if value < 10:
            digit = value
        elif value == 10000:
            total += (section + digit) * value
            section = 0
            digit = 0
        else:
            # A bare unit (as in "十二") implies a leading one.
            section += (digit or 1) * value
            digit = 0
    return total + section + digit

# 3. Parse the elevator/household text and compute households per elevator.
def extract_elevator_house_ratio(ratio_info):
    """Return households per elevator parsed from text like '两梯四户'.

    Args:
        ratio_info: Text of the form '<n>梯<m>户' written with Chinese numerals.

    Returns:
        ``m / n`` as a float, or ``None`` when the input is not a string,
        does not match the format, or the elevator count is 0.
    """
    if not isinstance(ratio_info, str):
        return None

    # Numerals are written in Chinese, e.g. "一梯一千零二十二户".
    match = re.match(r'([零一二两三四五六七八九十百千万]+)梯([零一二两三四五六七八九十百千万]+)户', ratio_info)

    if match:
        elevators = chinese_to_arabic(match.group(1))
        households = chinese_to_arabic(match.group(2))
        if elevators > 0:
            return households / elevators
    return None

# 4. Parse the elevator/household ratio; entries that are not text become missing.
def _parse_ratio(raw):
    return extract_elevator_house_ratio(raw) if isinstance(raw, str) else None

X_train_cleaned['梯户比例'] = X_train['梯户比例'].apply(_parse_ratio)

# 5. Mean of the parsed training ratios, taken before any filling; it is the
#    fill value for all three frames.
train_mean_ratio = X_train_cleaned['梯户比例'].mean()

# 6. Parse the same column of the test and prediction sets.
for raw_frame, cleaned_frame in ((X_test, X_test_cleaned), (X_predict, X_predict_cleaned)):
    cleaned_frame['梯户比例'] = raw_frame['梯户比例'].apply(_parse_ratio)

# 7-8. Fill the gaps of every frame with the training mean.
for cleaned_frame in (X_train_cleaned, X_test_cleaned, X_predict_cleaned):
    cleaned_frame['梯户比例'] = cleaned_frame['梯户比例'].fillna(train_mean_ratio)



In [29]:
# Print the first rows of each cleaned frame to inspect the result.
print(X_train_cleaned.head())
print(X_test_cleaned.head())
print(X_predict_cleaned.head())

In [30]:
# Shared helper functions.
def extract_number(text, suffix=''):
    """Extract a number from text such as ``"1200户"`` or ``"35%"``.

    Args:
        text: Input value; anything that is not a string yields ``None``.
        suffix: Optional trailing unit removed before parsing.

    Returns:
        The float built from the digits and dots left in ``text``, or
        ``None`` when they do not form a valid number (e.g. no digits).
    """
    if not isinstance(text, str):
        return None
    # Strip the unit suffix when present.
    if suffix and text.endswith(suffix):
        text = text[:-len(suffix)]
    # Keep only digits and dots, then parse. float() signals unparseable
    # text with ValueError; catching just that avoids hiding other errors
    # (the previous bare except also swallowed KeyboardInterrupt).
    try:
        return float(re.sub(r'[^\d.]', '', text))
    except ValueError:
        return None

def extract_year(year_info):
    """Parse a construction-year description into a single year.

    Args:
        year_info: Text starting with either a range (``"1970-1979年"``) or a
            single year (``"2010年"``).

    Returns:
        The integer midpoint of a range (floor division), the year itself for
        a single year, or ``None`` for non-string or unrecognised input.
    """
    if not isinstance(year_info, str):
        return None

    # One pattern covers both shapes: the "-YYYY" part is optional.
    parsed = re.match(r'(\d{4})(?:-(\d{4}))?年', year_info)
    if parsed is None:
        return None

    first_year = int(parsed.group(1))
    if parsed.group(2) is None:
        return first_year  # single year, e.g. "2010年"
    return (first_year + int(parsed.group(2))) // 2  # midpoint of the range


def _nearest_reference_positions(query_rad, reference_rad, chunk_size=512):
    """Return, for each query point, the row position of the closest reference point.

    Both inputs are ``(n, 2)`` arrays of ``[latitude, longitude]`` in radians;
    closeness is great-circle (haversine) distance.
    """
    try:
        from sklearn.neighbors import BallTree
    except ImportError:
        BallTree = None

    if BallTree is not None:
        tree = BallTree(reference_rad, metric='haversine')
        _, positions = tree.query(query_rad, k=1)
        return positions[:, 0]

    # Fallback without scikit-learn: brute-force haversine, in chunks to
    # bound the size of the intermediate (queries x references) matrix.
    ref_lat = reference_rad[:, 0]
    ref_lon = reference_rad[:, 1]
    cos_ref_lat = np.cos(ref_lat)
    nearest = np.empty(len(query_rad), dtype=np.intp)
    for start in range(0, len(query_rad), chunk_size):
        block = query_rad[start:start + chunk_size]
        block_lat = block[:, [0]]
        block_lon = block[:, [1]]
        # The haversine term grows monotonically with distance, so its
        # argmin is the nearest point.
        hav = (np.sin((block_lat - ref_lat) / 2) ** 2
               + np.cos(block_lat) * cos_ref_lat * np.sin((block_lon - ref_lon) / 2) ** 2)
        nearest[start:start + chunk_size] = hav.argmin(axis=1)
    return nearest


def match_community_features_fixed(df, community_features, community_columns):
    """
    Attach community-level features to ``df`` without changing its rows.

    Matching happens in three stages:
      1. exact match on (城市, 小区名称) against (城市, 名称) of
         ``community_features``, using the first record per key;
      2. rows whose first feature column is still missing take all feature
         values from the nearest community of the same city (haversine
         distance on lat/lon vs coord_y/coord_x);
      3. anything still missing is filled with the column median of
         ``community_features``.

    Args:
        df: Frame to enrich; needs the columns 城市, 小区名称, lon and lat.
        community_features: Frame with 城市, 名称, coord_x (longitude),
            coord_y (latitude) and the feature columns.
        community_columns: Names of the feature columns to attach.

    Returns:
        A copy of ``df`` with the feature columns added; the row count, row
        order and index are unchanged.
    """
    result_df = df.copy()

    # Start every feature column as missing.
    for col in community_columns:
        result_df[col] = np.nan

    # 1. Exact match through a key -> position lookup. This replaces a
    #    per-row scan of the whole community table.
    keyed = community_features.dropna(subset=['城市', '名称'])
    first_records = keyed.drop_duplicates(subset=['城市', '名称'], keep='first').reset_index(drop=True)
    position_by_key = {
        key: pos for pos, key in enumerate(zip(first_records['城市'], first_records['名称']))
    }
    matched_positions = [
        position_by_key.get(key, -1) for key in zip(result_df['城市'], result_df['小区名称'])
    ]
    for col in community_columns:
        if col in first_records.columns:
            # Position -1 is not in the index, so unmatched rows stay missing.
            result_df[col] = first_records[col].reindex(matched_positions).to_numpy()

    # 2. Nearest same-city community for rows still missing the first feature.
    missing_mask = result_df[community_columns[0]].isna()
    missing_count = missing_mask.sum()

    if missing_count > 0:
        print(f"精确匹配后仍有 {missing_count} 条记录需要通过经纬度匹配")

        # Donors need complete features and coordinates.
        reference = community_features.dropna(subset=community_columns)
        reference = reference.dropna(subset=['coord_x', 'coord_y'])

        # Rows to fill, addressed by position; rows without coordinates are skipped.
        pending = pd.DataFrame({
            'position': np.flatnonzero(missing_mask.to_numpy()),
            'city': result_df.loc[missing_mask, '城市'].to_numpy(),
            'lat': result_df.loc[missing_mask, 'lat'].to_numpy(),
            'lon': result_df.loc[missing_mask, 'lon'].to_numpy(),
        }).dropna(subset=['lat', 'lon'])

        filled = {}
        for col in community_columns:
            values = result_df[col].to_numpy(copy=True)
            if values.dtype.kind in 'iub':
                # Avoid silently truncating donor values into an integer column.
                values = values.astype(float)
            filled[col] = values

        # The search is restricted to the row's own city on every path
        # (the previous BallTree path searched across all cities).
        for city, group in pending.groupby('city', sort=False):
            city_reference = reference[reference['城市'] == city]
            if city_reference.empty:
                continue
            query_rad = np.radians(group[['lat', 'lon']].to_numpy(dtype=float))
            reference_rad = np.radians(city_reference[['coord_y', 'coord_x']].to_numpy(dtype=float))
            nearest = _nearest_reference_positions(query_rad, reference_rad)
            target_positions = group['position'].to_numpy()
            for col in community_columns:
                filled[col][target_positions] = city_reference[col].to_numpy()[nearest]

        for col in community_columns:
            result_df[col] = filled[col]

    # 3. Last resort: column median of the community table. The count is
    #    taken before filling (it was read afterwards and always printed 0).
    for col in community_columns:
        n_missing = int(result_df[col].isna().sum())
        if n_missing > 0:
            global_median = community_features[col].median()
            result_df[col] = result_df[col].fillna(global_median)
            print(f"列 {col} 使用全局中位数 {global_median} 填充了 {n_missing} 个缺失值")

    # 4. Sanity check on the row count.
    if len(result_df) != len(df):
        print(f"警告: 函数返回行数 {len(result_df)} 与输入行数 {len(df)} 不一致，这不应该发生！")

    return result_df

# Work on a copy of the community details data.
details_processed = details.copy()


# Parse the construction-year text into a numeric year.
details_processed['建筑年代数值'] = details_processed['建筑年代'].apply(extract_year)

# Median construction year per (city, plate) and per city.
plate_year_median = details_processed.groupby(['城市', '板块'])['建筑年代数值'].median().reset_index()
plate_year_median.columns = ['城市', '板块', '板块建筑年代中位数']

city_year_median = details_processed.groupby('城市')['建筑年代数值'].median().reset_index()
city_year_median.columns = ['城市', '城市建筑年代中位数']

# Median construction year over the whole details table.
global_year_median = details_processed['建筑年代数值'].median()

# Merge the medians back and fill missing years from the most specific level
# available: plate first, then city, then global.
details_processed = pd.merge(details_processed, plate_year_median, on=['城市', '板块'], how='left')
details_processed = pd.merge(details_processed, city_year_median, on='城市', how='left')

details_processed['建筑年代数值'] = details_processed['建筑年代数值'].fillna(details_processed['板块建筑年代中位数'])
details_processed['建筑年代数值'] = details_processed['建筑年代数值'].fillna(details_processed['城市建筑年代中位数'])
details_processed['建筑年代数值'] = details_processed['建筑年代数值'].fillna(global_year_median)

# Reference year = latest year seen in any of the three data sets; the
# building age is measured relative to it.
current_year = max(X_train['年份'].max(), X_test['年份'].max(), X_predict['年份'].max())
details_processed['楼龄'] = current_year - details_processed['建筑年代数值']

# Community feature table handed to the matching function.
community_features = details_processed[['城市', '名称', 'coord_x', 'coord_y', '楼龄']]

# Attach the building-age feature to the three cleaned frames.
year_columns = ['楼龄']
print("开始处理建筑年代特征...")
print(f"处理前训练集行数: {len(X_train_cleaned)}")
print(f"处理前测试集行数: {len(X_test_cleaned)}")
print(f"处理前预测集行数: {len(X_predict_cleaned)}")

X_train_cleaned = match_community_features_fixed(X_train_cleaned, community_features, year_columns)
X_test_cleaned = match_community_features_fixed(X_test_cleaned, community_features, year_columns)
X_predict_cleaned = match_community_features_fixed(X_predict_cleaned, community_features, year_columns)

print(f"处理后训练集行数: {len(X_train_cleaned)}")
print(f"处理后测试集行数: {len(X_test_cleaned)}")
print(f"处理后预测集行数: {len(X_predict_cleaned)}")


In [31]:
# Parse the total number of homes and of buildings into numbers, stripping
# their unit suffixes.
details_processed['房屋总数数值'] = details_processed['房屋总数'].apply(lambda x: extract_number(x, '户'))
details_processed['楼栋总数数值'] = details_processed['楼栋总数'].apply(lambda x: extract_number(x, '栋'))

# Community feature table for this step (keys, coordinates, the two counts).
community_features = details_processed[['城市', '名称', 'coord_x', 'coord_y', 
                                      '房屋总数数值', '楼栋总数数值']].copy()

# Feature columns to attach.
housing_columns = ['房屋总数数值', '楼栋总数数值']

# Remember the row counts so they can be compared after matching.
train_len_before = len(X_train_cleaned)
test_len_before = len(X_test_cleaned)
predict_len_before = len(X_predict_cleaned)

X_train_cleaned = match_community_features_fixed(X_train_cleaned, community_features, housing_columns)
X_test_cleaned = match_community_features_fixed(X_test_cleaned, community_features, housing_columns)
X_predict_cleaned = match_community_features_fixed(X_predict_cleaned, community_features, housing_columns)

# Report the row counts before and after matching.
print(f"处理房屋总数和楼栋总数前后长度变化:")
print(f"训练集: {train_len_before} -> {len(X_train_cleaned)}")
print(f"测试集: {test_len_before} -> {len(X_test_cleaned)}")
print(f"预测集: {predict_len_before} -> {len(X_predict_cleaned)}")

In [32]:
# Parse the greening-rate text into a number, stripping the '%' suffix.
details_processed['绿化率数值'] = details_processed['绿 化 率'].apply(lambda x: extract_number(x, '%'))

# Median greening rate per (city, plate) and per city.
plate_green_median = details_processed.groupby(['城市', '板块'])['绿化率数值'].median().reset_index()
plate_green_median.columns = ['城市', '板块', '板块绿化率中位数']

city_green_median = details_processed.groupby('城市')['绿化率数值'].median().reset_index()
city_green_median.columns = ['城市', '城市绿化率中位数']

# Median greening rate over the whole details table.
global_green_median = details_processed['绿化率数值'].median()

# Merge the medians back and fill missing values: plate, then city, then global.
details_processed = pd.merge(details_processed, plate_green_median, on=['城市', '板块'], how='left')
details_processed = pd.merge(details_processed, city_green_median, on='城市', how='left')

details_processed['绿化率数值'] = details_processed['绿化率数值'].fillna(details_processed['板块绿化率中位数'])
details_processed['绿化率数值'] = details_processed['绿化率数值'].fillna(details_processed['城市绿化率中位数'])
details_processed['绿化率数值'] = details_processed['绿化率数值'].fillna(global_green_median)

# Plot ratio: the column is used directly as a number here, so only missing
# values are handled.
# Median plot ratio per (city, plate) and per city.
plate_vol_median = details_processed.groupby(['城市', '板块'])['容 积 率'].median().reset_index()
plate_vol_median.columns = ['城市', '板块', '板块容积率中位数']

city_vol_median = details_processed.groupby('城市')['容 积 率'].median().reset_index()
city_vol_median.columns = ['城市', '城市容积率中位数']

# Median plot ratio over the whole details table.
global_vol_median = details_processed['容 积 率'].median()

# Merge the medians back and fill missing values: plate, then city, then global.
details_processed = pd.merge(details_processed, plate_vol_median, on=['城市', '板块'], how='left')
details_processed = pd.merge(details_processed, city_vol_median, on='城市', how='left')

details_processed['容积率数值'] = details_processed['容 积 率'].fillna(details_processed['板块容积率中位数'])
details_processed['容积率数值'] = details_processed['容积率数值'].fillna(details_processed['城市容积率中位数'])
details_processed['容积率数值'] = details_processed['容积率数值'].fillna(global_vol_median)

# Add the two rates to the existing community feature table.
# NOTE(review): the assignments align on the index, so they rely on
# community_features and details_processed still sharing row labels after
# the merges above — confirm.
community_features = community_features.copy()
community_features['绿化率数值'] = details_processed['绿化率数值']
community_features['容积率数值'] = details_processed['容积率数值']

# Feature columns to attach.
rate_columns = ['绿化率数值', '容积率数值']

# Remember the row counts so they can be compared after matching.
train_len_before = len(X_train_cleaned)
test_len_before = len(X_test_cleaned)
predict_len_before = len(X_predict_cleaned)

X_train_cleaned = match_community_features_fixed(X_train_cleaned, community_features, rate_columns)
X_test_cleaned = match_community_features_fixed(X_test_cleaned, community_features, rate_columns)
X_predict_cleaned = match_community_features_fixed(X_predict_cleaned, community_features, rate_columns)

# Report the row counts before and after matching.
print(f"处理绿化率容积率前后长度变化:")
print(f"训练集: {train_len_before} -> {len(X_train_cleaned)}")
print(f"测试集: {test_len_before} -> {len(X_test_cleaned)}")
print(f"预测集: {predict_len_before} -> {len(X_predict_cleaned)}")

In [33]:
# Parking spaces.
# Communities are bucketed by their number of homes; the bucket is used as a
# grouping key when filling missing parking values.
def categorize_community_size(house_count):
    """Classify a community by its total number of homes.

    Args:
        house_count: Number of homes; may be missing.

    Returns:
        "小型" below 100 homes, "中型" from 100 up to (not including) 500,
        "大型" from 500 upwards, and ``None`` when the count is missing.
    """
    if pd.isna(house_count):
        return None

    # Exclusive upper limits checked in ascending order.
    for upper_limit, label in ((100, "小型"), (500, "中型")):
        if house_count < upper_limit:
            return label
    return "大型"

# Derive the community-size bucket for every community.
details_processed['小区规模'] = details_processed['房屋总数数值'].apply(categorize_community_size)

# Median parking-space count per (city, size bucket).
size_parking_median = (
    details_processed.groupby(['城市', '小区规模'])['停车位']
    .median()
    .rename('规模停车位中位数')
    .reset_index()
)

# Median parking-space count per city.
city_parking_median = (
    details_processed.groupby('城市')['停车位']
    .median()
    .rename('城市停车位中位数')
    .reset_index()
)

# Global median, used as the last-resort fallback.
global_parking_median = details_processed['停车位'].median()

# Attach both median tables to every community row.
details_processed = (
    details_processed
    .merge(size_parking_median, on=['城市', '小区规模'], how='left')
    .merge(city_parking_median, on='城市', how='left')
)

# Fill missing parking counts from the most specific median available.
details_processed['停车位数值'] = (
    details_processed['停车位']
    .fillna(details_processed['规模停车位中位数'])
    .fillna(details_processed['城市停车位中位数'])
    .fillna(global_parking_median)
)


# Parking fee
def extract_parking_fee(fee_info):
    """Parse a free-text parking-fee description into a number.

    Parameters:
    fee_info - raw fee text; anything that is not a string is treated as missing

    Returns:
    0.0 when the text explicitly says parking is free ('免费'),
    None when the text is a "no data" placeholder or contains no number,
    otherwise the first number found in the text (decimals preserved) as a float.
    """
    if not isinstance(fee_info, str):
        return None

    text = fee_info.strip()

    # Explicitly free parking is a real observation of a zero fee.
    if text == '免费':
        return 0.0

    # Placeholders mean "unknown"; return None so the value is imputed later
    # instead of being counted as a zero fee when medians are computed.
    if text in ('暂无', '无', '-', '/', '——'):
        return None

    # Take the first number in the text; the optional fractional part keeps
    # values such as "1.5" from being truncated to 1.
    match = re.search(r'\d+(?:\.\d+)?', text)
    return float(match.group()) if match else None

# Parse the raw fee text into a numeric column.
details_processed['停车费用数值'] = details_processed['停车费用'].apply(extract_parking_fee)

# Median parking fee per (city, size bucket).
size_fee_median = (
    details_processed.groupby(['城市', '小区规模'])['停车费用数值']
    .median()
    .rename('规模停车费中位数')
    .reset_index()
)

# Median parking fee per city.
city_fee_median = (
    details_processed.groupby('城市')['停车费用数值']
    .median()
    .rename('城市停车费中位数')
    .reset_index()
)

# Global median, used as the last-resort fallback.
global_fee_median = details_processed['停车费用数值'].median()

# Attach both median tables to every community row.
details_processed = (
    details_processed
    .merge(size_fee_median, on=['城市', '小区规模'], how='left')
    .merge(city_fee_median, on='城市', how='left')
)

# Fill missing fees from the most specific median available.
details_processed['停车费用数值'] = (
    details_processed['停车费用数值']
    .fillna(details_processed['规模停车费中位数'])
    .fillna(details_processed['城市停车费中位数'])
    .fillna(global_fee_median)
)

# Add parking count and fee to a fresh copy of the community feature table.
# NOTE(review): the new columns are aligned on the row index - confirm that
# community_features and details_processed share the same index.
community_features = community_features.assign(**{
    '停车位数值': details_processed['停车位数值'],
    '停车费用数值': details_processed['停车费用数值'],
})

# Propagate the parking features onto the three modelling frames.
parking_columns = ['停车位数值', '停车费用数值']

# Remember the row counts so that an unexpected change caused by matching is visible.
train_len_before = len(X_train_cleaned)
test_len_before = len(X_test_cleaned)
predict_len_before = len(X_predict_cleaned)

# All three datasets must receive the SAME columns. The test and predict sets
# were previously matched with `rate_columns`, so they never got the parking
# features and had the rate columns matched a second time.
X_train_cleaned = match_community_features_fixed(X_train_cleaned, community_features, parking_columns)
X_test_cleaned = match_community_features_fixed(X_test_cleaned, community_features, parking_columns)
X_predict_cleaned = match_community_features_fixed(X_predict_cleaned, community_features, parking_columns)

# Report row counts before -> after matching.
print(f"处理停车位停车费前后长度变化:")
print(f"训练集: {train_len_before} -> {len(X_train_cleaned)}")
print(f"测试集: {test_len_before} -> {len(X_test_cleaned)}")
print(f"预测集: {predict_len_before} -> {len(X_predict_cleaned)}")

In [34]:
# 1. Rental data: compute the monthly rent per square metre
def extract_area(area_str):
    """Extract the numeric value from an area string.

    Parameters:
    area_str - raw area text; anything that is not a string is treated as missing

    Returns:
    The digits and dots of the text parsed as a float, or None when the input
    is not a string or the remaining characters do not form a valid number.
    """
    if not isinstance(area_str, str):
        return None

    # Drop everything except digits and dots, then parse what is left.
    digits_only = re.sub(r'[^\d.]', '', area_str)
    try:
        return float(digits_only)
    except ValueError:
        # Empty or malformed remainder (e.g. "" or "1.2.3"). Only the parse
        # failure is swallowed, so interrupts and real bugs still surface.
        return None

# Numeric area of every rental listing.
rent['面积数值'] = rent['面积'].apply(extract_area)


def _rent_per_sqm(row):
    """Monthly rent divided by area, or None when the area is missing or not positive."""
    area = row['面积数值']
    if pd.notna(area) and area > 0:
        return row['价格'] / area
    return None


# Monthly rent per square metre for every listing.
rent['每平米月租金'] = rent.apply(_rent_per_sqm, axis=1)

# 2. Median rent per square metre for each (city, community).
rent_by_community = (
    rent.groupby(['城市', '小区名称'])['每平米月租金']
    .median()
    .rename('每平米月租金中位数')
    .reset_index()
)

# 3. Mean coordinates of each community's listings, used later for nearest-neighbour matching.
community_coords_rent = (
    rent.groupby(['城市', '小区名称'])[['lon', 'lat']]
    .mean()
    .reset_index()
)

# One row per community: median rent plus coordinates.
rent_features = rent_by_community.merge(
    community_coords_rent,
    on=['城市', '小区名称'],
    how='left'
)

# 4. Impute communities without a rent value.
# City-level median of the community medians.
city_rent_medians = (
    rent_features.groupby('城市')['每平米月租金中位数']
    .median()
    .rename('城市每平米月租金中位数')
    .reset_index()
)

# Global median of the community medians.
global_rent_median = rent_features['每平米月租金中位数'].median()

# Fill from the city median first, then from the global median.
rent_features = rent_features.merge(city_rent_medians, on='城市', how='left')
rent_features['每平米月租金中位数'] = (
    rent_features['每平米月租金中位数']
    .fillna(rent_features['城市每平米月租金中位数'])
    .fillna(global_rent_median)
)

# Keep only the columns the matcher needs, in a fixed order.
rent_features = rent_features[['城市', '小区名称', 'lon', 'lat', '每平米月租金中位数']]

# 5. Merge the community-level feature onto the modelling frames
def match_community_features(df, community_features, community_column):
    """
    Attach one community-level feature to every row of `df`.

    Matching strategy, applied in order until the value is filled:
      1. exact match on (城市, 小区名称);
      2. nearest community of the same city by squared lon/lat distance
         (only for rows that have both coordinates);
      3. median of `community_column` over the city's communities;
      4. median of `community_column` over all communities.
    The medians in steps 3-4 are computed from `community_features` itself, so
    the function works for any numeric feature column and does not depend on
    notebook-level variables.

    Parameters:
    df - frame to enrich; needs the columns 城市, 小区名称, lon and lat
    community_features - one row per community with 城市, 小区名称, lon, lat
                         and `community_column`
    community_column - name of the numeric feature column to attach

    Returns:
    A new frame with the same rows, row order and index as `df`, plus
    `community_column`. `df` itself is not modified.
    """
    key_cols = ['城市', '小区名称']

    # 1. Exact match. Duplicate keys on the right side would multiply rows of
    #    `df`, so keep one row per community. pd.merge discards the left index;
    #    restoring it keeps later index-aligned operations (column assignment
    #    from other frames, .loc[y.index]) working.
    lookup = community_features[key_cols + [community_column]].drop_duplicates(subset=key_cols)
    matched_df = pd.merge(df, lookup, on=key_cols, how='left')
    matched_df.index = df.index

    column_pos = matched_df.columns.get_loc(community_column)

    # 2. Nearest same-city community for rows that are still empty.
    reference = community_features.dropna(subset=[community_column, 'lon', 'lat'])
    row_city = matched_df['城市'].to_numpy()
    row_lon = matched_df['lon'].to_numpy(dtype=float)
    row_lat = matched_df['lat'].to_numpy(dtype=float)

    # Rows without coordinates cannot be matched by distance; they fall
    # through to the median steps below.
    needs_neighbor = (
        matched_df[community_column].isna().to_numpy()
        & ~np.isnan(row_lon)
        & ~np.isnan(row_lat)
    )

    if needs_neighbor.any():
        # Group the reference communities once per city instead of
        # re-filtering the whole table for every unmatched row.
        for city, city_reference in reference.groupby('城市'):
            positions = np.flatnonzero(needs_neighbor & (row_city == city))
            if positions.size == 0:
                continue

            ref_lon = city_reference['lon'].to_numpy(dtype=float)
            ref_lat = city_reference['lat'].to_numpy(dtype=float)
            ref_values = city_reference[community_column].to_numpy()

            # Squared distance has the same argmin as the Euclidean distance.
            nearest_values = [
                ref_values[np.argmin((ref_lon - row_lon[pos]) ** 2 + (ref_lat - row_lat[pos]) ** 2)]
                for pos in positions
            ]
            matched_df.iloc[positions, column_pos] = nearest_values

    # 3. City-level median for anything still missing.
    still_missing = matched_df[community_column].isna().to_numpy()
    if still_missing.any():
        city_medians = community_features.groupby('城市')[community_column].median()
        city_fill = matched_df['城市'].map(city_medians).to_numpy()
        matched_df.iloc[np.flatnonzero(still_missing), column_pos] = city_fill[still_missing]

    # 4. Global median as the last resort (e.g. cities absent from community_features).
    global_median = community_features[community_column].median()
    matched_df[community_column] = matched_df[community_column].fillna(global_median)

    return matched_df

# 6. Attach the community rent feature to the train, test and predict frames.
print("开始处理训练集...")
X_train_cleaned = match_community_features(
    df=X_train_cleaned,
    community_features=rent_features,
    community_column='每平米月租金中位数',
)

print("开始处理测试集...")
X_test_cleaned = match_community_features(
    df=X_test_cleaned,
    community_features=rent_features,
    community_column='每平米月租金中位数',
)

print("开始处理预测集...")
X_predict_cleaned = match_community_features(
    df=X_predict_cleaned,
    community_features=rent_features,
    community_column='每平米月租金中位数',
)


In [35]:
# Sanity check: feature matching must not add or drop rows in any dataset.
# The labels previously said X_predict while the training frames were printed;
# every dataset is now reported under its own name.
for label, raw_frame, cleaned_frame in [
    ("X_train", X_train, X_train_cleaned),
    ("X_test", X_test, X_test_cleaned),
    ("X_predict", X_predict, X_predict_cleaned),
]:
    print(f"原始{label}行数: {len(raw_frame)}")
    print(f"处理后{label}_cleaned行数: {len(cleaned_frame)}")

In [36]:
# Time features derived from the transaction date ('交易时间') and the previous
# transaction date ('上次交易').

# Columns copied from the temporary time-feature frames onto the cleaned frames
# (year, cyclic encodings and the has-previous-transaction flag).
TIME_FEATURE_COLUMNS = [
    '交易年份', '交易月份_sin', '交易月份_cos', '交易季度_sin', '交易季度_cos',
    '有上次交易', '上次交易年份', '上次交易月份_sin', '上次交易月份_cos',
    '上次交易季度_sin', '上次交易季度_cos'
]


def create_cyclic_features(df, column, period):
    """Add `<column>_sin` and `<column>_cos` to `df` in place.

    The value in `column` is mapped onto a circle of length `period`, so the
    first and last values of a cycle end up next to each other. Missing values
    stay missing in both new columns.
    """
    angle = 2 * np.pi * df[column] / period
    df[f'{column}_sin'] = np.sin(angle)
    df[f'{column}_cos'] = np.cos(angle)


def build_time_features(raw_df):
    """Build the time-feature frame for one dataset.

    Side effect: the '交易时间' and '上次交易' columns of `raw_df` are converted
    to datetime in place (unparseable values become NaT).

    Returns a new frame indexed like `raw_df` with the year / month / quarter
    of both dates, their cyclic encodings and the '有上次交易' flag.
    """
    for date_col in ('交易时间', '上次交易'):
        raw_df[date_col] = pd.to_datetime(raw_df[date_col], errors='coerce')

    trade_date = raw_df['交易时间']
    previous_date = raw_df['上次交易']
    features = pd.DataFrame(index=raw_df.index)

    # Current transaction: calendar parts plus cyclic month / quarter.
    features['交易年份'] = trade_date.dt.year
    features['交易月份'] = trade_date.dt.month
    features['交易季度'] = trade_date.dt.quarter
    create_cyclic_features(features, '交易月份', 12)
    create_cyclic_features(features, '交易季度', 4)

    # Previous transaction: flag first, then the same calendar parts.
    features['有上次交易'] = previous_date.notnull().astype(int)
    features['上次交易年份'] = previous_date.dt.year.fillna(0)  # 0 = no previous transaction
    features['上次交易月份'] = previous_date.dt.month
    features['上次交易季度'] = previous_date.dt.quarter

    # Encode BEFORE filling: a month/quarter of 0 would map to sin=0, cos=1,
    # which is exactly the encoding of December / Q4. Rows without a previous
    # transaction instead get (0, 0), a point no real month or quarter maps to.
    create_cyclic_features(features, '上次交易月份', 12)
    create_cyclic_features(features, '上次交易季度', 4)
    previous_cols = [
        '上次交易月份', '上次交易季度',
        '上次交易月份_sin', '上次交易月份_cos',
        '上次交易季度_sin', '上次交易季度_cos',
    ]
    features[previous_cols] = features[previous_cols].fillna(0)

    return features


# Temporary frames holding the time features of each dataset.
df_train_time = build_time_features(X_train)
df_test_time = build_time_features(X_test)
df_predict_time = build_time_features(X_predict)

# Copy the modelling-relevant columns onto the cleaned frames.
# NOTE(review): this assignment aligns rows on the index - it only fills values
# where X_*_cleaned still carries the same index as X_*; confirm upstream.
X_train_cleaned[TIME_FEATURE_COLUMNS] = df_train_time[TIME_FEATURE_COLUMNS]
X_test_cleaned[TIME_FEATURE_COLUMNS] = df_test_time[TIME_FEATURE_COLUMNS]
X_predict_cleaned[TIME_FEATURE_COLUMNS] = df_predict_time[TIME_FEATURE_COLUMNS]


# Inspect the result.
print(X_train_cleaned.head())
print(X_test_cleaned.head())

In [37]:
# Per-column missing-value counts for the train, test and predict frames.
for frame in (X_train_cleaned, X_test_cleaned, X_predict_cleaned):
    print(frame.isnull().sum())

In [38]:
# Repair missing values in the current / previous transaction time features.
# The repair is applied to ALL three datasets; previously only the training
# set was filled while the test and predict sets were merely reported on.

previous_trade_columns = ['上次交易年份', '上次交易月份_sin', '上次交易月份_cos', '上次交易季度_sin', '上次交易季度_cos']
current_trade_columns = ['交易年份', '交易月份_sin', '交易月份_cos', '交易季度_sin', '交易季度_cos']
report_columns = ['有上次交易', '上次交易年份', '上次交易月份_sin', '交易年份', '交易月份_sin', '交易季度_sin']

# The fill value for the transaction year is learned from the training set
# only, so test / predict rows receive the same default as training rows.
median_year = X_train_cleaned['交易年份'].median()

for frame in (X_train_cleaned, X_test_cleaned, X_predict_cleaned):
    # 1. Previous-transaction features: 0 marks "no previous transaction".
    for col in previous_trade_columns:
        frame[col] = frame[col].fillna(0)

    # 2. Current-transaction features.
    for col in current_trade_columns:
        if col == '交易年份':
            default = median_year
        elif col.endswith('_cos'):
            # cos(0) = 1, paired with sin(0) = 0 below.
            default = 1
        else:
            # sin(0) = 0
            default = 0
        frame[col] = frame[col].fillna(default)

    # 3. The has-previous-transaction flag must not contain gaps.
    frame['有上次交易'] = frame['有上次交易'].fillna(0)


# 4. Check the repaired columns.
print("修复后训练集缺失值:")
print(X_train_cleaned[report_columns].isnull().sum())
print("\n修复后测试集缺失值:")
print(X_test_cleaned[report_columns].isnull().sum())
print("\n修复后预测集缺失值:")
print(X_predict_cleaned[report_columns].isnull().sum())

# 5. Overall missing-value totals.
print("\n各数据集整体缺失值情况：")
print(f"训练集缺失值总数: {X_train_cleaned.isnull().sum().sum()}")
print(f"测试集缺失值总数: {X_test_cleaned.isnull().sum().sum()}")
print(f"预测集缺失值总数: {X_predict_cleaned.isnull().sum().sum()}")

In [39]:
# 1. One-hot encode the categorical variables
categorical_columns = ['城市', '区域', '板块']


def _one_hot_with_levels(frame, column, levels):
    """Replace `column` of `frame` by dummy columns for the given `levels`.

    The first level is dropped as the baseline. Values that are not among
    `levels` (or are missing) produce an all-zero row. Returns a new frame.
    """
    as_category = frame[column].astype(pd.CategoricalDtype(categories=levels))
    dummies = pd.get_dummies(as_category, prefix=column, drop_first=True)
    return pd.concat([frame.drop(columns=[column]), dummies], axis=1)


for col in categorical_columns:
    # The levels - and therefore the dropped baseline level - are learned from
    # the training set and reused for every dataset. Encoding each dataset on
    # its own with drop_first=True drops whichever level happens to sort first
    # in THAT dataset, so the same city/plate could be the baseline in one set
    # and a regular dummy in another.
    train_levels = pd.Categorical(X_train_cleaned[col]).categories

    X_train_cleaned = _one_hot_with_levels(X_train_cleaned, col, train_levels)
    X_test_cleaned = _one_hot_with_levels(X_test_cleaned, col, train_levels)
    X_predict_cleaned = _one_hot_with_levels(X_predict_cleaned, col, train_levels)


# Make sure test / predict have exactly the training columns, in the same
# order (columns missing from them are filled with 0).
X_test_cleaned = X_test_cleaned.reindex(columns=X_train_cleaned.columns, fill_value=0)
X_predict_cleaned = X_predict_cleaned.reindex(columns=X_train_cleaned.columns, fill_value=0)

# Convert every boolean column to integers.
bool_columns = X_train_cleaned.select_dtypes(include=['bool']).columns
for df in [X_train_cleaned, X_test_cleaned, X_predict_cleaned]:
    df[bool_columns] = df[bool_columns].astype(int)

# Drop the community-name column.
X_train_cleaned = X_train_cleaned.drop(columns=['小区名称'])
X_test_cleaned = X_test_cleaned.drop(columns=['小区名称'])
X_predict_cleaned = X_predict_cleaned.drop(columns=['小区名称'])

# Inspect the resulting frames.
print(X_train_cleaned.head())
print(X_test_cleaned.head())
print(X_predict_cleaned.head())

In [40]:
# Interaction terms and hand-crafted domain features.

# Report the column count and the unique column count; they differ when
# duplicate column names exist.
print("当前数据集中的列数:", len(X_train_cleaned.columns))
print("当前数据集中的唯一列数:", len(set(X_train_cleaned.columns)))

# Work on copies of the frames (per the original note, to avoid performance
# warnings). The *_no_dupes names are kept for backward compatibility.
X_train_no_dupes = X_train_cleaned.copy()
X_test_no_dupes = X_test_cleaned.copy()
X_predict_no_dupes = X_predict_cleaned.copy()

X_train_cleaned = X_train_no_dupes
X_test_cleaned = X_test_no_dupes
X_predict_cleaned = X_predict_no_dupes

print("处理后的数据集中的列数:", len(X_train_cleaned.columns))

# Candidate numeric features for the interaction terms.
key_features = ['建筑面积', '套内面积', '室', '厅', '卫', '楼龄', '梯户比例',
                '楼层数值', '总层数', '每平米月租金中位数', 'lon', 'lat']

# Keep only the candidates that actually exist.
key_features = [f for f in key_features if f in X_train_cleaned.columns]

# Force the candidates to numeric and impute gaps. The fill value is the
# TRAINING median for all three datasets, consistent with how the polynomial
# transformer below is fitted on the training set only.
for feature in key_features:
    for frame in [X_train_cleaned, X_test_cleaned, X_predict_cleaned]:
        frame[feature] = pd.to_numeric(frame[feature], errors='coerce')
    train_median = X_train_cleaned[feature].median()
    for frame in [X_train_cleaned, X_test_cleaned, X_predict_cleaned]:
        frame[feature] = frame[feature].fillna(train_median)

if len(key_features) >= 2:
    # Cap the number of inputs so the number of generated terms stays small.
    if len(key_features) > 4:
        # The first four candidates are used as an example selection.
        key_features = key_features[:4]

    print(f"使用的关键特征: {key_features}")

    X_key_features_train = X_train_cleaned[key_features]
    X_key_features_test = X_test_cleaned[key_features]
    X_key_features_predict = X_predict_cleaned[key_features]

    # Degree-2 terms without squares and without the bias column.
    from sklearn.preprocessing import PolynomialFeatures
    poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

    # Fit on train, apply the same transformation to test / predict.
    poly_features_train = poly.fit_transform(X_key_features_train)
    poly_features_test = poly.transform(X_key_features_test)
    poly_features_predict = poly.transform(X_key_features_predict)

    # Prefix the generated names so they can be recognised later.
    feature_names = poly.get_feature_names_out(key_features)
    poly_feature_names = [f'poly_{name}' for name in feature_names]

    poly_features_train_df = pd.DataFrame(poly_features_train, columns=poly_feature_names, index=X_train_cleaned.index)
    poly_features_test_df = pd.DataFrame(poly_features_test, columns=poly_feature_names, index=X_test_cleaned.index)
    poly_features_predict_df = pd.DataFrame(poly_features_predict, columns=poly_feature_names, index=X_predict_cleaned.index)

    # Append the generated columns.
    X_train_cleaned = pd.concat([X_train_cleaned, poly_features_train_df], axis=1)
    X_test_cleaned = pd.concat([X_test_cleaned, poly_features_test_df], axis=1)
    X_predict_cleaned = pd.concat([X_predict_cleaned, poly_features_predict_df], axis=1)

    print(f"添加了{poly_features_train_df.shape[1]}个多项式和交互特征")


def _add_ratio_feature(frames, name, numerator_col, denominator_col, clip_range=None):
    """Add `name` = numerator / denominator to every frame in `frames`, in place.

    Zero denominators and infinite results become NaN, the ratio is optionally
    clipped to `clip_range`, and remaining gaps are filled with the median of
    the FIRST frame (the training set). All ratios are computed before any
    frame is modified, so a failure leaves every frame untouched.
    """
    ratios = []
    for frame in frames:
        ratio = frame[numerator_col] / frame[denominator_col].replace(0, np.nan)
        ratio = ratio.replace([np.inf, -np.inf], np.nan)
        if clip_range is not None:
            ratio = ratio.clip(*clip_range)
        ratios.append(ratio)

    fill_value = ratios[0].median()
    # Assign the finished Series back to the frame; chained
    # `frame[col].fillna(..., inplace=True)` may operate on a temporary copy
    # and silently leave the frame unchanged.
    for frame, ratio in zip(frames, ratios):
        frame[name] = ratio.fillna(fill_value)


def _add_product_feature(frames, name, left_col, right_col):
    """Add `name` = left_col * right_col to every frame in `frames`, in place."""
    products = [frame[left_col] * frame[right_col] for frame in frames]
    for frame, product in zip(frames, products):
        frame[name] = product


def _try_add_feature(label, build):
    """Run `build()` as a best-effort step and report the outcome."""
    try:
        build()
        print(f"成功添加：{label}")
    except Exception as e:
        print(f"添加{label}时出错: {e}")


# Domain features. The frame list is built here because the concat above may
# have rebound the three names to new objects.
model_frames = [X_train_cleaned, X_test_cleaned, X_predict_cleaned]
train_columns = set(X_train_cleaned.columns)

# Estimated rent: floor area times the community's median rent per square metre.
if {'建筑面积', '每平米月租金中位数'} <= train_columns:
    _try_add_feature('租金估值', lambda: _add_product_feature(
        model_frames, '租金估值', '建筑面积', '每平米月租金中位数'))

# Room density: bedrooms per unit of floor area.
if {'室', '建筑面积'} <= train_columns:
    _try_add_feature('房间密度', lambda: _add_ratio_feature(
        model_frames, '房间密度', '室', '建筑面积'))

# Relative floor: floor number divided by the total number of floors.
if {'楼层数值', '总层数'} <= train_columns:
    _try_add_feature('相对楼层', lambda: _add_ratio_feature(
        model_frames, '相对楼层', '楼层数值', '总层数'))

# Usable-area ratio: inner area over gross floor area, limited to [0, 1].
if {'套内面积', '建筑面积'} <= train_columns:
    _try_add_feature('得房率', lambda: _add_ratio_feature(
        model_frames, '得房率', '套内面积', '建筑面积', clip_range=(0, 1)))

print("所有特征处理完成")

In [41]:
# Index of the training labels, used as the reference row order.
original_indices = y_train.index

# When every label index is present in X_train_cleaned (possibly in another
# order), reorder the features to follow the labels.
all_labels_present = set(original_indices) <= set(X_train_cleaned.index)
if not all_labels_present:
    # Report how many label indices have no matching feature row.
    missing_indices = set(original_indices).difference(X_train_cleaned.index)
    print(f"有{len(missing_indices)}个在y_train中的索引在X_train_cleaned中丢失")
else:
    X_train_cleaned = X_train_cleaned.loc[original_indices]
    print("已重新对齐X_train_cleaned到y_train的索引")

In [42]:
# Row counts of the three feature frames, then of the two label series.
for dataset in (X_train_cleaned, X_test_cleaned, X_predict_cleaned):
    print(len(dataset))

for labels in (y_train, y_test):
    print(len(labels))


In [43]:
# Inspect the test-set indices: duplicates first.
features_have_duplicates = X_test_cleaned.index.duplicated().any()
labels_have_duplicates = y_test.index.duplicated().any()
print(f"X_test_cleaned 索引是否有重复: {features_have_duplicates}")
print(f"y_test 索引是否有重复: {labels_have_duplicates}")

# Overlap between the feature index and the label index.
feature_index_set = set(X_test_cleaned.index)
label_index_set = set(y_test.index)
print(f"X_test_cleaned 和 y_test 索引相同的数量: {len(feature_index_set & label_index_set)}")
print(f"X_test_cleaned 中多出的索引数量: {len(feature_index_set - label_index_set)}")

# Reorder the features to follow y_test when every label index is available;
# otherwise restrict both to the indices they have in common.
if not label_index_set.issubset(feature_index_set):
    common_indices_test = X_test_cleaned.index.intersection(y_test.index)
    X_test_cleaned = X_test_cleaned.loc[common_indices_test]
    y_test = y_test.loc[common_indices_test]
    print(f"使用交集重新对齐后，X_test_cleaned: {len(X_test_cleaned)}, y_test: {len(y_test)}")
else:
    X_test_cleaned = X_test_cleaned.loc[y_test.index]
    print("已重新对齐 X_test_cleaned 到 y_test 的索引")

In [44]:
# After cleaning and feature engineering: report sample sizes and the total
# number of missing values per dataset.
labelled_frames = [
    ("训练集", X_train_cleaned),
    ("测试集", X_test_cleaned),
    ("预测集", X_predict_cleaned),
]

print("数据处理后的样本量统计：")
for label, frame in labelled_frames:
    print(f"处理后{label}样本量: {len(frame)}")

# Total missing cells per dataset.
print("\n各数据集缺失值情况：")
for label, frame in labelled_frames:
    print(f"{label}缺失值: {frame.isnull().sum().sum()}")

In [45]:
# 1. Quantitative base columns to standardise.
quantitative_columns = [
    'lon', 'lat', '年份', '建筑面积', '套内面积', '室', '厅', '厨', '卫',
    '楼层数值', '总层数', '梯户比例', '交易年份', '交易月份_sin', '交易月份_cos',
    '交易季度_sin', '交易季度_cos', '上次交易年份', '上次交易月份_sin',
    '上次交易月份_cos', '上次交易季度_sin', '上次交易季度_cos'
]

# 2. Hand-crafted domain features, restricted to the ones that exist.
domain_features = [
    col for col in ['租金估值', '房间密度', '相对楼层', '得房率']
    if col in X_train_cleaned.columns
]

# 3. Generated interaction / polynomial features (recognised by their prefix).
poly_features = [col for col in X_train_cleaned.columns if col.startswith('poly_')]

# 4. Word2Vec features: columns that start with one of the text-field names
#    and contain the '_word2vec_' marker.
word2vec_prefixes = ('核心卖点', '户型介绍', '周边配套', '交通出行')
word2vec_cols = [
    col for col in X_train_cleaned.columns
    if '_word2vec_' in col and col.startswith(word2vec_prefixes)
]

# 5. Everything that gets standardised.
numeric_cols = [col for col in quantitative_columns if col in X_train_cleaned.columns]
cols_to_standardize = numeric_cols + domain_features + poly_features + word2vec_cols

print(f"将要标准化的特征数量: {len(cols_to_standardize)}")

# 6. Fit the scaler on the training set only.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train_cleaned[cols_to_standardize])

# 7. Standardised copies of the three datasets; the *_cleaned frames stay untouched.
X_train_standardized = X_train_cleaned.copy()
X_train_standardized[cols_to_standardize] = scaler.transform(X_train_cleaned[cols_to_standardize])

X_test_standardized = X_test_cleaned.copy()
X_test_standardized[cols_to_standardize] = scaler.transform(X_test_cleaned[cols_to_standardize])

X_predict_standardized = X_predict_cleaned.copy()
X_predict_standardized[cols_to_standardize] = scaler.transform(X_predict_cleaned[cols_to_standardize])

# 8. Show the first five standardised training rows.
print("标准化后的训练集前五行:")
print(X_train_standardized[cols_to_standardize].head())

In [46]:
# Persist the cleaned (not standardised) feature frames for later use.
feature_exports = [
    (X_train_cleaned, '/home/mw/temp/X_train_cleaned.csv'),
    (X_test_cleaned, '/home/mw/temp/X_test_cleaned.csv'),
    (X_predict_cleaned, '/home/mw/temp/X_predict_cleaned.csv'),
]
for frame, csv_path in feature_exports:
    frame.to_csv(csv_path, index=False, encoding='utf-8-sig')

print("\n数据已成功保存到 /home/mw/temp/ 目录下")

# Persist the matching label series.
label_exports = [
    (y_train, '/home/mw/temp/y_train.csv'),
    (y_test, '/home/mw/temp/y_test.csv'),
]
for labels, csv_path in label_exports:
    labels.to_csv(csv_path, index=False)

print("标签数据也已保存")