## 欢迎进入 Notebook  

这里你可以编写代码，文档  

### 关于文件目录  


**project**：project 目录是本项目的工作空间，可以把与项目运行有关的所有文件放在这里，目录中文件的增、删、改操作都会被保留  


**input**：input 目录是数据集的挂载位置，所有挂载进项目的数据集都在这里，未挂载数据集时 input 目录被隐藏  


**temp**：temp 目录是临时磁盘空间，训练或分析过程中产生的不必要文件可以存放在这里，目录中的文件不会保存  


In [1]:
# List the files in the personal persistent workspace
!ls /home/mw/project/

processed_file.csv  submission.csv


In [2]:
# List the currently mounted dataset directories
!ls /home/mw/input/

quant4533


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import re
import warnings

# NOTE(review): this silences every warning for the rest of the session,
# which also hides pandas chained-assignment and scikit-learn convergence
# warnings raised by the cells below.
warnings.filterwarnings('ignore')

In [4]:
# Load the three CSV files of the mounted dataset: train, test and details.
train_data, test_data, detail_data = map(pd.read_csv, (
    '/home/mw/input/quant4533/ruc_Class25Q1_train.csv',
    '/home/mw/input/quant4533/ruc_Class25Q1_test.csv',
    '/home/mw/input/quant4533/ruc_Class25Q1_details.csv',
))

In [5]:
# Keep only the detail columns used downstream. `.copy()` makes the result an
# independent frame, so the column assignments below never write through a
# slice of the raw table.
detail_data = detail_data[['名称', '城市', '板块', '建筑年代', '房屋总数', '楼栋总数',
                           '物业公司', '供水', '供暖', '供电', '停车位']].copy()

# 1. 建筑年代: the first 4 characters are parsed as a number; text that is
#    not numeric becomes NaN instead of raising.
detail_data['建筑年代'] = pd.to_numeric(
    detail_data['建筑年代'].str[:4], errors='coerce'
).astype(float)

# 2. 房屋总数 / 楼栋总数: drop the trailing character, then parse as a number
#    (unparsable text becomes NaN).
for col in ['房屋总数', '楼栋总数']:
    detail_data[col] = pd.to_numeric(
        detail_data[col].str[:-1], errors='coerce'
    ).astype(float)

# Ratio 房屋总数 / 楼栋总数. A zero denominator is treated as missing so the
# ratio is NaN (imputed below) rather than +/-inf.
detail_data['房屋楼栋比'] = (
    detail_data['房屋总数'] / detail_data['楼栋总数'].replace(0, np.nan)
)

# 3. 物业公司: binary flag (0 for "无物业管理服务" or missing, 1 otherwise)
detail_data['物业公司'] = np.where(
    (detail_data['物业公司'] == "无物业管理服务") | detail_data['物业公司'].isna(),
    0,
    1
)

# 4. 供水: map the categories to numbers; unlisted values become NaN
detail_data['供水'] = detail_data['供水'].map({
    '民水': 0,
    '商水': 1,
    '商水/民水': 0.5
})

# 5. 供电: map the categories to numbers; unlisted values become NaN
detail_data['供电'] = detail_data['供电'].map({
    '民电': 0,
    '商电': 1,
    '商电/民电': 0.5
})

# 6. 供暖: 0 if the text contains "无供暖", 1 otherwise (missing counts as 1)
detail_data['供暖'] = np.where(
    detail_data['供暖'].str.contains('无供暖', na=False),
    0,
    1
)

# 7. Impute every column that still has missing values (this includes 停车位):
#    numeric columns get their mean, other columns their most frequent value.
for col in detail_data.columns:
    if not detail_data[col].isna().any():
        continue
    if detail_data[col].dtype.kind in 'biufc':  # numeric dtypes
        detail_data[col] = detail_data[col].fillna(detail_data[col].mean())
    else:
        modes = detail_data[col].mode()
        # mode() is empty when the whole column is missing; leave it as is.
        if not modes.empty:
            detail_data[col] = detail_data[col].fillna(modes.iloc[0])

In [6]:
# Preview the first rows of the processed detail table
detail_data.head()

Unnamed: 0,名称,城市,板块,建筑年代,房屋总数,楼栋总数,物业公司,供水,供暖,供电,停车位,房屋楼栋比
0,三峡医专学苑新村,2,9.0,1999.450549,458.0,15.0,0,0.098352,1,0.101024,556.867636,30.533333
1,宁静苑,2,9.0,1999.450549,80.0,4.0,0,0.098352,1,0.101024,556.867636,20.0
2,电信小区,2,9.0,1999.450549,102.0,3.0,0,0.098352,1,0.101024,556.867636,34.0
3,三元街甲19号院,0,470.0,1970.0,103.0,3.0,1,0.0,1,0.0,36.0,34.333333
4,东交民巷11号院,0,126.0,1975.0,222.0,4.0,1,0.0,1,0.0,556.867636,55.5


In [7]:
# Chinese digits 0-9; "十" (ten) is handled positionally in _cn_to_num.
_CN_DIGITS = {
    '零': 0, '一': 1, '二': 2, '两': 2, '三': 3, '四': 4,
    '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
}
# Two layout formats: "X室Y厅Z厨W卫" and "X房间Y卫".
_HOUSE_TYPE_RE = re.compile(r'(\d+)室(\d+)厅?(\d*)厨?(\d+)卫|(\d+)房间(\d+)卫')
# "<level> (共N层)": floor level text followed by the total number of floors.
_FLOOR_RE = re.compile(r'(.*?)\s*\(共(\d+)层\)')
# "<elevators>梯<households>户" written with Chinese numerals.
_LADDER_RE = re.compile(r'([零一二两三四五六七八九十]+)梯([零一二两三四五六七八九十]+)户')


def _clean_area(area):
    """Convert an area value (number or text with a unit) to float, NaN if unparsable."""
    if pd.isna(area):
        return np.nan
    if isinstance(area, str):
        # Strip everything except digits and the decimal point.
        cleaned = re.sub(r'[^\d.]', '', area)
        try:
            return float(cleaned)
        except ValueError:
            return np.nan
    return float(area)


def _parse_house_type(house_type):
    """Return (bedrooms, living rooms, kitchens, bathrooms); all NaN when unparsable."""
    missing = (np.nan, np.nan, np.nan, np.nan)
    if not isinstance(house_type, str):
        return missing
    match = _HOUSE_TYPE_RE.match(house_type)
    if match is None:
        return missing
    if match.group(1) is not None:
        # "X室Y厅Z厨W卫": the kitchen count is optional and defaults to 0.
        bedrooms = int(match.group(1))
        living_rooms = int(match.group(2)) if match.group(2) else 0
        kitchens = int(match.group(3)) if match.group(3) else 0
        bathrooms = int(match.group(4))
        return bedrooms, living_rooms, kitchens, bathrooms
    # "X房间Y卫": one living room and one kitchen are assumed.
    return int(match.group(5)), 1, 1, int(match.group(6))


def _extract_floor_info(floor):
    """Return (floor level text, total floors); (None, NaN) when unparsable."""
    if not isinstance(floor, str):
        return None, np.nan
    match = _FLOOR_RE.match(floor)
    if match is None:
        return None, np.nan
    return match.group(1).strip(), int(match.group(2))


def _cn_to_num(cn):
    """Convert a Chinese numeral between 0 and 99 to int; None when not understood."""
    if cn in _CN_DIGITS:
        return _CN_DIGITS[cn]
    if cn.count('十') != 1:
        return None
    tens_part, _, ones_part = cn.partition('十')
    # A bare "十" prefix means 1 x 10 ("十一" = 11); a bare suffix means +0.
    tens = _CN_DIGITS.get(tens_part) if tens_part else 1
    ones = _CN_DIGITS.get(ones_part) if ones_part else 0
    if tens is None or ones is None:
        return None
    return tens * 10 + ones


def _parse_ladder_ratio(ratio_str):
    """Return elevators / households rounded to 2 decimals, or 0.0 when unparsable."""
    if pd.isna(ratio_str):
        return 0.0
    match = _LADDER_RE.search(str(ratio_str))
    if match is None:
        return 0.0
    ladder = _cn_to_num(match.group(1))
    household = _cn_to_num(match.group(2))
    if ladder is None or household is None or household == 0:
        return 0.0
    return round(ladder / household, 2)


def _add_keyword_flags(df, column, prefix, keywords):
    """Normalise a free-text column and add one 0/1 column per keyword it may contain."""
    df[column] = df[column].fillna('').str.strip().str.replace('　', '')
    for keyword in keywords:
        df[f'{prefix}_{keyword}'] = (
            df[column].str.contains(keyword, case=False, na=False).astype(int)
        )


def clean_data(df):
    """Clean a raw listings frame and add the engineered model features.

    The frame is modified in place and also returned. Every step is skipped
    when its source column is absent, so partial frames are accepted.

    NOTE(review): all fill values (medians/means) and the lon/lat scaler are
    computed from ``df`` itself, so calling this separately on the training
    and the test frame gives each of them its own statistics.

    Args:
        df: pandas DataFrame of listings.

    Returns:
        The same DataFrame object, with cleaned and added columns.
    """
    # Area columns: parse to float; only 建筑面积 is median-filled.
    for col in ('建筑面积', '套内面积'):
        if col not in df.columns:
            continue
        df[col] = df[col].apply(_clean_area)
        if col == '建筑面积' and not df[col].isna().all():
            # Assign back instead of chained in-place fillna, which does not
            # update the frame under pandas copy-on-write.
            df[col] = df[col].fillna(df[col].median())

    # Room counts parsed from the layout text; unparsable layouts count as 0.
    if '房屋户型' in df.columns:
        room_columns = ['卧室数量', '客厅数量', '厨房数量', '卫生间数量']
        parsed_rooms = pd.DataFrame(
            [_parse_house_type(value) for value in df['房屋户型']],
            index=df.index,
            columns=room_columns,
            dtype=float,
        )
        df[room_columns] = parsed_rooms.fillna(0)

    # Floor level (ordinal) and total number of floors.
    if '所在楼层' in df.columns:
        floor_info = [_extract_floor_info(value) for value in df['所在楼层']]
        floor_level = pd.Series(
            [level for level, _ in floor_info], index=df.index, dtype=object
        )
        total_floors = pd.Series(
            [total for _, total in floor_info], index=df.index, dtype=float
        )

        floor_level_mapping = {
            '低楼层': 1,
            '中楼层': 2,
            '高楼层': 3,
            '底层': 0,
            '顶层': 4
        }
        # Unknown levels default to 2 (the value used for 中楼层).
        df['楼层高低程度'] = floor_level.map(floor_level_mapping).fillna(2).astype(int)
        df['总层数'] = total_floors.fillna(total_floors.median()).astype(int)

    # Ring-road position as a number; unknown values get the median.
    if '环线' in df.columns:
        df['环线'] = df['环线'].astype(str).str.replace(r'[\s　\u200b]+', '', regex=True).str.strip()
        ring_mapping = {
            '一环内': 1, '二环内': 2, '二至三环': 2.5,
            '三环外': 3, '三至四环': 3.5,
            '四环外': 4, '四至五环': 4.5,
            '五环': 5, '五至六环': 5.5,
            '六环外': 6, '内环内': 1, '外环外': 7,
            '内环至外环': 2, '内环至中环': 4, '中环至外环': 6,
            '一至二环': 1.5
        }
        df['环线_调整'] = df['环线'].map(ring_mapping)
        df['环线_调整'] = df['环线_调整'].fillna(df['环线_调整'].median())

    # One indicator column per city code (codes 0-6 are assumed).
    if '城市' in df.columns:
        for city_code in range(7):
            df[f'城市_{city_code}'] = (df['城市'] == city_code).astype(int)

    # Keyword indicators from the free-text columns.
    if '交通出行' in df.columns:
        _add_keyword_flags(df, '交通出行', '交通', ['公交', '地铁'])
    if '周边配套' in df.columns:
        _add_keyword_flags(df, '周边配套', '配套',
                           ['医院', '公园', '超市', '商场', '银行', '幼儿园', '学校'])
    if '房屋朝向' in df.columns:
        _add_keyword_flags(df, '房屋朝向', '朝向', ['东', '南', '西', '北'])

    # 使用面积: 套内面积 when present, otherwise 建筑面积.
    if '套内面积' in df.columns and '建筑面积' in df.columns:
        df['使用面积'] = np.where(
            df['套内面积'].notna(),
            df['套内面积'],
            df['建筑面积']
        )

    # Building structure as an ordinal rank; values not listed (and the
    # explicit "unknown" label) become NaN and are mean-filled.
    if '建筑结构' in df.columns:
        structure_rank = {
            '钢结构': 6,
            '框架结构': 5,
            '钢混结构': 4,
            '混合结构': 3,
            '砖混结构': 2,
            '砖木结构': 1,
            '未知结构': np.nan,
        }
        df['建筑结构'] = df['建筑结构'].map(structure_rank)
        df['建筑结构'] = df['建筑结构'].fillna(df['建筑结构'].mean())

    # Decoration level as an ordinal rank, mean-filled when unknown.
    if '装修情况' in df.columns:
        decoration_rank = {
            '精装': 4,
            '简装': 3,
            '毛坯': 1,
            '其他': np.nan,
        }
        df['装修情况_调整'] = df['装修情况'].map(decoration_rank)
        df['装修情况_调整'] = df['装修情况_调整'].fillna(df['装修情况_调整'].mean())

    # Ownership duration category as a number; unknown values become 1.
    if '房屋年限' in df.columns:
        year_rank = {
            '满五年': 5,
            '满两年': 2,
            '未满两年': 0
        }
        df['房屋年限'] = df['房屋年限'].map(year_rank)
        df['房屋年限'] = df['房屋年限'].fillna(1)

    # 1 when a villa type is given, 0 when it is missing.
    if '别墅类型' in df.columns:
        df['别墅类型标志'] = np.where(df['别墅类型'].isna(), 0, 1)

    # Standardise longitude/latitude with a scaler fitted on this frame.
    if all(col in df.columns for col in ['lon', 'lat']):
        coords = df[['lon', 'lat']].copy()
        if not coords.isnull().all().all():
            scaled_coords = StandardScaler().fit_transform(coords)
            df['lon'] = scaled_coords[:, 0]
            df['lat'] = scaled_coords[:, 1]

    # Elevator flag; values other than 有/无 become NaN and are mean-filled.
    # The fill lives inside the guard so a frame without 配备电梯 is accepted.
    if '配备电梯' in df.columns:
        ladder_rank = {'有': 1, '无': 0}
        df['配备电梯'] = df['配备电梯'].str.strip().str.replace(' ', '')
        elevator_flag = df['配备电梯'].map(ladder_rank)
        df['配备电梯_d'] = elevator_flag.fillna(elevator_flag.mean())

    # Elevators per household as a float.
    if '梯户比例' in df.columns:
        df['梯户比例'] = df['梯户比例'].apply(_parse_ladder_ratio)

    # Derived features.
    if '建筑面积' in df.columns:
        df['建筑面积平方'] = df['建筑面积'] ** 2

    if all(col in df.columns for col in ['建筑面积', '卧室数量']):
        # The small constant avoids division by zero for 0-bedroom rows.
        df['面积_卧室比'] = df['建筑面积'] / (df['卧室数量'] + 1e-6)

    if all(col in df.columns for col in ['交通_地铁', '配套_商场']):
        df['地铁_商圈交互'] = df['交通_地铁'] * df['配套_商场']

    if all(col in df.columns for col in ['朝向_南', '朝向_北']):
        df['南北通透'] = df['朝向_南'] * df['朝向_北']

    return df

In [8]:
# Run the cleaning / feature-engineering function on the training frame first,
# then on the test frame.
train_data, test_data = (clean_data(frame) for frame in (train_data, test_data))

In [9]:
# Join the listings with the detail table and keep the modelling columns.

# The detail table must have one row per join key: a repeated key would make
# the left joins below emit the same listing several times, which skews
# training and makes the prediction count differ from the test row count.
merge_keys = ['城市', '板块', '名称']
detail_unique = detail_data.drop_duplicates(subset=merge_keys)

# Columns kept for modelling; the training frame additionally keeps 价格.
model_columns = [
    '城市_0', '城市_1', '城市_2', '城市_3', '城市_4', '城市_5', '建筑面积', '卧室数量', '卫生间数量',
    '楼层高低程度', '总层数', '环线_调整', 'lon', 'lat',
    '交通_公交', '交通_地铁', '配套_医院', '配套_公园', '配套_超市', '配套_商场',
    '朝向_东', '朝向_南', '朝向_西', '朝向_北',
    '装修情况_调整', '房屋年限', '别墅类型标志', '配备电梯_d', '梯户比例', '建筑面积平方',
    '建筑年代', '停车位', '房屋楼栋比',
    '面积_卧室比', '地铁_商圈交互', '南北通透'
]


def _attach_details(listings):
    """Left-join listings with the de-duplicated detail rows, keeping every listing once."""
    return pd.merge(listings, detail_unique,
                    left_on=['城市', '板块', '小区名称'],
                    right_on=merge_keys,
                    how='left',
                    validate='many_to_one')


# Training data: target plus features
train_merged = _attach_details(train_data)
train_cleaned = train_merged[['价格'] + model_columns]

# Test data: features only
test_merged = _attach_details(test_data)
test_cleaned = test_merged[model_columns]

In [10]:
# Show the columns kept in the training frame
print(train_cleaned.columns)

Index(['价格', '城市_0', '城市_1', '城市_2', '城市_3', '城市_4', '城市_5', '建筑面积', '卧室数量',
       '卫生间数量', '楼层高低程度', '总层数', '环线_调整', 'lon', 'lat', '交通_公交', '交通_地铁',
       '配套_医院', '配套_公园', '配套_超市', '配套_商场', '朝向_东', '朝向_南', '朝向_西', '朝向_北',
       '装修情况_调整', '房屋年限', '别墅类型标志', '配备电梯_d', '梯户比例', '建筑面积平方', '建筑年代', '停车位',
       '房屋楼栋比', '面积_卧室比', '地铁_商圈交互', '南北通透', '楼层占比'],
      dtype='object')


In [11]:
# Model inputs and regression target. The interaction features
# (面积_卧室比, 地铁_商圈交互, 南北通透) are deliberately left out of the list.
features = [
    # city indicators
    *(f'城市_{code}' for code in range(6)),
    # size and layout
    '建筑面积', '建筑面积平方', '卧室数量', '卫生间数量',
    # location and floor
    '环线_调整', 'lon', 'lat', '楼层高低程度', '总层数',
    # transport and nearby facilities
    '交通_公交', '交通_地铁',
    '配套_医院', '配套_公园', '配套_超市', '配套_商场',
    # orientation
    '朝向_东', '朝向_南', '朝向_西', '朝向_北',
    # building attributes
    '建筑年代', '装修情况_调整', '房屋年限', '配备电梯_d', '梯户比例',
    '停车位', '房屋楼栋比', '别墅类型标志',
]
target = '价格'

In [12]:
# Feature matrix and target, followed by an 80/20 hold-out split (fixed seed).
X, y = train_cleaned[features], train_cleaned['价格']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=111, test_size=0.2
)

# Every selected feature is numeric.
numeric_features = list(X.columns)

# Mean imputation followed by standardisation.
preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Fit the preprocessing on the training part only, then apply it to both parts.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [13]:
# 1. Candidate models, keyed by display name
models = {
    'OLS': LinearRegression(),          # ordinary least squares
    'LASSO': Lasso(alpha=0.1),          # L1-regularised regression
    'Ridge': Ridge(alpha=1.0),          # L2-regularised regression
    'ElasticNet': ElasticNet(alpha=0.8, l1_ratio=0.8)  # combined L1 + L2 regularisation
}

# 2. Evaluation helper
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` behind the shared preprocessing and report its errors.

    Args:
        model: unfitted scikit-learn regressor.
        X_train, y_train: data used for fitting and for cross-validation.
        X_test, y_test: hold-out data used for the out-of-sample metrics.

    Returns:
        dict with ``train_mae``, ``test_mae``, ``train_rmse``, ``test_rmse``,
        ``cv_mae``, ``cv_rmse`` (floats) and ``pipeline`` (the fitted pipeline).
    """
    from sklearn.base import clone
    from sklearn.model_selection import cross_validate

    # Clone the module-level preprocessor so each returned pipeline owns its
    # own imputer/scaler instead of sharing one mutable instance.
    pipeline = Pipeline(steps=[('preprocessor', clone(preprocessor)), ('model', model)])
    pipeline.fit(X_train, y_train)

    y_train_pred = pipeline.predict(X_train)
    y_test_pred = pipeline.predict(X_test)

    # In-sample and hold-out metrics
    train_mae = mean_absolute_error(y_train, y_train_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

    # 6-fold cross-validation on the training part. Both scorers are computed
    # in one pass so each fold is fitted once rather than once per metric.
    cv = KFold(n_splits=6, shuffle=True, random_state=111)
    cv_results = cross_validate(
        pipeline, X_train, y_train, cv=cv,
        scoring=('neg_mean_absolute_error', 'neg_root_mean_squared_error'),
    )
    cv_scores_mae = -cv_results['test_neg_mean_absolute_error']
    cv_scores_rmse = -cv_results['test_neg_root_mean_squared_error']

    return {
        'train_mae': float(train_mae),
        'test_mae': float(test_mae),
        'train_rmse': float(train_rmse),
        'test_rmse': float(test_rmse),
        'cv_mae': float(np.mean(cv_scores_mae)),
        'cv_rmse': float(np.mean(cv_scores_rmse)),
        'pipeline': pipeline
    }

# 3. Fit and score every candidate model, remembering the one with the lowest
#    hold-out RMSE.
results = {}
best_model, best_score = None, float('inf')

for name in models:
    print(f"Evaluating {name}...")
    metrics = evaluate_model(models[name], X_train, X_test, y_train, y_test)
    results[name] = {**metrics}
    if metrics['test_rmse'] < best_score:
        best_model, best_score = name, metrics['test_rmse']

print(f"\nBest model: {best_model} with test RMSE: {best_score:.2f}")

Evaluating OLS...
Evaluating LASSO...


KeyboardInterrupt: 

In [21]:
from sklearn.model_selection import GridSearchCV

# 1. Parameter grid (the `model__` prefix addresses the pipeline's model step)
param_grid = {
    'model__alpha': [0.01, 0.1, 0.5, 1],    # regularisation strength
    'model__l1_ratio': [0.6, 0.7, 0.8]      # L1 / L2 mix
}

# 2. Preprocessing + ElasticNet pipeline
elastic_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', ElasticNet(max_iter=10000))
])

# 3. Grid search with 6-fold cross-validation, scored by (negated) RMSE
elastic_gs = GridSearchCV(elastic_pipe,
                          param_grid,
                          cv=6,
                          scoring='neg_root_mean_squared_error',
                          n_jobs=-1,
                          verbose=1)
print("开始网格搜索...")
elastic_gs.fit(X_train, y_train)

# 4. Report the best parameter combination
print("\n最佳参数组合: ", elastic_gs.best_params_)
print("最佳RMSE: ", -elastic_gs.best_score_)

# 5. Grid-search results as a table, with the `param_model__` prefix removed.
#    Stored under its own name: the `results` dict from the model comparison
#    is still read by the metrics table and the final-prediction cells.
gs_results = pd.DataFrame(elastic_gs.cv_results_)
gs_results = gs_results.rename(columns={
    'param_model__alpha': 'alpha',
    'param_model__l1_ratio': 'l1_ratio'
})

# ========== Coefficient-based feature importance ==========
best_elastic = elastic_gs.best_estimator_
elastic_coefs = best_elastic.named_steps['model'].coef_

# One row per feature, ordered by absolute coefficient
feature_importance = pd.DataFrame({
    'feature': numeric_features,
    'coefficient': elastic_coefs,
    'abs_coef': np.abs(elastic_coefs)
}).sort_values('abs_coef', ascending=False)

print("\n特征重要性排序：")
print(feature_importance.head(15))

In [23]:
# One row per evaluated model, one column per error metric.
metrics_df = pd.DataFrame({
    'Model': list(results),
    **{
        label: [float(entry[key]) for entry in results.values()]
        for label, key in (
            ('In sample MAE', 'train_mae'),
            ('Out of sample MAE', 'test_mae'),
            ('CV MAE', 'cv_mae'),
            ('In sample RMSE', 'train_rmse'),
            ('Out of sample RMSE', 'test_rmse'),
            ('CV RMSE', 'cv_rmse'),
        )
    },
})

print("\nPerformance Metrics:")
print(metrics_df)

In [108]:
# Feature matrix of the competition test set
X_final_test = test_cleaned[features]

# Refit the selected pipeline on the full training data, then predict
best_pipeline = results[best_model]['pipeline']
predictions = best_pipeline.fit(X, y).predict(X_final_test)

In [109]:
# Submission table: consecutive IDs starting at 0 (taken from the default
# row index) next to the predicted price.
submission = (
    pd.DataFrame({'price': predictions})
    .rename_axis('ID')
    .reset_index()
)

submission.to_csv('submission.csv', index=False)
print("\nSubmission file created: submission.csv")