In [29]:
import numpy as np
import pandas as pd
from category_encoders import MEstimateEncoder, TargetEncoder
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import json
import math

# 加载 train.csv 文件
train_file_path = "./data/train.csv"
test_file_path = "./data/test.csv"
train_df = pd.read_csv(train_file_path)
test = pd.read_csv(test_file_path)

train, valid = train_test_split(train_df, test_size=0.2, random_state=42)

# 确认数据加载成功，并查看数据的前几行
print("数据集前几行：")
print(train_df.head())

# 输出列名以确认数据集结构
print("数据集的列名：", train_df.columns)

数据集前几行：
   listing_id                                              title  \
0     1292132  Land Rover Range Rover Velar 3.0A Si6 R-Dynami...   
1     1294696   Mercedes-Benz C-Class C200 Sport Premium Sunroof   
2     1311717              Honda Odyssey 2.4A (COE till 09/2027)   
3     1310068       Toyota Corolla Altis 1.6A (COE till 12/2028)   
4     1325280                     Lexus GS300 (COE till 06/2026)   

            make    model                                        description  \
0     land rover    range  1 owner, no repairs needed! it looks great, in...   
1  mercedes-benz     c200  rare beautiful white c200 sport premium sunroo...   
2          honda  odyssey            comes with warranty. full service done.   
3         toyota    altis                                                  0   
4          lexus       gs  wear and tear done up. well maintained and reg...   

   manufactured original_reg_date     reg_date  type_of_vehicle  \
0        2018.0               NaN  

## Encoding

### Target Encoding

In [30]:
def target_encode_make(df, column, target):
    """使用MEstimateEncoder对make进行目标编码"""
    # 初始化编码器
    encoder = MEstimateEncoder(
        cols=[column],
        m=5.0,  # 平滑参数
    )
    
    # 在全量数据上训练编码器
    encoder.fit(df[[column]], df[target])
    
    # 计算默认值（用于处理未见过的类别）
    default_mean = df[target].mean()
    
    return encoder, default_mean

def apply_target_encoding(df, column, encoder, default_mean):
    """应用编码器到数据集"""
    # 创建数据副本并只保留需要的列
    df_temp = df[[column]].copy()
    
    # 转换数据
    encoded_values = encoder.transform(df_temp)
    
    # 将编码结果添加到原始数据框中
    df[f"{column}_target_encoded"] = encoded_values[column]
    
    return df

### Multi-label Encoding

In [31]:
def encode_categories_train(df, column_name):
    """在训练数据上初始化并应用MultiLabelBinarizer，返回编码器以备未来使用"""
    # 将字符串转换为列表，每个类别作为列表的一个元素
    df[f"{column_name}_list"] = df[column_name].apply(lambda x: x.split(', '))

    # 初始化MultiLabelBinarizer
    mlb = MultiLabelBinarizer()

    # 使用MultiLabelBinarizer进行编码
    mlb.fit_transform(df[f"{column_name}_list"])

    return mlb

def apply_categories_encoding(df, column_name, mlb):
    """应用已保存的MultiLabelBinarizer到新DataFrame"""
    # 转换字符串为列表
    df[f"{column_name}_list"] = df[column_name].apply(lambda x: x.split(', '))

    # 使用MultiLabelBinarizer进行编码
    df_encoded = mlb.transform(df[f"{column_name}_list"])

    # 转换回DataFrame并添加列名
    df_encoded = pd.DataFrame(df_encoded, columns=mlb.classes_, index=df.index)

    # 将编码后的DataFrame合并到原始DataFrame
    df = pd.concat([df, df_encoded], axis=1)
    df = df.drop(columns=[column_name, f"{column_name}_list"])

    return df

### One-hot Encoding

In [32]:
def onehot_encode_columns_train(df, columns):
    """在训练数据上初始化并应用OneHotEncoder，返回编码器以备未来使用"""
    encoders = {}
    for column in columns:
        onehot_encoder = OneHotEncoder()
        # 注意这里转换为DataFrame是为了保持输入格式一致
        df_encoded = onehot_encoder.fit_transform(df[[column]])
        encoders[column] = onehot_encoder
    return encoders

def apply_onehot_encoding(df, columns, encoders):
    """应用已保存的OneHotEncoder到新DataFrame"""
    for column in columns:
        # 使用已保存的编码器进行transform操作，并转换为数组
        df_encoded = encoders[column].transform(df[[column]]).toarray()

        # 转换回DataFrame，列名使用encoder中的类别名称
        df_encoded = pd.DataFrame(df_encoded, columns=encoders[column].get_feature_names_out([column]), index=df.index)

        # 将编码后的DataFrame合并到原始DataFrame
        df = pd.concat([df, df_encoded], axis=1)
        df = df.drop(columns=column)

    return df

## Data Proecessing

In [33]:
import json

# 定义列名
columns_dict = {
    'del_cols': ['listing_id', 'original_reg_date','opc_scheme', 'lifespan','eco_category', 'indicative_price'],
    'text_cols': ['title', 'description', 'features', 'accessories'],
    'date_cols': ['reg_date'],
    'numeric_cols': ['manufactured', 'curb_weight', 'power', 'engine_cap', 'depreciation', 'coe', 'road_tax', 'dereg_value', 'mileage', 'omv', 'arf', 'year', 'month'],
    'log_cols': ['manufactured', 'curb_weight', 'power_log', 'engine_cap_log', 'depreciation_log', 'coe', 'road_tax_log', 'dereg_value_log', 'mileage_log', 'omv_log', 'arf_log', 'year', 'month'],
    'root_cols': ['manufactured', 'curb_weight', 'power_root', 'engine_cap_root', 'depreciation_root', 'coe', 'road_tax_root', 'dereg_value_root', 'mileage_root', 'omv_root', 'arf_root', 'year', 'month'],
    'categorical_cols': ['make', 'model', 'type_of_vehicle', 'category', 'transmission', 'fuel_type', 'no_of_owners']
}

# 保存到JSON文件
with open('./data/columns.json', 'w') as f:
    json.dump(columns_dict, f, indent=4)
print("列名配置已成功保存到 ./data/columns.json")

# 从字典中读取列名
del_cols = columns_dict['del_cols']
text_cols = columns_dict['text_cols'] 
date_cols = columns_dict['date_cols']
numeric_cols = columns_dict['numeric_cols']
log_cols = columns_dict['log_cols']
root_cols = columns_dict['root_cols']
categorical_cols = columns_dict['categorical_cols']

def get_maxmin_dict(data, numeric_cols):
    max_dict = dict()
    min_dict = dict()
    for feature in numeric_cols:
        max_dict[feature] = data[feature].max()
        min_dict[feature] = data[feature].min()
    return max_dict, min_dict

列名配置已成功保存到 ./data/columns.json


In [34]:
def preprocess_data_cat(data, del_cols, text_cols, target_encoder, default_mean, mlb_encoder, onehot_encoders):
    """处理分类特征"""
    data = data.drop(columns=del_cols)
    data = data.drop(columns=text_cols)
    
    # 应用目标编码
    data = apply_target_encoding(data, 'make', target_encoder, default_mean)

    # 应用多标签二值化编码
    data = apply_categories_encoding(data, 'category', mlb_encoder)

    # 应用OneHot编码
    data = apply_onehot_encoding(data, ['type_of_vehicle', 'fuel_type', 'transmission'], onehot_encoders)

    # 处理日期特征
    data['reg_date'] = pd.to_datetime(data['reg_date'], format='%d-%b-%Y')  
    data['year'] = data['reg_date'].dt.year
    data['month'] = data['reg_date'].dt.month
    data = data.drop(columns='reg_date')
    data['no_of_owners'] = data['no_of_owners'].fillna(2)
    
    return data


def preprocess_data_num(data, max_dict, min_dict, 
                       remove_outliers=False,
                       do_normalize=False, 
                       normalize_method='standard'):
    """
    处理数值特征
    
    参数:
    - data: DataFrame, 输入数据
    - max_dict: dict, 存储最大值/均值
    - min_dict: dict, 存储最小值/标准差
    - remove_outliers: bool, 是否移除异常值
    - do_normalize: bool, 是否进行归一化
    - normalize_method: str, 归一化方法 ('standard' 或 'minmax')
    """
    # 1. 填充缺失值
    for feature in numeric_cols:
        data[feature] = data[feature].fillna(data[feature].median())
    
    # 2. 创建异常值掩码
    mask = ~((data[numeric_cols] - data[numeric_cols].mean()).abs() > 3 * data[numeric_cols].std()).any(axis=1)
    
    # 如果需要移除异常值
    if remove_outliers:
        data = data[mask]
    
    # 3. 特征转换
    # 对长尾特征进行对数变换和平方根变换
    long_tail_features = ['omv', 'arf', 'depreciation', 'dereg_value', 'power', 'engine_cap', 'road_tax', 'mileage']
    for feature in long_tail_features:
        data[f'{feature}_log'] = np.log1p(data[feature])
        data[f'{feature}_root'] = np.sqrt(data[feature])

    
    # 4. 归一化处理
    if do_normalize:
        # 归一化原始特征
        for feature in numeric_cols:
            if normalize_method == 'standard':
                max_dict[f"{feature}_mean"] = data[feature].mean()
                min_dict[f"{feature}_std"] = data[feature].std()
                data[feature] = (data[feature] - max_dict[f"{feature}_mean"]) / min_dict[f"{feature}_std"]
            else:  # minmax
                max_dict[feature] = data[feature].max()
                min_dict[feature] = data[feature].min()
                data[feature] = (data[feature] - min_dict[feature]) / (max_dict[feature] - min_dict[feature])
        
        # 归一化对数变换和平方根变换后的特征
        for feature in long_tail_features:
            log_name = f'{feature}_log'
            root_name = f'{feature}_root'
            
            if normalize_method == 'standard':
                # 存储均值和标准差
                max_dict[f"{log_name}_mean"] = data[log_name].mean()
                min_dict[f"{log_name}_std"] = data[log_name].std()
                max_dict[f"{root_name}_mean"] = data[root_name].mean()
                min_dict[f"{root_name}_std"] = data[root_name].std()
                
                # 标准化
                data[log_name] = (data[log_name] - max_dict[f"{log_name}_mean"]) / min_dict[f"{log_name}_std"]
                data[root_name] = (data[root_name] - max_dict[f"{root_name}_mean"]) / min_dict[f"{root_name}_std"]
            else:  # minmax
                # 存储最大最小值
                max_dict[log_name] = data[log_name].max()
                min_dict[log_name] = data[log_name].min()
                max_dict[root_name] = data[root_name].max()
                min_dict[root_name] = data[root_name].min()
                
                # 最小最大归一化
                data[log_name] = (data[log_name] - min_dict[log_name]) / (max_dict[log_name] - min_dict[log_name])
                data[root_name] = (data[root_name] - min_dict[root_name]) / (max_dict[root_name] - min_dict[root_name])
    
    return data, mask if remove_outliers else None

In [35]:
do_normalize = False
normalize_method='minmax'

# 第一部分：使用train数据训练模型
print("第一部分：使用train数据进行预处理...")
X_train, y_train = train.drop(columns=['price']), train['price']
X_valid, y_valid = valid.drop(columns=['price']), valid['price']

# 使用train数据生成encoders
target_encoder_train, default_mean_train = target_encode_make(train, 'make', 'price')
mlb_encoder_train = encode_categories_train(train, 'category')
onehot_encoders_train = onehot_encode_columns_train(train, ['type_of_vehicle', 'fuel_type', 'transmission'])

# 处理分类特征
X_train = preprocess_data_cat(X_train, del_cols, text_cols, target_encoder_train, default_mean_train, mlb_encoder_train, onehot_encoders_train)
X_valid = preprocess_data_cat(X_valid, del_cols, text_cols, target_encoder_train, default_mean_train, mlb_encoder_train, onehot_encoders_train)

# 处理数值特征
max_dict_train, min_dict_train = get_maxmin_dict(X_train, numeric_cols)
X_train, mask = preprocess_data_num(X_train, max_dict_train, min_dict_train, do_normalize=do_normalize, normalize_method=normalize_method, remove_outliers=False)
if mask is not None:
    y_train = y_train[mask]
X_valid, mask = preprocess_data_num(X_valid, max_dict_train, min_dict_train, do_normalize=do_normalize, normalize_method=normalize_method, remove_outliers=False)

# 保存训练用数据
import os
os.makedirs('data/processed', exist_ok=True)
X_train.to_csv('data/processed/X_train.csv', index=False)
y_train.to_csv('data/processed/y_train.csv', index=False)
X_valid.to_csv('data/processed/X_valid.csv', index=False)
y_valid.to_csv('data/processed/y_valid.csv', index=False)
print('训练数据已保存到 data/processed/ 目录下')

# 第二部分：使用全量数据进行预处理...
print("第二部分：使用全量数据进行预处理...")
X_test = test

# 1. 先处理全量训练数据
X_train_full = train_df.drop(columns=['price'])
y_train_full = train_df['price']

# 2. 使用全量训练数据生成encoders
target_encoder_full, default_mean_full = target_encode_make(train_df, 'make', 'price')
mlb_encoder_full = encode_categories_train(train_df, 'category')
onehot_encoders_full = onehot_encode_columns_train(train_df, ['type_of_vehicle', 'fuel_type', 'transmission'])

# 3. 处理分类特征
X_train_full = preprocess_data_cat(X_train_full, del_cols, text_cols, target_encoder_full, default_mean_full, mlb_encoder_full, onehot_encoders_full)
X_test = preprocess_data_cat(X_test, del_cols, text_cols, target_encoder_full, default_mean_full, mlb_encoder_full, onehot_encoders_full)

# 4. 获取最大最小值字典（使用处理后的全量训练数据）
max_dict_full, min_dict_full = get_maxmin_dict(X_train_full, numeric_cols)

# 5. 处理数值特征
X_train_full, mask = preprocess_data_num(X_train_full, max_dict_full, min_dict_full, do_normalize=do_normalize, normalize_method=normalize_method, remove_outliers=False)
if mask is not None:
    y_train_full = y_train_full[mask]
X_test, mask = preprocess_data_num(X_test, max_dict_full, min_dict_full, do_normalize=do_normalize, normalize_method=normalize_method, remove_outliers=False)

# 保存处理后的全量训练数据和测试数据
X_train_full.to_csv('data/processed/X_train_full.csv', index=False)
y_train_full.to_csv('data/processed/y_train_full.csv', index=False)
X_test.to_csv('data/processed/X_test.csv', index=False)
print('全量训练数据和测试数据已保存到 data/processed/ 目录下')


第一部分：使用train数据进行预处理...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{feature}_log'] = np.log1p(data[feature])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{feature}_root'] = np.sqrt(data[feature])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{feature}_log'] = np.log1p(data[feature])
A value is trying to be set on a copy of a slice from a Da

训练数据已保存到 data/processed/ 目录下
第二部分：使用全量数据进行预处理...
全量训练数据和测试数据已保存到 data/processed/ 目录下
