In [61]:
# Some magic so that the notebook will reload the external python script file any time you edit and save the .py file;
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [62]:
import numpy as np
import pandas as pd
from category_encoders import MEstimateEncoder, TargetEncoder
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import json
import math

In [63]:
# 加载 train.csv 文件
train_file_path = "./train.csv"
test_file_path = "./test.csv"
train_df = pd.read_csv(train_file_path)
test = pd.read_csv(test_file_path)

train, valid = train_test_split(train_df, test_size=0.2, random_state=42)

# 确认数据加载成功，并查看数据的前几行
print("数据集前几行：")
print(train_df.head())

数据集前几行：
   listing_id                                              title           make    model                                        description  manufactured original_reg_date     reg_date  type_of_vehicle                                  category transmission  curb_weight  power fuel_type  engine_cap  no_of_owners  depreciation    coe  road_tax  dereg_value   mileage      omv       arf opc_scheme lifespan   eco_category                                           features                                        accessories  indicative_price     price
0     1292132  Land Rover Range Rover Velar 3.0A Si6 R-Dynami...     land rover    range  1 owner, no repairs needed! it looks great, in...        2018.0               NaN  08-mar-2018              suv                                  parf car         auto       1884.0  280.0       NaN      2995.0           2.0       34270.0  48011    2380.0     103323.0   96000.0  88906.0  132031.0        NaN      NaN  uncategorized  3l supercharged v6 

In [64]:
# 输出列名以确认数据集结构
print("数据集的列名：", train_df.columns)

数据集的列名： Index(['listing_id', 'title', 'make', 'model', 'description', 'manufactured', 'original_reg_date', 'reg_date', 'type_of_vehicle', 'category', 'transmission', 'curb_weight', 'power', 'fuel_type', 'engine_cap', 'no_of_owners', 'depreciation', 'coe', 'road_tax', 'dereg_value', 'mileage', 'omv', 'arf', 'opc_scheme', 'lifespan', 'eco_category', 'features', 'accessories', 'indicative_price', 'price'], dtype='object')


In [65]:
# 检查目标列是否存在
required_columns = ['make', 'type_of_vehicle', 'category', 'transmission', 'price']  # 需要处理的列
missing_columns = [col for col in required_columns if col not in train_df.columns]
if missing_columns:
    raise ValueError(f"数据集中缺少以下列：{missing_columns}")


In [66]:
# 3. 使用交叉验证来对 `make` 进行目标编码，防止数据泄漏
def target_encode_cross_validation(df, column, target, n_splits=5):
    """对类别特征进行目标编码，并使用交叉验证防止数据泄漏"""
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    overall_mean = df[target].mean()  # 计算总体均值（用于平滑）
    category_means = {}

    for train_index, val_index in kf.split(df):
        train_fold = df.iloc[train_index]
        mean_encoded = train_fold.groupby(column)[target].mean()
        category_means.update(mean_encoded)

    # 保存所有类别的平均值，如果类别未在某个折中出现，使用总体均值作为回退
    category_means = {k: v for k, v in category_means.items()}
    default_mean = overall_mean

    return category_means, default_mean

def apply_target_encoding(df, column, encoded_dict, default_mean):
    """应用保存的目标编码到新的DataFrame"""
    df[f"{column}_target_encoded"] = df[column].map(encoded_dict).fillna(default_mean)
    if df[f"{column}_target_encoded"].isnull().any():
        df[f"{column}_target_encoded"].fillna(default_mean, inplace=True)
    return df


In [67]:
# # ============= 2. 对 `category` 使用频率编码 ============= #
# # 计算每个类别的频率
# category_frequency = train_df['category'].value_counts(normalize=True)
# # 将 `category` 映射为其在数据集中的出现频率
# train_df['category_frequency_encoded'] = train_df['category'].map(category_frequency)
# print("\n`category` 频率编码后的结果：")
# print(train_df[['category', 'category_frequency_encoded']].head())

In [68]:
def encode_categories_train(df, column_name):
    """在训练数据上初始化并应用MultiLabelBinarizer，返回编码器以备未来使用"""
    # 将字符串转换为列表，每个类别作为列表的一个元素
    df[f"{column_name}_list"] = df[column_name].apply(lambda x: x.split(', '))

    # 初始化MultiLabelBinarizer
    mlb = MultiLabelBinarizer()

    # 使用MultiLabelBinarizer进行编码
    mlb.fit_transform(df[f"{column_name}_list"])

    return mlb

def apply_categories_encoding(df, column_name, mlb):
    """应用已保存的MultiLabelBinarizer到新DataFrame"""
    # 转换字符串为列表
    df[f"{column_name}_list"] = df[column_name].apply(lambda x: x.split(', '))

    # 使用MultiLabelBinarizer进行编码
    df_encoded = mlb.transform(df[f"{column_name}_list"])

    # 转换回DataFrame并添加列名
    df_encoded = pd.DataFrame(df_encoded, columns=mlb.classes_, index=df.index)

    # 将编码后的DataFrame合并到原始DataFrame
    df = pd.concat([df, df_encoded], axis=1)
    df = df.drop(columns=[column_name, f"{column_name}_list"])

    return df




In [69]:
# # ============= 3. 对 `type_of_vehicle` 使用标签编码 ============= #
# label_encoder = LabelEncoder()  # 初始化标签编码器
# train_df['type_of_vehicle_label_encoded'] = label_encoder.fit_transform(train_df['type_of_vehicle'])
# print("\n`type_of_vehicle` 标签编码后的结果：")
# print(train_df[['type_of_vehicle', 'type_of_vehicle_label_encoded']].head())

In [70]:
def onehot_encode_columns_train(df, columns):
    """在训练数据上初始化并应用OneHotEncoder，返回编码器以备未来使用"""
    encoders = {}
    for column in columns:
        onehot_encoder = OneHotEncoder()
        # 注意这里转换为DataFrame是为了保持输入格式一致
        df_encoded = onehot_encoder.fit_transform(df[[column]])
        encoders[column] = onehot_encoder
    return encoders

def apply_onehot_encoding(df, columns, encoders):
    """应用已保存的OneHotEncoder到新DataFrame"""
    for column in columns:
        # 使用已保存的编码器进行transform操作，并转换为数组
        df_encoded = encoders[column].transform(df[[column]]).toarray()

        # 转换回DataFrame，列名使用encoder中的类别名称
        df_encoded = pd.DataFrame(df_encoded, columns=encoders[column].get_feature_names_out([column]), index=df.index)

        # 将编码后的DataFrame合并到原始DataFrame
        df = pd.concat([df, df_encoded], axis=1)
        df = df.drop(columns=column)

    return df



In [71]:
del_cols = ['listing_id', 'original_reg_date','opc_scheme', 'lifespan','eco_category', 'indicative_price']
text_cols = ['title', 'description', 'features', 'accessories'] # may need further processing
date_cols = ['reg_date'] # need to transform
numeric_cols = ['manufactured', 'curb_weight', 'power', 'engine_cap', 'depreciation', 'coe', 'road_tax', 'dereg_value', 'mileage', 'omv', 'arf', 'year', 'month']
log_cols = ['manufactured', 'curb_weight', 'power_log', 'engine_cap_log', 'depreciation_log', 'coe', 'road_tax_log', 'dereg_value_log', 'mileage_log', 'omv_log', 'arf_log', 'year', 'month']
root_cols = ['manufactured', 'curb_weight', 'power_root', 'engine_cap_root', 'depreciation_root', 'coe', 'road_tax_root', 'dereg_value_root', 'mileage_root', 'omv_root', 'arf_root', 'year', 'month']
categorical_cols = ['make', 'model', 'type_of_vehicle', 'category', 'transmission', 'fuel_type', 'no_of_owners']

In [72]:
def get_maxmin_dict(data, numeric_cols):
    max_dict = dict()
    min_dict = dict()
    for feature in numeric_cols:
        max_dict[feature] = data[feature].max()
        min_dict[feature] = data[feature].min()
    return max_dict, min_dict


In [73]:
def preprocess_data_cat(data, del_cols, text_cols, target_encoder, default_mean, mlb_encoder, onehot_encoders):
    data = data.drop(columns=del_cols)
    data = data.drop(columns=text_cols)
    # 应用目标编码
    data = apply_target_encoding(data, 'make', target_encoder, default_mean)

    # 应用多标签二值化编码
    data = apply_categories_encoding(data, 'category', mlb_encoder)

    # 应用OneHot编码
    data = apply_onehot_encoding(data, ['type_of_vehicle', 'fuel_type', 'transmission'], onehot_encoders)

    data['reg_date'] = pd.to_datetime(data['reg_date'], format='%d-%b-%Y')  
    data['year'] = data['reg_date'].dt.year
    data['month'] = data['reg_date'].dt.month
    data = data.drop(columns='reg_date')
    data['no_of_owners'].fillna(2, inplace=True)
    return data
    
def preprocess_data_num(data, max_dict, min_dict):
    for feature in numeric_cols:
        data[feature] = data[feature].fillna(data[feature].median())

    long_tail_features = ['omv', 'arf', 'depreciation', 'dereg_value']
    for feature in long_tail_features:
        data[f'{feature}_log'] = np.log1p(data[feature])
        data[f'{feature}_root'] = np.sqrt(data[feature])

    root_transform_features = ['power', 'engine_cap', 'road_tax', 'mileage']
    for feature in root_transform_features:
        data[f'{feature}_root'] = np.sqrt(data[feature])
        data[f'{feature}_log'] = np.log1p(data[feature])
        
    data[feature] = (data[feature] - data[feature].min()) / (data[feature].max() - data[feature].min())
    if feature in long_tail_features or feature in root_transform_features:
        max_dict[f'{feature}_log'] = data[f'{feature}_log'].max()
        min_dict[f'{feature}_log'] = data[f'{feature}_log'].min()
        data[f'{feature}_log'] = (data[f'{feature}_log'] - data[f'{feature}_log'].min()) / (data[f'{feature}_log'].max() - data[f'{feature}_log'].min())
        max_dict[f'{feature}_root'] = data[f'{feature}_root'].max()
        min_dict[f'{feature}_root'] = data[f'{feature}_root'].min()
        data[f'{feature}_root'] = (data[f'{feature}_root'] - data[f'{feature}_root'].min()) / (data[f'{feature}_root'].max() - data[f'{feature}_root'].min())
    return data

In [74]:
X_train, y_train  = train.drop(columns=['price']), train['price']
X_valid, y_valid = valid.drop(columns=['price']), valid['price']
X_test = test

target_encoder, default_mean = target_encode_cross_validation(train, 'make', 'price')
mlb_encoder = encode_categories_train(train, 'category')
onehot_encoders = onehot_encode_columns_train(train, ['type_of_vehicle', 'fuel_type', 'transmission'])


X_train = preprocess_data_cat(X_train, del_cols, text_cols, target_encoder, default_mean, mlb_encoder, onehot_encoders)
X_valid = preprocess_data_cat(X_valid, del_cols, text_cols, target_encoder, default_mean, mlb_encoder, onehot_encoders)
X_test = preprocess_data_cat(X_test, del_cols, text_cols, target_encoder, default_mean, mlb_encoder, onehot_encoders)




In [75]:
max_dict, min_dict = get_maxmin_dict(X_train, numeric_cols)

X_train = preprocess_data_num(X_train, max_dict, min_dict)
X_valid = preprocess_data_num(X_valid, max_dict, min_dict)
X_test = preprocess_data_num(X_test, max_dict, min_dict)

In [76]:
# # 5. 保存处理后的数据集
# processed_file_path = "./processed_train.csv"
# data.to_csv(processed_file_path, index=False)
# print(f"处理后的数据已保存至：{processed_file_path}")

# with open('max.json', 'w') as f:
#     json.dump(max_dict, f)
    
# with open('min.json', 'w') as f:
#     json.dump(min_dict, f)

In [77]:

for li in [numeric_cols, log_cols, root_cols]:
    # Assuming 'data' is your DataFrame and 'target' is the name of your target variable
    X = X_train[li]  # Features
    y = y_train       # Target variable

    # Creating the linear regression model
    model = LinearRegression()

    # Fitting the model
    model.fit(X, y)

    # Making predictions
    y_valid_pred = model.predict(X_valid[li])

    # Calculating the performance metrics
    mse = mean_squared_error(y_valid_pred, y_valid)
    r2 = r2_score(y_valid_pred, y_valid)
    # Calculating the RMSE
    rmse = math.sqrt(mse)

    # Printing the MSE, RMSE, and R² Score
    print(f'Mean Squared Error: {mse}')
    print(f'Root Mean Squared Error: {rmse}')
    print(f'R² Score: {r2}')

Mean Squared Error: 2656064623.4808936
Root Mean Squared Error: 51537.021872445184
R² Score: 0.8690690352618394
Mean Squared Error: 7314861554.317848
Root Mean Squared Error: 85526.96390213935
R² Score: 0.5415917674362611
Mean Squared Error: 4250450342.5900874
Root Mean Squared Error: 65195.47793052895
R² Score: 0.7781100192668879


In [78]:
cat_nu_cols = ["manufactured", "curb_weight", "power", "engine_cap", "no_of_owners", "depreciation", "coe", "road_tax", "dereg_value", "mileage", "omv", "arf", "make_target_encoded", "-", "almost new car", "coe car", "consignment car", "direct owner sale", "electric cars", "hybrid cars", "imported used vehicle", "low mileage car", "opc car", "parf car", "premium ad car", "rare & exotic", "sgcarmart warranty cars", "sta evaluated car", "vintage cars", "type_of_vehicle_bus/mini bus", "type_of_vehicle_hatchback", "type_of_vehicle_luxury sedan", "type_of_vehicle_mid-sized sedan", "type_of_vehicle_mpv", "type_of_vehicle_others", "type_of_vehicle_sports car", "type_of_vehicle_stationwagon", "type_of_vehicle_suv", "type_of_vehicle_truck", "type_of_vehicle_van", "fuel_type_diesel", "fuel_type_diesel-electric", "fuel_type_electric", "fuel_type_petrol", "fuel_type_petrol-electric", "fuel_type_nan", "transmission_manual", "year", "month"]
cat_log_cols = ["manufactured", "curb_weight", "power_log", "engine_cap_log", "depreciation_log", "coe", "road_tax_log", "dereg_value_log", "mileage_log", "omv_log", "arf_log", "make_target_encoded", "-", "almost new car", "coe car", "consignment car", "direct owner sale", "electric cars", "hybrid cars", "imported used vehicle", "low mileage car", "opc car", "parf car", "premium ad car", "rare & exotic", "sgcarmart warranty cars", "sta evaluated car", "vintage cars", "type_of_vehicle_bus/mini bus", "type_of_vehicle_hatchback", "type_of_vehicle_luxury sedan", "type_of_vehicle_mid-sized sedan", "type_of_vehicle_mpv", "type_of_vehicle_others", "type_of_vehicle_sports car", "type_of_vehicle_stationwagon", "type_of_vehicle_suv", "type_of_vehicle_truck", "type_of_vehicle_van", "fuel_type_diesel", "fuel_type_diesel-electric", "fuel_type_electric", "fuel_type_petrol", "fuel_type_petrol-electric", "fuel_type_nan", "transmission_manual", "year", "month"]
cat_root_cols = ["manufactured", "curb_weight", "power_root", "engine_cap_root", "depreciation_root", "coe", "road_tax_root", "dereg_value_root", "mileage_root", "omv_root", "arf_root", "make_target_encoded", "-", "almost new car", "coe car", "consignment car", "direct owner sale", "electric cars", "hybrid cars", "imported used vehicle", "low mileage car", "opc car", "parf car", "premium ad car", "rare & exotic", "sgcarmart warranty cars", "sta evaluated car", "vintage cars", "type_of_vehicle_bus/mini bus", "type_of_vehicle_hatchback", "type_of_vehicle_luxury sedan", "type_of_vehicle_mid-sized sedan", "type_of_vehicle_mpv", "type_of_vehicle_others", "type_of_vehicle_sports car", "type_of_vehicle_stationwagon", "type_of_vehicle_suv", "type_of_vehicle_truck", "type_of_vehicle_van", "fuel_type_diesel", "fuel_type_diesel-electric", "fuel_type_electric", "fuel_type_petrol", "fuel_type_petrol-electric", "fuel_type_nan", "transmission_manual", "year", "month"]

In [79]:
# mean_value = data['make_target_encoded'].mean()
# data['make_target_encoded'].fillna(mean_value, inplace=True)
# data['no_of_owners'].fillna(1, inplace=True)


In [80]:
# 设置显示最大行数
pd.set_option('display.max_rows', None)  # None表示显示所有行，或者设置一个具体的数字

# 设置显示最大列数
pd.set_option('display.max_columns', None)  # None表示显示所有列，或者设置一个具体的数字

# 也可以设置显示的宽度以适应你的屏幕
pd.set_option('display.width', 1000)  # 根据你的屏幕大小调整这个数值
missing_values = X_train.isnull().sum()
missing_percentages = (missing_values / len(X_train)) * 100
missing_info = pd.DataFrame({'Number of missing values': missing_values, 'Percentage of missing values (%)': missing_percentages})
print(missing_info)

                                 Number of missing values  Percentage of missing values (%)
make                                                 1029                             5.145
model                                                   0                             0.000
manufactured                                            0                             0.000
curb_weight                                             0                             0.000
power                                                   0                             0.000
engine_cap                                              0                             0.000
no_of_owners                                            0                             0.000
depreciation                                            0                             0.000
coe                                                     0                             0.000
road_tax                                                0                       

In [21]:
for li in [cat_nu_cols, cat_log_cols, cat_root_cols]:
    # Assuming 'data' is your DataFrame and 'target' is the name of your target variable
    X = X_train[li]  # Features
    y = y_train       # Target variable

    # Creating the linear regression model
    model = LinearRegression()

    # Fitting the model
    model.fit(X, y)

    # Making predictions
    y_valid_pred = model.predict(X_valid[li])

    # Calculating the performance metrics
    mse = mean_squared_error(y_valid_pred, y_valid)
    r2 = r2_score(y_valid_pred, y_valid)
    # Calculating the RMSE
    rmse = math.sqrt(mse)

    # Printing the MSE, RMSE, and R² Score
    print(f'Mean Squared Error: {mse}')
    print(f'Root Mean Squared Error: {rmse}')
    print(f'R² Score: {r2}')

Mean Squared Error: 2056361436.0400398
Root Mean Squared Error: 45347.12158494781
R² Score: 0.8987369549232052
Mean Squared Error: 5156732202.024495
Root Mean Squared Error: 71810.39062715434
R² Score: 0.7166246713038464
Mean Squared Error: 3182860593.3100944
Root Mean Squared Error: 56416.84671541023
R² Score: 0.8382097049874158


In [22]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class MLPModel(nn.Module):
    def __init__(self, input_dim):
        super(MLPModel, self).__init__()
        self.layer1 = nn.Linear(input_dim, 128)
        self.layer2 = nn.Linear(128, 64)
        self.layer3 = nn.Linear(64, 32)
        self.output_layer = nn.Linear(32, 1)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        x = self.relu(self.layer1(x))
        x = self.relu(self.layer2(x))
        x = self.relu(self.layer3(x))
        x = self.output_layer(x)
        return x

class AttentionModel(nn.Module):
    def __init__(self, input_dim):
        super(AttentionModel, self).__init__()
        self.attention = nn.MultiheadAttention(embed_dim=input_dim, num_heads=7)
        self.layer1 = nn.Linear(input_dim, 64)
        self.layer2 = nn.Linear(64, 32)
        self.output_layer = nn.Linear(32, 1)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        x = x.unsqueeze(0)
        x, _ = self.attention(x, x, x)  # 注意力机制要求输入、键和值为相同
        x = x.mean(dim=0)  # 取均值或其他汇总操作
        x = self.relu(self.layer1(x))
        x = self.relu(self.layer2(x))
        x = self.output_layer(x)
        return x



In [23]:
def train_network(model, cat_nu_cols, X_train, y_train, X_valid, y_valid):


    # 损失函数和优化器
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # 数据转换为torch.Tensor
    X_train_tensor = torch.tensor(X_train[cat_nu_cols].values, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    X_valid_tensor = torch.tensor(X_valid[cat_nu_cols].values, dtype=torch.float32)
    y_valid_tensor = torch.tensor(y_valid.values, dtype=torch.float32).view(-1, 1)

    epochs = 50
    for epoch in range(epochs):
        model.train()
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

        print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')

        model.eval()
    
        # 关闭梯度计算
        with torch.no_grad():
            y_test_pred_mlp = model(X_valid_tensor)
    
        mse_test_mlp = mean_squared_error(y_valid_tensor.numpy(), y_test_pred_mlp.numpy())
        rmse_test_mlp = np.sqrt(mse_test_mlp)
        r2_test_mlp = r2_score(y_valid_tensor.numpy(), y_test_pred_mlp.numpy())
    
        #print(f'Network Valid Mean Squared Error: {mse_test_mlp}')
        print(f'Network Valid Root Mean Squared Error: {rmse_test_mlp}')
        #print(f'Network Valid R² Score: {r2_test_mlp}')

In [24]:

input_dim = len(cat_nu_cols)
print(input_dim)
model = MLPModel(input_dim)
train_network(model, cat_nu_cols, X_train, y_train, X_valid, y_valid)
model = AttentionModel(input_dim)
train_network(model, cat_nu_cols, X_train, y_train, X_valid, y_valid)


49
Epoch 1/50, Loss: 4441783296.0
Network Valid Root Mean Squared Error: 62345.01953125
Epoch 2/50, Loss: 1860846208.0
Network Valid Root Mean Squared Error: 50882.82421875
Epoch 3/50, Loss: 97548206080.0
Network Valid Root Mean Squared Error: 47278.6640625
Epoch 4/50, Loss: 890583296.0
Network Valid Root Mean Squared Error: 56989.15234375
Epoch 5/50, Loss: 13830218752.0
Network Valid Root Mean Squared Error: 46056.5546875
Epoch 6/50, Loss: 1297167616.0
Network Valid Root Mean Squared Error: 44195.1796875
Epoch 7/50, Loss: 654461504.0
Network Valid Root Mean Squared Error: 47196.54296875
Epoch 8/50, Loss: 995539648.0
Network Valid Root Mean Squared Error: 42680.25
Epoch 9/50, Loss: 443205344.0
Network Valid Root Mean Squared Error: 40867.50390625
Epoch 10/50, Loss: 2198498048.0
Network Valid Root Mean Squared Error: 39249.29296875
Epoch 11/50, Loss: 279367104.0
Network Valid Root Mean Squared Error: 36655.05078125
Epoch 12/50, Loss: 549477312.0
Network Valid Root Mean Squared Error: 36