In [1]:
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import json
import math

# 读取处理后的数据集
X_train = pd.read_csv('data/processed/X_train.csv')
y_train = pd.read_csv('data/processed/y_train.csv')
X_valid = pd.read_csv('data/processed/X_valid.csv')
y_valid = pd.read_csv('data/processed/y_valid.csv') 
X_train_full = pd.read_csv('data/processed/X_train_full.csv') 
y_train_full = pd.read_csv('data/processed/y_train_full.csv') 
X_test = pd.read_csv('data/processed/X_test.csv')

print('数据集读取完成')
print(f'训练集大小: {X_train.shape}')
print(f'验证集大小: {X_valid.shape}')
print(f'全量训练集大小: {X_train_full.shape}')
print(f'测试集大小: {X_test.shape}')


数据集读取完成
训练集大小: (20000, 68)
验证集大小: (5000, 68)
全量训练集大小: (25000, 68)
测试集大小: (10000, 68)


In [2]:
# 从JSON文件读取列名配置
with open('./data/columns.json', 'r') as f:
    columns_dict = json.load(f)

# 从字典中读取列名
del_cols = columns_dict['del_cols']
text_cols = columns_dict['text_cols'] 
date_cols = columns_dict['date_cols']
numeric_cols = columns_dict['numeric_cols']
log_cols = columns_dict['log_cols']
root_cols = columns_dict['root_cols']
categorical_cols = columns_dict['categorical_cols']

# 定义变换列
cat_nu_cols = ["manufactured", "curb_weight", "power", "engine_cap", "no_of_owners", "depreciation", "coe", "road_tax", "dereg_value", "mileage", "omv", "arf", "make_target_encoded", "-", "almost new car", "coe car", "consignment car", "direct owner sale", "electric cars", "hybrid cars", "imported used vehicle", "low mileage car", "opc car", "parf car", "premium ad car", "rare & exotic", "sgcarmart warranty cars", "sta evaluated car", "vintage cars", "type_of_vehicle_bus/mini bus", "type_of_vehicle_hatchback", "type_of_vehicle_luxury sedan", "type_of_vehicle_mid-sized sedan", "type_of_vehicle_mpv", "type_of_vehicle_others", "type_of_vehicle_sports car", "type_of_vehicle_stationwagon", "type_of_vehicle_suv", "type_of_vehicle_truck", "type_of_vehicle_van", "fuel_type_diesel", "fuel_type_diesel-electric", "fuel_type_electric", "fuel_type_petrol", "fuel_type_petrol-electric", "fuel_type_nan", "transmission_manual", "year", "month"]
cat_log_cols = ["manufactured", "curb_weight", "power_log", "engine_cap_log", "depreciation_log", "coe", "road_tax_log", "dereg_value_log", "mileage_log", "omv_log", "arf_log", "make_target_encoded", "-", "almost new car", "coe car", "consignment car", "direct owner sale", "electric cars", "hybrid cars", "imported used vehicle", "low mileage car", "opc car", "parf car", "premium ad car", "rare & exotic", "sgcarmart warranty cars", "sta evaluated car", "vintage cars", "type_of_vehicle_bus/mini bus", "type_of_vehicle_hatchback", "type_of_vehicle_luxury sedan", "type_of_vehicle_mid-sized sedan", "type_of_vehicle_mpv", "type_of_vehicle_others", "type_of_vehicle_sports car", "type_of_vehicle_stationwagon", "type_of_vehicle_suv", "type_of_vehicle_truck", "type_of_vehicle_van", "fuel_type_diesel", "fuel_type_diesel-electric", "fuel_type_electric", "fuel_type_petrol", "fuel_type_petrol-electric", "fuel_type_nan", "transmission_manual", "year", "month"]
cat_root_cols = ["manufactured", "curb_weight", "power_root", "engine_cap_root", "depreciation_root", "coe", "road_tax_root", "dereg_value_root", "mileage_root", "omv_root", "arf_root", "make_target_encoded", "-", "almost new car", "coe car", "consignment car", "direct owner sale", "electric cars", "hybrid cars", "imported used vehicle", "low mileage car", "opc car", "parf car", "premium ad car", "rare & exotic", "sgcarmart warranty cars", "sta evaluated car", "vintage cars", "type_of_vehicle_bus/mini bus", "type_of_vehicle_hatchback", "type_of_vehicle_luxury sedan", "type_of_vehicle_mid-sized sedan", "type_of_vehicle_mpv", "type_of_vehicle_others", "type_of_vehicle_sports car", "type_of_vehicle_stationwagon", "type_of_vehicle_suv", "type_of_vehicle_truck", "type_of_vehicle_van", "fuel_type_diesel", "fuel_type_diesel-electric", "fuel_type_electric", "fuel_type_petrol", "fuel_type_petrol-electric", "fuel_type_nan", "transmission_manual", "year", "month"]

## Baseline

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class MLPModel(nn.Module):
    def __init__(self, input_dim):
        super(MLPModel, self).__init__()
        self.layer1 = nn.Linear(input_dim, 128)
        self.layer2 = nn.Linear(128, 64)
        self.layer3 = nn.Linear(64, 32)
        self.output_layer = nn.Linear(32, 1)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        x = self.relu(self.layer1(x))
        x = self.relu(self.layer2(x))
        x = self.relu(self.layer3(x))
        x = self.output_layer(x)
        return x

In [4]:
def train_network(model, cat_nu_cols, X_train, y_train, X_valid, y_valid):
    # 损失函数和优化器
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # 数据转换为torch.Tensor
    X_train_tensor = torch.tensor(X_train[cat_nu_cols].values, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    X_valid_tensor = torch.tensor(X_valid[cat_nu_cols].values, dtype=torch.float32)
    y_valid_tensor = torch.tensor(y_valid.values, dtype=torch.float32).view(-1, 1)

    epochs = 50
    for epoch in range(epochs):
        model.train()
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

        print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')

        model.eval()
    
        # 关闭梯度计算
        with torch.no_grad():
            y_test_pred_mlp = model(X_valid_tensor)
    
        mse_test_mlp = mean_squared_error(y_valid_tensor.numpy(), y_test_pred_mlp.numpy())
        rmse_test_mlp = np.sqrt(mse_test_mlp)
        r2_test_mlp = r2_score(y_valid_tensor.numpy(), y_test_pred_mlp.numpy())
    
        #print(f'Network Valid Mean Squared Error: {mse_test_mlp}')
        print(f'Network Valid Root Mean Squared Error: {rmse_test_mlp}')
        #print(f'Network Valid R² Score: {r2_test_mlp}')
        
input_dim = len(cat_nu_cols)
print(input_dim)
model = MLPModel(input_dim)
train_network(model, cat_nu_cols, X_train, y_train, X_valid, y_valid)

49


Epoch 1/50, Loss: 634415744.0
Network Valid Root Mean Squared Error: 48356.27734375
Epoch 2/50, Loss: 42138972160.0
Network Valid Root Mean Squared Error: 53370.203125
Epoch 3/50, Loss: 788850560.0
Network Valid Root Mean Squared Error: 45003.44921875
Epoch 4/50, Loss: 641270528.0
Network Valid Root Mean Squared Error: 41428.8046875
Epoch 5/50, Loss: 6707839488.0
Network Valid Root Mean Squared Error: 53092.97265625
Epoch 6/50, Loss: 1888291584.0
Network Valid Root Mean Squared Error: 42499.9453125
Epoch 7/50, Loss: 997212736.0
Network Valid Root Mean Squared Error: 36083.43359375
Epoch 8/50, Loss: 349108288.0
Network Valid Root Mean Squared Error: 44228.4140625
Epoch 9/50, Loss: 303337152.0
Network Valid Root Mean Squared Error: 36910.09765625
Epoch 10/50, Loss: 254180272.0
Network Valid Root Mean Squared Error: 41045.546875
Epoch 11/50, Loss: 119595704.0
Network Valid Root Mean Squared Error: 31311.439453125
Epoch 12/50, Loss: 617467072.0
Network Valid Root Mean Squared Error: 31832.

## Other NN Models

## Prediction on test set

In [21]:
# 选择表现最好的模型进行最终训练和预测
best_model = MLPModel(input_dim)

print("使用全量数据训练最终模型...")
# 转换数据为tensor
X_train_full_tensor = torch.tensor(X_train_full[cat_nu_cols].values, dtype=torch.float32)
y_train_full_tensor = torch.tensor(y_train_full.values, dtype=torch.float32).view(-1, 1)
train_dataset = TensorDataset(X_train_full_tensor, y_train_full_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# 训练模型
criterion = nn.MSELoss()
optimizer = optim.Adam(best_model.parameters(), lr=0.001)

epochs = 50
for epoch in range(epochs):
    best_model.train()
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = best_model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')

print("生成测试集预测结果...")
best_model.eval()
X_test_tensor = torch.tensor(X_test[cat_nu_cols].values, dtype=torch.float32)
with torch.no_grad():
    test_predictions = best_model(X_test_tensor).numpy()

# 创建预测结果DataFrame
predictions_df = pd.DataFrame({
    'Id': range(len(test_predictions)),
    'Predicted': test_predictions.flatten()
})

# 保存预测结果
predictions_df.to_csv('data/predictions.csv', index=False)
print("预测结果已保存到 data/predictions.csv")

使用全量数据训练最终模型...
生成测试集预测结果...
预测结果已保存到 data/predictions.csv
