In [15]:
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import json
import math

# Load the preprocessed feature matrices and targets from data/processed/.
# The train/valid splits and the "*_full" training set are stored as separate
# CSV files; X_test has no matching target file loaded here.
# NOTE(review): the printed shapes suggest *_full is train + valid
# (20000 + 5000 rows) — confirm against the preprocessing notebook.
X_train = pd.read_csv('data/processed/X_train.csv')
y_train = pd.read_csv('data/processed/y_train.csv')
X_valid = pd.read_csv('data/processed/X_valid.csv')
y_valid = pd.read_csv('data/processed/y_valid.csv') 
X_train_full = pd.read_csv('data/processed/X_train_full.csv') 
y_train_full = pd.read_csv('data/processed/y_train_full.csv') 
X_test = pd.read_csv('data/processed/X_test.csv')

# Report the shape of every loaded feature matrix as a quick sanity check.
print('数据集读取完成')
print(f'训练集大小: {X_train.shape}')
print(f'验证集大小: {X_valid.shape}')
print(f'全量训练集大小: {X_train_full.shape}')
print(f'测试集大小: {X_test.shape}')


数据集读取完成
训练集大小: (20000, 74)
验证集大小: (5000, 74)
全量训练集大小: (25000, 74)
测试集大小: (10000, 74)


In [16]:
# Column-name configuration, defined inline.
# NOTE(review): none of the lists in this group (del_cols ... categorical_cols)
# is referenced again in the visible part of this notebook; only the cat_*_cols
# lists defined further down are used for column selection.
del_cols = ['listing_id', 'original_reg_date', 'opc_scheme', 'lifespan', 'eco_category', 'indicative_price']
text_cols = ['title', 'description', 'features', 'accessories']
date_cols = ['reg_date']
# Untransformed numeric columns, including the text_*_score features.
numeric_cols = ['manufactured', 'curb_weight', 'power', 'engine_cap', 'depreciation', 'coe', 'road_tax', 
                'dereg_value', 'mileage', 'omv', 'arf', 'year', 'month',
                'text_brand_popularity_score', 'text_model_value_score', 'text_condition_score',
                'text_feature_rarity_score', 'text_performance_score', 'text_sentiment_score']
# Same numeric columns with the "*_log" variants substituted where they are listed.
log_cols = ['manufactured', 'curb_weight', 'power_log', 'engine_cap_log', 'depreciation_log', 'coe', 
            'road_tax_log', 'dereg_value_log', 'mileage_log', 'omv_log', 'arf_log', 'year', 'month']
# Same numeric columns with the "*_root" variants substituted where they are listed.
root_cols = ['manufactured', 'curb_weight', 'power_root', 'engine_cap_root', 'depreciation_root', 'coe', 
             'road_tax_root', 'dereg_value_root', 'mileage_root', 'omv_root', 'arf_root', 'year', 'month']
categorical_cols = ['make', 'model', 'type_of_vehicle', 'category', 'transmission', 'fuel_type', 'no_of_owners']

# Updated feature lists that also include the GPT features
# (presumably the text_*_score columns — confirm with the feature-extraction step).
# cat_nu_cols: untransformed numeric columns plus the encoded/indicator columns.
# This is the list used below to select the model inputs.
cat_nu_cols = [
    "manufactured", "curb_weight", "power", "engine_cap", "no_of_owners", "depreciation", 
    "coe", "road_tax", "dereg_value", "mileage", "omv", "arf", "make_target_encoded",
    "text_brand_popularity_score", "text_model_value_score", "text_condition_score",
    "text_feature_rarity_score", "text_performance_score", "text_sentiment_score",
    "-", "almost new car", "coe car", "consignment car", "direct owner sale", 
    "electric cars", "hybrid cars", "imported used vehicle", "low mileage car", 
    "opc car", "parf car", "premium ad car", "rare & exotic", "sgcarmart warranty cars", 
    "sta evaluated car", "vintage cars", "type_of_vehicle_bus/mini bus", 
    "type_of_vehicle_hatchback", "type_of_vehicle_luxury sedan", 
    "type_of_vehicle_mid-sized sedan", "type_of_vehicle_mpv", "type_of_vehicle_others", 
    "type_of_vehicle_sports car", "type_of_vehicle_stationwagon", "type_of_vehicle_suv", 
    "type_of_vehicle_truck", "type_of_vehicle_van", "fuel_type_diesel", 
    "fuel_type_diesel-electric", "fuel_type_electric", "fuel_type_petrol", 
    "fuel_type_petrol-electric", "fuel_type_nan", "transmission_manual", "year", "month"
]

# cat_log_cols: variant of the list above using the "*_log" columns.
# NOTE(review): unlike cat_nu_cols it does not list "no_of_owners", and it is
# not referenced again in the visible part of this notebook.
cat_log_cols = [
    "manufactured", "curb_weight", "power_log", "engine_cap_log", "depreciation_log", 
    "coe", "road_tax_log", "dereg_value_log", "mileage_log", "omv_log", "arf_log", 
    "make_target_encoded", "text_brand_popularity_score", "text_model_value_score", 
    "text_condition_score", "text_feature_rarity_score", "text_performance_score", 
    "text_sentiment_score", "-", "almost new car", "coe car", "consignment car", 
    "direct owner sale", "electric cars", "hybrid cars", "imported used vehicle", 
    "low mileage car", "opc car", "parf car", "premium ad car", "rare & exotic", 
    "sgcarmart warranty cars", "sta evaluated car", "vintage cars", 
    "type_of_vehicle_bus/mini bus", "type_of_vehicle_hatchback", 
    "type_of_vehicle_luxury sedan", "type_of_vehicle_mid-sized sedan", 
    "type_of_vehicle_mpv", "type_of_vehicle_others", "type_of_vehicle_sports car", 
    "type_of_vehicle_stationwagon", "type_of_vehicle_suv", "type_of_vehicle_truck", 
    "type_of_vehicle_van", "fuel_type_diesel", "fuel_type_diesel-electric", 
    "fuel_type_electric", "fuel_type_petrol", "fuel_type_petrol-electric", 
    "fuel_type_nan", "transmission_manual", "year", "month"
]

# cat_root_cols: variant using the "*_root" columns (same remarks as cat_log_cols).
cat_root_cols = [
    "manufactured", "curb_weight", "power_root", "engine_cap_root", "depreciation_root", 
    "coe", "road_tax_root", "dereg_value_root", "mileage_root", "omv_root", "arf_root", 
    "make_target_encoded", "text_brand_popularity_score", "text_model_value_score", 
    "text_condition_score", "text_feature_rarity_score", "text_performance_score", 
    "text_sentiment_score", "-", "almost new car", "coe car", "consignment car", 
    "direct owner sale", "electric cars", "hybrid cars", "imported used vehicle", 
    "low mileage car", "opc car", "parf car", "premium ad car", "rare & exotic", 
    "sgcarmart warranty cars", "sta evaluated car", "vintage cars", 
    "type_of_vehicle_bus/mini bus", "type_of_vehicle_hatchback", 
    "type_of_vehicle_luxury sedan", "type_of_vehicle_mid-sized sedan", 
    "type_of_vehicle_mpv", "type_of_vehicle_others", "type_of_vehicle_sports car", 
    "type_of_vehicle_stationwagon", "type_of_vehicle_suv", "type_of_vehicle_truck", 
    "type_of_vehicle_van", "fuel_type_diesel", "fuel_type_diesel-electric", 
    "fuel_type_electric", "fuel_type_petrol", "fuel_type_petrol-electric", 
    "fuel_type_nan", "transmission_manual", "year", "month"
]

# Discard the log- and root-transformed columns: keep only the columns listed
# in cat_nu_cols, in that order, for every split.
# Note: this rebinds X_train / X_valid / X_test / X_train_full, so the original
# wider frames are no longer reachable without re-running the loading cell.
X_train = X_train[cat_nu_cols]
X_valid = X_valid[cat_nu_cols]
X_test = X_test[cat_nu_cols]
X_train_full = X_train_full[cat_nu_cols]

# Shapes after column selection (the column count equals len(cat_nu_cols)).
print(f'训练集: {X_train.shape}')
print(f'全量集: {X_train_full.shape}')
print(f'测试集: {X_test.shape}')

训练集: (20000, 55)
全量集: (25000, 55)
测试集: (10000, 55)


In [17]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from umap.umap_ import UMAP  # 正确的导入方式

# Load the precomputed BERT vectors, one .npy file per split.
bert_train_vectors = np.load('data/processed/train_vectors.npy')
bert_valid_vectors = np.load('data/processed/valid_vectors.npy')
bert_train_full_vectors = np.load('data/processed/train_full_vectors.npy')
bert_test_vectors = np.load('data/processed/test_vectors.npy')

# Dimensionality reduction for the BERT vectors: standardize, then UMAP.
scaler = StandardScaler()

# UMAP hyper-parameters (active configuration: 8 output components).
# NOTE(review): models below slice the last 8 input columns as the "BERT" part,
# so changing n_components here requires matching bert_dim in those models.
umap = UMAP(
    n_components=8,
    n_neighbors=20,
    min_dist=0.1,
    metric='cosine',
    random_state=42
)

# Alternative configurations that were tried and left disabled:
# umap = UMAP(
#     n_components=16,
#     n_neighbors=30,
#     min_dist=0.3,
#     metric='cosine',
#     random_state=42
# )

# umap = UMAP(
#     n_components=24,
#     n_neighbors=50,
#     min_dist=0.5,
#     metric='cosine',
#     random_state=42
# )

# Fit the scaler and UMAP on the training split only, then reduce it.
bert_train_scaled = scaler.fit_transform(bert_train_vectors)
bert_train_umap = umap.fit_transform(bert_train_scaled)

# Apply the same fitted transforms to the validation and test splits.
bert_valid_scaled = scaler.transform(bert_valid_vectors)
bert_valid_umap = umap.transform(bert_valid_scaled)

bert_test_scaled = scaler.transform(bert_test_vectors)
bert_test_umap = umap.transform(bert_test_scaled)

# Transform the full training set with the transforms fitted on the train split.
bert_train_full_scaled = scaler.transform(bert_train_full_vectors)
bert_train_full_umap = umap.transform(bert_train_full_scaled)

# Concatenate the tabular features with the UMAP components (UMAP columns last).
# NOTE(review): this indexes X_*[cat_nu_cols], so the X_* names must still be
# DataFrames here; a later cell rebinds them to NumPy arrays, after which
# re-running this cell fails.
X_train_combined = np.hstack((X_train[cat_nu_cols].values, bert_train_umap))
X_valid_combined = np.hstack((X_valid[cat_nu_cols].values, bert_valid_umap))
X_test_combined = np.hstack((X_test[cat_nu_cols].values, bert_test_umap))
X_train_full_combined = np.hstack((X_train_full[cat_nu_cols].values, bert_train_full_umap))


# Print dimensionality information.
print("特征维度:")
print(f"原始特征: {X_train[cat_nu_cols].shape[1]}")
print(f"UMAP特征: {bert_train_umap.shape[1]}")
print(f"组合特征: {X_train_combined.shape[1]}")

  warn(


特征维度:
原始特征: 55
UMAP特征: 8
组合特征: 63


In [18]:
# Show the shapes of the combined matrices, then point the working names at
# the combined NumPy arrays so every model below trains on tabular + UMAP features.
print(X_train_full_combined.shape)
print(X_test_combined.shape)
X_train, X_valid, X_test, X_train_full = (
    X_train_combined,
    X_valid_combined,
    X_test_combined,
    X_train_full_combined,
)

(25000, 63)
(10000, 63)


## Baseline

In [19]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

class MLPModel(nn.Module):
    """Baseline fully connected regressor: input_dim -> 128 -> 64 -> 32 -> 1.

    ReLU follows each of the three hidden layers; the output layer is linear.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Attribute names double as state_dict keys, so they are kept unchanged.
        self.layer1 = nn.Linear(input_dim, 128)
        self.layer2 = nn.Linear(128, 64)
        self.layer3 = nn.Linear(64, 32)
        self.output_layer = nn.Linear(32, 1)
        self.relu = nn.ReLU()

        # Re-draw every Linear weight with He-normal initialisation; biases
        # keep whatever nn.Linear assigned by default.
        linear_layers = [m for m in self.modules() if isinstance(m, nn.Linear)]
        for linear in linear_layers:
            nn.init.kaiming_normal_(linear.weight, mode='fan_in', nonlinearity='relu')

    def forward(self, x):
        """Return predictions with a trailing dimension of size 1."""
        hidden = x
        for layer in (self.layer1, self.layer2, self.layer3):
            hidden = self.relu(layer(hidden))
        return self.output_layer(hidden)

In [20]:

def train_network(model, cat_nu_cols, X_train, y_train, X_valid, y_valid, lr=0.001, wd=0,
                  epochs=50, patience=10, batch_size=32):
    """Train ``model`` with Adam on MSE loss and print the best validation RMSE.

    After every epoch the whole validation set is scored in one forward pass.
    Training stops early once the validation RMSE has failed to improve for
    ``patience`` consecutive epochs. The model is trained in place and is left
    with the weights of the last epoch that ran.

    Args:
        model: ``nn.Module`` mapping a feature batch to one value per row.
        cat_nu_cols: Unused; kept so existing call sites keep working.
        X_train, X_valid: Feature matrices (NumPy array or DataFrame).
        y_train, y_valid: Targets (DataFrame, Series or array); reshaped to one column.
        lr: Adam learning rate.
        wd: Adam weight decay.
        epochs: Maximum number of epochs.
        patience: Non-improving epochs tolerated before stopping early.
        batch_size: Mini-batch size for the shuffled training loader.

    Returns:
        None. The best validation RMSE is printed.
    """
    def _as_float_tensor(data):
        # np.asarray accepts both DataFrames and arrays, so the function no
        # longer depends on which form the caller currently holds.
        return torch.tensor(np.asarray(data), dtype=torch.float32)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

    X_train_tensor = _as_float_tensor(X_train)
    y_train_tensor = _as_float_tensor(y_train).reshape(-1, 1)
    train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor),
                              batch_size=batch_size, shuffle=True)
    X_valid_tensor = _as_float_tensor(X_valid)
    y_valid_tensor = _as_float_tensor(y_valid).reshape(-1, 1)

    best_rmse = float('inf')
    patience_counter = 0

    for epoch in range(epochs):
        model.train()
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            loss = criterion(model(inputs), targets)
            loss.backward()
            optimizer.step()

        model.eval()
        with torch.no_grad():
            # Force a column shape so the subtraction can never broadcast
            # (N,) against (N, 1) into an (N, N) matrix.
            valid_pred = model(X_valid_tensor).reshape(-1, 1)
            rmse_valid = torch.sqrt(torch.mean((valid_pred - y_valid_tensor) ** 2)).item()

        # Track the best validation RMSE and apply early stopping.
        if rmse_valid < best_rmse:
            best_rmse = rmse_valid
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered.")
                break

    # Report the best RMSE once training has finished.
    print(f'Best Valid RMSE: {best_rmse:.4f}')

# Number of columns in the combined (tabular + UMAP) feature matrix; passed as
# the input size to every model constructed below.
input_dim = X_train_combined.shape[1]
print(input_dim)

# Disabled alternative: input size when only the tabular columns are used.
# input_dim = len(cat_nu_cols)
# print(input_dim)
        


63


In [21]:

# Learning-rate sweep for the baseline MLP: a freshly initialised model is
# trained for each setting. lr=0.1 was already disabled in this sweep and
# stays out of the list.
for sweep_lr in (0.01, 0.001, 0.0001, 0.00001):
    model = MLPModel(input_dim)
    train_network(model, cat_nu_cols, X_train, y_train, X_valid, y_valid, sweep_lr)

Best Valid RMSE: 29849.7988
Early stopping triggered.
Best Valid RMSE: 24896.9609
Best Valid RMSE: 29919.1641
Best Valid RMSE: 42777.7188


In [22]:
# model = MLPModel(input_dim)
# train_network(model, cat_nu_cols, X_train, y_train, X_valid, y_valid,0.001,0)
# model = MLPModel(input_dim)
# train_network(model, cat_nu_cols, X_train, y_train, X_valid, y_valid,0.001,0.0001)
# model = MLPModel(input_dim)
# train_network(model, cat_nu_cols, X_train, y_train, X_valid, y_valid,0.001,0.001)
# model = MLPModel(input_dim)
# train_network(model, cat_nu_cols, X_train, y_train, X_valid, y_valid,0.001,0.01)
# model = MLPModel(input_dim)
# train_network(model, cat_nu_cols, X_train, y_train, X_valid, y_valid,0.001,0.1)

In [23]:
class MLPWithBertModel(nn.Module):
    """MLP regressor that first expands the trailing embedding columns of the input.

    The last ``bert_dim`` columns of each row are passed through
    ``Linear(bert_dim, reduced_dim) + ReLU``, concatenated back after the
    remaining columns, and the result goes through 128 -> 64 -> 32 -> 1.

    Args:
        input_dim: Total number of input columns (other features + embedding).
        bert_dim: Number of trailing columns treated as the embedding slice.
        reduced_dim: Output width of the embedding sub-network.
    """

    def __init__(self, input_dim, bert_dim=8, reduced_dim=16):
        super(MLPWithBertModel, self).__init__()
        # Remember the slice width so forward() splits the input with the same
        # value the layers were sized with (previously forward hard-coded 8).
        self.bert_dim = bert_dim

        # Sub-network applied to the embedding slice only.
        self.bert_processor = nn.Sequential(
            nn.Linear(bert_dim, reduced_dim),
            nn.ReLU()
        )

        # Width after swapping the raw embedding slice for its processed form.
        new_input_dim = input_dim - bert_dim + reduced_dim

        self.layer1 = nn.Linear(new_input_dim, 128)
        self.layer2 = nn.Linear(128, 64)
        self.layer3 = nn.Linear(64, 32)
        self.output_layer = nn.Linear(32, 1)
        self.relu = nn.ReLU()

        # He-normal initialisation for every Linear weight (biases untouched).
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='relu')

    def forward(self, x):
        """Map a 2-D ``(rows, input_dim)`` tensor to ``(rows, 1)`` predictions."""
        # Split by position: everything before split_at is "other features",
        # the final bert_dim columns are the embedding slice.
        split_at = x.shape[1] - self.bert_dim
        other_features = x[:, :split_at]
        bert_vector = x[:, split_at:]

        # Expand the embedding slice, then re-join it after the other features.
        processed_bert = self.bert_processor(bert_vector)
        x = torch.cat((other_features, processed_bert), dim=1)

        # Shared MLP trunk.
        x = self.relu(self.layer1(x))
        x = self.relu(self.layer2(x))
        x = self.relu(self.layer3(x))
        x = self.output_layer(x)
        return x


In [14]:
# Learning-rate sweep for MLPWithBertModel: one freshly initialised model per
# setting, all other training options left at train_network's defaults.
for sweep_lr in (0.01, 0.001, 0.0001, 0.00001):
    model = MLPWithBertModel(input_dim)
    train_network(model, cat_nu_cols, X_train, y_train, X_valid, y_valid, sweep_lr)

Early stopping triggered.
Best Valid RMSE: 30926.9043
Best Valid RMSE: 24532.4688
Best Valid RMSE: 28926.2031
Best Valid RMSE: 43807.6602


## Other NN Models

In [24]:
# Baseline MLP with a few (lr[, weight_decay]) combinations. Each tuple is
# unpacked as the trailing positional arguments of train_network.
for hparams in ((0.0001,), (0.001, 0.01), (0.0001, 0.01)):
    model = MLPModel(input_dim)
    train_network(model, cat_nu_cols, X_train, y_train, X_valid, y_valid, *hparams)

Best Valid RMSE: 28641.8184
Early stopping triggered.
Best Valid RMSE: 25722.6543


KeyboardInterrupt: 

In [None]:
class MLPDropoutModel(nn.Module):
    """MLP (input_dim -> 128 -> 64 -> 32 -> 1) with dropout after each hidden ReLU.

    A single ``nn.Dropout(dropout_rate)`` module is reused after all three
    hidden activations. Layers keep nn.Linear's default initialisation.
    """

    def __init__(self, input_dim, dropout_rate=0.5):
        super().__init__()
        self.layer1 = nn.Linear(input_dim, 128)
        self.layer2 = nn.Linear(128, 64)
        self.layer3 = nn.Linear(64, 32)
        self.output_layer = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Return predictions with a trailing dimension of size 1."""
        hidden = x
        for layer in (self.layer1, self.layer2, self.layer3):
            # Linear -> ReLU -> Dropout for every hidden stage.
            hidden = self.dropout(self.relu(layer(hidden)))
        return self.output_layer(hidden)

In [None]:
# MLPDropoutModel with its default dropout rate, trained under four settings:
# train_network defaults, lr only, and two (lr, weight_decay) pairs.
for hparams in ((), (0.0001,), (0.001, 0.01), (0.0001, 0.01)):
    model = MLPDropoutModel(input_dim)
    train_network(model, cat_nu_cols, X_train, y_train, X_valid, y_valid, *hparams)

In [None]:
# Same four training settings as the previous dropout sweep, but with the
# dropout rate lowered to 0.1.
for hparams in ((), (0.0001,), (0.001, 0.01), (0.0001, 0.01)):
    model = MLPDropoutModel(input_dim, 0.1)
    train_network(model, cat_nu_cols, X_train, y_train, X_valid, y_valid, *hparams)

In [None]:
# Weight-decay sweep at a fixed learning rate of 0.001 for the dropout=0.1 model.
for weight_decay in (0, 0.0001, 0.001, 0.01, 0.1):
    model = MLPDropoutModel(input_dim, 0.1)
    train_network(model, cat_nu_cols, X_train, y_train, X_valid, y_valid, 0.001, weight_decay)

In [28]:
class DeeperMLPModel(nn.Module):
    """Seven-hidden-layer MLP: input_dim -> 256 -> 128 -> 128 -> 64 -> 64 -> 32 -> 16 -> 1.

    Every hidden layer is followed by ReLU; the output layer is linear.
    Weights use He-normal initialisation and biases start at 0.01.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Hidden stack, widest first.
        self.layer1 = nn.Linear(input_dim, 256)
        self.layer2 = nn.Linear(256, 128)
        self.layer3 = nn.Linear(128, 128)
        self.layer4 = nn.Linear(128, 64)
        self.layer5 = nn.Linear(64, 64)
        self.layer6 = nn.Linear(64, 32)
        self.layer7 = nn.Linear(32, 16)
        self.output_layer = nn.Linear(16, 1)

        self.relu = nn.ReLU()

        self._initialize_weights()

    def forward(self, x):
        """Return predictions with a trailing dimension of size 1."""
        hidden_stack = (
            self.layer1, self.layer2, self.layer3, self.layer4,
            self.layer5, self.layer6, self.layer7,
        )
        for layer in hidden_stack:
            x = self.relu(layer(x))
        return self.output_layer(x)

    def _initialize_weights(self):
        """He-normal weights and constant 0.01 biases for every Linear layer."""
        for module in self.modules():
            if not isinstance(module, nn.Linear):
                continue
            nn.init.kaiming_normal_(module.weight, mode='fan_in', nonlinearity='relu')
            if module.bias is not None:
                nn.init.constant_(module.bias, 0.01)



In [None]:
# DeeperMLPModel under two learning rates and two (lr, weight_decay) pairs;
# a fresh model is built for each run.
for hparams in ((0.00001,), (0.0001,), (0.001, 0.01), (0.0001, 0.01)):
    model = DeeperMLPModel(input_dim)
    train_network(model, cat_nu_cols, X_train, y_train, X_valid, y_valid, *hparams)

In [None]:
class DeeperMLPWithBertModel(nn.Module):
    """Deep MLP regressor that first expands the trailing embedding columns.

    The last ``bert_dim`` columns of each row go through
    ``Linear(bert_dim, reduced_bert_dim) + ReLU``, are concatenated back after
    the remaining columns, and the result passes through
    256 -> 128 -> 128 -> 64 -> 64 -> 32 -> 16 -> 1 with ReLU between layers.

    Args:
        input_dim: Total number of input columns (other features + embedding).
        bert_dim: Number of trailing columns treated as the embedding slice.
        reduced_bert_dim: Output width of the embedding sub-network.
    """

    def __init__(self, input_dim, bert_dim=8, reduced_bert_dim=128):
        super(DeeperMLPWithBertModel, self).__init__()
        # Remember the slice width so forward() splits the input with the same
        # value the layers were sized with (previously forward hard-coded 8).
        self.bert_dim = bert_dim

        # Sub-network applied to the embedding slice only.
        self.bert_processor = nn.Sequential(
            nn.Linear(bert_dim, reduced_bert_dim),
            nn.ReLU()
        )

        # Width after swapping the raw embedding slice for its processed form.
        new_input_dim = input_dim - bert_dim + reduced_bert_dim

        # Deep trunk.
        self.layer1 = nn.Linear(new_input_dim, 256)
        self.layer2 = nn.Linear(256, 128)
        self.layer3 = nn.Linear(128, 128)
        self.layer4 = nn.Linear(128, 64)
        self.layer5 = nn.Linear(64, 64)
        self.layer6 = nn.Linear(64, 32)
        self.layer7 = nn.Linear(32, 16)
        self.output_layer = nn.Linear(16, 1)

        self.relu = nn.ReLU()

        self._initialize_weights()

    def forward(self, x):
        """Map a 2-D ``(rows, input_dim)`` tensor to ``(rows, 1)`` predictions."""
        # Split by position: the final bert_dim columns are the embedding slice.
        split_at = x.shape[1] - self.bert_dim
        other_features = x[:, :split_at]
        bert_vector = x[:, split_at:]

        # Expand the embedding slice, then re-join it after the other features.
        processed_bert = self.bert_processor(bert_vector)
        x = torch.cat((other_features, processed_bert), dim=1)

        # Sequentially process through all trunk layers.
        x = self.relu(self.layer1(x))
        x = self.relu(self.layer2(x))
        x = self.relu(self.layer3(x))
        x = self.relu(self.layer4(x))
        x = self.relu(self.layer5(x))
        x = self.relu(self.layer6(x))
        x = self.relu(self.layer7(x))
        x = self.output_layer(x)
        return x

    def _initialize_weights(self):
        """He-normal weights and constant 0.01 biases for every Linear layer."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='relu')
                if m.bias is not None:
                    m.bias.data.fill_(0.01)

In [None]:
# DeeperMLPWithBertModel under two learning rates and two (lr, weight_decay)
# pairs; a fresh model is built for each run.
for hparams in ((0.00001,), (0.0001,), (0.001, 0.01), (0.0001, 0.01)):
    model = DeeperMLPWithBertModel(input_dim)
    train_network(model, cat_nu_cols, X_train, y_train, X_valid, y_valid, *hparams)

In [None]:
class DeepResidualMLP(nn.Module):
    """256-wide MLP with four residual blocks and a linear output head.

    ``layer1`` projects the input to 256 features; each of ``layer2``..``layer5``
    contributes ``relu(layer(h)) + h``; ``output_layer`` maps to a single value.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.layer1 = nn.Linear(input_dim, 256)
        self.layer2 = nn.Linear(256, 256)
        self.layer3 = nn.Linear(256, 256)
        self.layer4 = nn.Linear(256, 256)
        self.layer5 = nn.Linear(256, 256)
        self.output_layer = nn.Linear(256, 1)

        self.relu = nn.ReLU()

        # He-normal initialisation for every Linear weight (biases untouched).
        for module in self.modules():
            if not isinstance(module, nn.Linear):
                continue
            nn.init.kaiming_normal_(module.weight, mode='fan_in', nonlinearity='relu')

    def forward(self, x):
        """Return predictions with a trailing dimension of size 1."""
        x = self.relu(self.layer1(x))

        # Each residual block builds a new tensor (no in-place add), which
        # keeps the ReLU outputs intact for the backward pass.
        for block in (self.layer2, self.layer3, self.layer4, self.layer5):
            x = self.relu(block(x)) + x

        return self.output_layer(x)




In [None]:

# DeepResidualMLP under four settings: train_network defaults, lr only, and
# two (lr, weight_decay) pairs with weight decay 0.1.
for hparams in ((), (0.0001,), (0.001, 0.1), (0.0001, 0.1)):
    model = DeepResidualMLP(input_dim)
    train_network(model, cat_nu_cols, X_train, y_train, X_valid, y_valid, *hparams)

In [None]:
class DeepResidualMLP(nn.Module):
    """256-wide MLP with four residual blocks and a linear output head.

    ``layer1`` projects the input to 256 features; each of ``layer2``..``layer5``
    contributes ``relu(layer(h)) + h``; ``output_layer`` maps to a single value.

    NOTE(review): this notebook defines ``DeepResidualMLP`` twice; once this
    cell has run, this definition is the one bound to the name.
    """

    def __init__(self, input_dim):
        super(DeepResidualMLP, self).__init__()
        self.layer1 = nn.Linear(input_dim, 256)
        self.layer2 = nn.Linear(256, 256)
        self.layer3 = nn.Linear(256, 256)
        self.layer4 = nn.Linear(256, 256)
        self.layer5 = nn.Linear(256, 256)
        self.output_layer = nn.Linear(256, 1)

        # ReLU activation
        self.relu = nn.ReLU()

        # He-normal initialisation for every Linear weight (biases untouched).
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='relu')

    def forward(self, x):
        """Return predictions with a trailing dimension of size 1."""
        # Input projection
        x = self.relu(self.layer1(x))

        # The skip connections must be out-of-place. ReLU's backward pass
        # reads its own output, so `out += x` on a ReLU result makes
        # loss.backward() fail with "modified by an inplace operation".

        # Residual block 1
        out = self.relu(self.layer2(x)) + x

        # Residual block 2
        x = self.relu(self.layer3(out)) + out

        # Residual block 3
        out = self.relu(self.layer4(x)) + x

        # Residual block 4
        x = self.relu(self.layer5(out)) + out

        # Output head
        x = self.output_layer(x)
        return x

# Train the DeepResidualMLP defined in this cell once, using train_network's
# default learning rate and weight decay.
model = DeepResidualMLP(input_dim)
train_network(model, cat_nu_cols, X_train, y_train, X_valid, y_valid)

In [None]:
# DeepResidualMLP under four settings: train_network defaults, lr only, and
# two (lr, weight_decay) pairs with weight decay 0.01.
for hparams in ((), (0.0001,), (0.001, 0.01), (0.0001, 0.01)):
    model = DeepResidualMLP(input_dim)
    train_network(model, cat_nu_cols, X_train, y_train, X_valid, y_valid, *hparams)

In [None]:
class Attention(nn.Module):
    """Feature-wise attention over a single vector per row.

    For an input ``x`` of shape ``(rows, feature_dim)`` the layer computes
    per-feature scores ``query(x) * key(x)``, normalises them with a softmax
    across the feature axis, and returns ``weights * x`` with the same shape.

    The earlier formulation reduced query and key to one scalar per row and
    applied softmax over that length-1 axis. The weight was therefore always
    1.0: the layer returned its input unchanged, and ``key_layer`` /
    ``query_layer`` never received a gradient.

    NOTE(review): softmax weights sum to 1 across features, so outputs are
    scaled down relative to ``x``; revisit if a gate (e.g. sigmoid) is preferred.
    """

    def __init__(self, feature_dim):
        super(Attention, self).__init__()
        self.feature_dim = feature_dim

        self.key_layer = nn.Linear(feature_dim, feature_dim, bias=False)
        self.query_layer = nn.Linear(feature_dim, feature_dim, bias=False)

    def forward(self, x):
        """Re-weight the features of ``x``; output shape equals input shape."""
        query = self.query_layer(x)
        key = self.key_layer(x)

        # One score per feature; softmax runs over the feature axis so the
        # weights actually depend on the learned projections.
        scores = query * key
        weights = F.softmax(scores, dim=-1)

        return weights * x





class DeepResidualMLPWithAttention(nn.Module):
    """Residual 256-wide MLP whose final features pass through ``Attention``.

    ``layer1`` projects the input to 256 features, ``layer2``..``layer5`` each
    add ``relu(layer(h)) + h``, the result goes through ``self.attention`` and
    then through the linear ``output_layer``.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.layer1 = nn.Linear(input_dim, 256)
        self.layer2 = nn.Linear(256, 256)
        self.layer3 = nn.Linear(256, 256)
        self.layer4 = nn.Linear(256, 256)
        self.layer5 = nn.Linear(256, 256)
        self.attention = Attention(256)  # applied after the last residual block
        self.output_layer = nn.Linear(256, 1)

        self.relu = nn.ReLU()

        # He-normal initialisation for every Linear weight found under this
        # module, which includes the Linear layers inside self.attention.
        for module in self.modules():
            if not isinstance(module, nn.Linear):
                continue
            nn.init.kaiming_normal_(module.weight, mode='fan_in', nonlinearity='relu')

    def forward(self, x):
        """Return predictions with a trailing dimension of size 1."""
        x = self.relu(self.layer1(x))

        # Out-of-place residual additions, one per 256 -> 256 layer.
        for block in (self.layer2, self.layer3, self.layer4, self.layer5):
            x = self.relu(block(x)) + x

        return self.output_layer(self.attention(x))



In [None]:
# DeepResidualMLPWithAttention under four settings: train_network defaults,
# lr only, and two (lr, weight_decay) pairs.
for hparams in ((), (0.0001,), (0.001, 0.01), (0.0001, 0.01)):
    model = DeepResidualMLPWithAttention(input_dim)
    train_network(model, cat_nu_cols, X_train, y_train, X_valid, y_valid, *hparams)

## Prediction on test set

In [25]:
def train_network_and_get_result(model, cat_nu_cols, X_train, y_train, X_valid, y_valid, X_test, lr=0.001, wd=0,
                                 epochs=50, patience=10, batch_size=32):
    """Train ``model``, then write validation and test predictions of the best epoch.

    Training uses Adam on MSE loss with early stopping on validation RMSE.
    Whenever the validation RMSE improves, the weights are saved to
    ``best_model.pth`` and snapshotted in memory. After training, the snapshot
    is loaded back into ``model`` so the CSV predictions come from the best
    epoch rather than from whichever epoch happened to run last.

    Files written (relative to the working directory; ``data/`` must exist):
        best_model.pth      state_dict of the best epoch
        data/nn_valid.csv   columns ``Id``, ``Predicted`` for the validation set
        data/nn_test.csv    columns ``Id``, ``Predicted`` for the test set
                            (only when at least one epoch improved)

    Args:
        model: ``nn.Module`` mapping a feature batch to one value per row.
        cat_nu_cols: Unused; kept so existing call sites keep working.
        X_train, X_valid, X_test: Feature matrices (NumPy array or DataFrame).
        y_train, y_valid: Targets (DataFrame, Series or array); reshaped to one column.
        lr: Adam learning rate.
        wd: Adam weight decay.
        epochs: Maximum number of epochs.
        patience: Non-improving epochs tolerated before stopping early.
        batch_size: Mini-batch size for the shuffled training loader.

    Returns:
        None.
    """
    import copy  # local import: only needed for the in-memory weight snapshot

    def _as_float_tensor(data):
        # np.asarray accepts both DataFrames and arrays.
        return torch.tensor(np.asarray(data), dtype=torch.float32)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

    X_train_tensor = _as_float_tensor(X_train)
    y_train_tensor = _as_float_tensor(y_train).reshape(-1, 1)
    train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor),
                              batch_size=batch_size, shuffle=True)
    X_valid_tensor = _as_float_tensor(X_valid)
    y_valid_tensor = _as_float_tensor(y_valid).reshape(-1, 1)
    X_test_tensor = _as_float_tensor(X_test)

    best_rmse = float('inf')
    best_state = None        # deep copy of the best epoch's state_dict
    best_valid_pred = None   # validation predictions of the best epoch
    y_valid_pred = None      # validation predictions of the most recent epoch
    patience_counter = 0

    for epoch in range(epochs):
        model.train()
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            loss = criterion(model(inputs), targets)
            loss.backward()
            optimizer.step()

        model.eval()
        with torch.no_grad():
            # Column shape on both sides so the difference cannot broadcast
            # (N,) against (N, 1) into an (N, N) matrix.
            valid_out = model(X_valid_tensor).reshape(-1, 1)
            rmse_valid = torch.sqrt(torch.mean((valid_out - y_valid_tensor) ** 2)).item()
        y_valid_pred = valid_out.numpy().flatten()

        if rmse_valid < best_rmse:
            best_rmse = rmse_valid
            # `best = model` would only alias the live model, whose weights keep
            # changing in later epochs; take a real copy of the weights instead.
            best_state = copy.deepcopy(model.state_dict())
            best_valid_pred = y_valid_pred
            torch.save(model.state_dict(), 'best_model.pth')  # Save the best model
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered.")
                break

    print(f'Best Valid RMSE: {best_rmse:.4f}')

    if best_state is not None:
        # Restore the best epoch before producing any saved prediction.
        model.load_state_dict(best_state)
        valid_to_save = best_valid_pred
    else:
        # No epoch ever improved (e.g. NaN RMSE): fall back to the most recent
        # validation predictions, as before.
        valid_to_save = y_valid_pred

    # Saving validation predictions to a CSV file
    valid_predictions_df = pd.DataFrame({
        'Id': range(len(valid_to_save)),
        'Predicted': valid_to_save
    })
    valid_predictions_df.to_csv('data/nn_valid.csv', index=False)
    print("Validation results saved to data/nn_valid.csv")

    # Saving test predictions to a CSV file if a best epoch was found.
    # Explicit None check: module truthiness is not a reliable signal.
    if best_state is not None:
        model.eval()
        with torch.no_grad():
            y_test_pred = model(X_test_tensor).numpy().flatten()

        test_predictions_df = pd.DataFrame({
            'Id': range(len(y_test_pred)),
            'Predicted': y_test_pred
        })
        test_predictions_df.to_csv('data/nn_test.csv', index=False)
        print("Test results saved to data/nn_test.csv")



In [27]:
# Final run: train a fresh MLPWithBertModel with lr=0.001; the helper writes
# best_model.pth, data/nn_valid.csv and data/nn_test.csv as side effects.
model = MLPWithBertModel(input_dim)
train_network_and_get_result(model, cat_nu_cols, X_train, y_train, X_valid, y_valid, X_test,0.001)

Best Valid RMSE: 25162.6660
Validation results saved to data/nn_valid.csv
Test results saved to data/nn_test.csv


In [None]:
# Disabled cell, kept for reference: retrain on the full training set and
# predict the test set without a validation split.
# NOTE(review): it indexes X_train_full[cat_nu_cols] and X_test[cat_nu_cols],
# but an earlier cell rebinds both names to NumPy arrays that also contain the
# UMAP columns, so this code needs updating before it is re-enabled.

# # Pick the best-performing model for the final training run and prediction
# best_model = DeeperMLPModel(input_dim)

# print("使用全量数据训练最终模型...")
# # Convert the data to tensors
# X_train_full_tensor = torch.tensor(X_train_full[cat_nu_cols].values, dtype=torch.float32)
# y_train_full_tensor = torch.tensor(y_train_full.values, dtype=torch.float32).view(-1, 1)
# train_dataset = TensorDataset(X_train_full_tensor, y_train_full_tensor)
# train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# # Train the model
# criterion = nn.MSELoss()
# optimizer = optim.Adam(best_model.parameters(), lr=0.001)

# epochs = 50
# for epoch in range(epochs):
#     best_model.train()
#     for inputs, targets in train_loader:
#         optimizer.zero_grad()
#         outputs = best_model(inputs)
#         loss = criterion(outputs, targets)
#         loss.backward()
#         optimizer.step()
#     print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')

# print("生成测试集预测结果...")
# best_model.eval()
# X_test_tensor = torch.tensor(X_test[cat_nu_cols].values, dtype=torch.float32)
# with torch.no_grad():
#     test_predictions = best_model(X_test_tensor).numpy()

# # Build the predictions DataFrame
# predictions_df = pd.DataFrame({
#     'Id': range(len(test_predictions)),
#     'Predicted': test_predictions.flatten()
# })

# # Save the predictions
# predictions_df.to_csv('data/nn_test.csv', index=False)
# print("预测结果已保存到 data/nn_test.csv")