In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [2]:
# 读取数据
races_df = pd.read_csv('races.csv')
runs_df = pd.read_csv('runs.csv')

# 生成骑师的总比赛场次和胜利场次列
runs_df['total_races'] = runs_df.groupby('jockey_id').cumcount() + 1
runs_df['total_wins'] = runs_df.groupby('jockey_id')['won'].cumsum()

# 合并数据
merged_df = pd.merge(runs_df, races_df, on='race_id')

# 筛选需要的特征列
features = ['race_id', 'venue', 'config', 'surface', 'distance', 'horse_age', 'horse_country', 
            'horse_type', 'horse_rating', 'declared_weight', 'actual_weight', 'total_races', 'total_wins']
merged_df = merged_df[features + ['result']]

In [3]:
# 找出每个 race_id 对应的最大马匹数
max_horses_per_race = 14

# 获取所有的 race_id
all_race_ids = merged_df['race_id'].unique()

# 创建一个新的 DataFrame 用于存储补全后的数据
complete_df = pd.DataFrame()

for race_id in all_race_ids:
    race_data = merged_df[merged_df['race_id'] == race_id]
    num_horses = len(race_data)
    
    if num_horses < max_horses_per_race:
        # 计算需要补全的马匹数
        num_to_add = max_horses_per_race - num_horses
        
        # 生成补全数据
        to_add = pd.DataFrame(0, index=np.arange(num_to_add), columns=race_data.columns)
        to_add['race_id'] = race_id
        
        # 合并补全数据
        race_data = pd.concat([race_data, to_add], ignore_index=True)
    
    # 将补全后的数据添加到 complete_df 中
    complete_df = pd.concat([complete_df, race_data], ignore_index=True)

# 处理缺失值（示例：用均值填充）
complete_df.fillna(complete_df.mean(numeric_only=True), inplace=True)

In [4]:
# 确保类别变量为字符串类型
categorical_cols = ['venue', 'config', 'surface', 'horse_country', 'horse_type']
for col in categorical_cols:
    complete_df[col] = complete_df[col].astype(str)

# 保存类别变量编码器
encoder = OneHotEncoder(sparse_output=False)
encoded_cats = encoder.fit_transform(complete_df[categorical_cols])

# 保存标准化器
scaler = StandardScaler()
scaled_features = scaler.fit_transform(complete_df.drop(columns=['result', 'race_id', *categorical_cols]))

# 定义特征和目标变量
X = np.hstack([scaled_features, encoded_cats])
y = complete_df['result'].values

In [5]:
# 按比赛分组
grouped = complete_df.groupby('race_id')
grouped_X = [torch.tensor(X[complete_df['race_id'] == race_id], dtype=torch.float32) for race_id in all_race_ids]
grouped_y = [torch.tensor(y[complete_df['race_id'] == race_id], dtype=torch.float32) for race_id in all_race_ids]

# 创建数据加载器
train_dataset = TensorDataset(torch.stack(grouped_X), torch.stack(grouped_y))
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)  # 每次处理一个比赛

In [6]:
# 定义神经网络模型
class HorseRankNet(nn.Module):
    def __init__(self, input_size):
        super(HorseRankNet, self).__init__()
        self.fc1 = nn.Linear(input_size, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 1)
    
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = self.fc4(x)
        return x

# 初始化模型
input_size = X.shape[1]
model = HorseRankNet(input_size)

# 定义自定义损失函数和优化器
class RankLoss(nn.Module):
    def __init__(self):
        super(RankLoss, self).__init__()

    def forward(self, y_pred, y_true):
        return torch.mean(torch.abs(y_pred - y_true))
    
criterion = RankLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005)

In [7]:
# 逆编译函数
def inverse_transform(X_batch, scaler, encoder):
    num_features = scaler.mean_.shape[0]
    categorical_features = encoder.categories_
    
    # 逆标准化数值特征
    X_num = scaler.inverse_transform(X_batch[:, :num_features])
    
    # 逆编码类别特征
    X_cat = encoder.inverse_transform(X_batch[:, num_features:])
    
    return np.hstack([X_num, X_cat])

In [8]:
# 训练模型
num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for batch_idx, (X_batch, y_batch) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(X_batch.squeeze(0))
        loss = criterion(outputs.squeeze(), y_batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    
    # 打印每个 epoch 的损失
    epoch_loss = running_loss / len(train_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

Epoch [1/50], Loss: 2.7783
Epoch [2/50], Loss: 2.7361
Epoch [3/50], Loss: 2.7261
Epoch [4/50], Loss: 2.7186
Epoch [5/50], Loss: 2.7121
Epoch [6/50], Loss: 2.7069
Epoch [7/50], Loss: 2.7028
Epoch [8/50], Loss: 2.6973
Epoch [9/50], Loss: 2.6931
Epoch [10/50], Loss: 2.6887
Epoch [11/50], Loss: 2.6834
Epoch [12/50], Loss: 2.6802
Epoch [13/50], Loss: 2.6772
Epoch [14/50], Loss: 2.6725
Epoch [15/50], Loss: 2.6686
Epoch [16/50], Loss: 2.6655
Epoch [17/50], Loss: 2.6618
Epoch [18/50], Loss: 2.6580
Epoch [19/50], Loss: 2.6543
Epoch [20/50], Loss: 2.6515
Epoch [21/50], Loss: 2.6485
Epoch [22/50], Loss: 2.6467
Epoch [23/50], Loss: 2.6406
Epoch [24/50], Loss: 2.6366
Epoch [25/50], Loss: 2.6321
Epoch [26/50], Loss: 2.6294
Epoch [27/50], Loss: 2.6251
Epoch [28/50], Loss: 2.6211
Epoch [29/50], Loss: 2.6172
Epoch [30/50], Loss: 2.6146
Epoch [31/50], Loss: 2.6084
Epoch [32/50], Loss: 2.6063
Epoch [33/50], Loss: 2.5988
Epoch [34/50], Loss: 2.5966
Epoch [35/50], Loss: 2.5931
Epoch [36/50], Loss: 2.5881
E

In [9]:
# 评估模型
model.eval()
with torch.no_grad():
    total_loss = 0
    for X_batch, y_batch in train_loader:
        outputs = model(X_batch.squeeze(0))
        loss = criterion(outputs.squeeze(), y_batch)
        total_loss += loss.item()
    print(f'Test Mean Absolute Error: {total_loss / len(train_loader):.4f}')

Test Mean Absolute Error: 2.4915
