In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# 忽略警告
import warnings
warnings.filterwarnings('ignore')


In [29]:
# 读取数据集
df_races = pd.read_csv('races.csv')
df_runs = pd.read_csv('runs.csv')

# 合并数据集
df_combined = pd.merge(df_runs, df_races, on='race_id')

In [30]:
# 填补缺失值
df_races.fillna(df_races.mean(), inplace=True)
df_runs.fillna(df_runs.mean(), inplace=True)

# 转换日期格式
df_races['date'] = pd.to_datetime(df_races['date'])

# 增加骑师的总比赛场次和胜利场次列
df_runs['total_races'] = df_runs.groupby('jockey_id').cumcount() + 1
df_runs['total_wins'] = df_runs.groupby('jockey_id')['won'].cumsum()

# 合并数据集
df_combined = pd.merge(df_runs, df_races, on='race_id')

In [59]:
# 定义要使用的特征和标签
features = ['venue', 'config', 'surface', 'distance', 'horse_age', 'horse_country', 'horse_type', 
            'horse_rating', 'declared_weight', 'actual_weight', 'total_races', 'total_wins']
label = 'result'

# 确定需要进行one-hot编码的类别特征
categorical_features = ['venue', 'config', 'surface', 'distance', 'horse_country', 'horse_type']

# 创建预处理管道
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_features)
    ],
    remainder='passthrough'  # 其余特征保持原样
)

# 先拟合预处理器
preprocessor.fit(df_combined[features])

# 确保所有特征都被正确编码
X = preprocessor.transform(df_combined[features])
print("Shape of X:", X.shape)

# 将稀疏矩阵转换为密集矩阵
X_dense = X.toarray() if hasattr(X, 'toarray') else X

# 创建一个DataFrame来存储预处理后的数据
df_processed = pd.DataFrame(X_dense, columns=preprocessor.get_feature_names_out())
df_processed['race_id'] = df_combined['race_id'].values
df_processed['label'] = df_combined[label].values

Shape of X: (79447, 52)


In [63]:
# 确定每场比赛中最大的马匹数量
max_horses_per_race = 14

# 定义填充函数
def pad_race_data(group, max_horses):
    current_length = len(group)
    if current_length < max_horses:
        padding_length = max_horses - current_length
        pad_features = np.zeros((padding_length, group.shape[1] - 2))  # 减去 race_id 和 label
        pad_df_features = pd.DataFrame(pad_features, columns=group.columns[:-2])
        pad_race_ids = pd.DataFrame({'race_id': [group['race_id'].iloc[0]] * padding_length})
        pad_labels = pd.DataFrame({'label': [0] * padding_length})
        pad_df = pd.concat([pad_df_features, pad_race_ids, pad_labels], axis=1)
        group = pd.concat([group, pad_df], ignore_index=True)
    return group

# 对每场比赛的数据进行填充
df_padded = df_processed.groupby('race_id').apply(pad_race_data, max_horses=max_horses_per_race).reset_index(drop=True)

# 验证填充后的形状
print("Shape of df_padded:", df_padded.shape)

# 重新分离特征和标签
X_padded = df_padded.drop(columns=['race_id', 'label']).values
y_padded = df_padded['label'].values

# 打印填充后的 X 和 y
print("First few rows of X_padded:")
print(X_padded[:5])
print("Shape of X_padded:", X_padded.shape)

print("First few rows of y_padded:")
print(y_padded[:5])
print("Shape of y_padded:", y_padded.shape)

# 将 y_padded 重构为与 X_padded 形状匹配的标签数组
y_padded_repeated = np.repeat(y_padded, 1)  # 保持 y_padded 的形状与 X_padded 一致

# 确保 `y_padded_repeated` 的形状与 `X_padded` 一致
print("Shape of y_padded_repeated:", y_padded_repeated.shape)

Shape of df_padded: (88872, 54)
First few rows of X_padded:
[[0.000e+00 1.000e+00 1.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 1.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 3.000e+00 6.000e+01 1.020e+03
  1.330e+02 1.000e+00 0.000e+00]
 [0.000e+00 1.000e+00 1.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 1.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 1.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 1

In [61]:
# 将数据拆分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_padded, test_size=0.2, random_state=42)

# 计算每个样本的特征数量
num_features = X_padded.shape[1]

# 重塑X_train和X_test的形状以适应LSTM
X_train = X_train.reshape(-1, max_horses_per_race, num_features // max_horses_per_race)
X_test = X_test.reshape(-1, max_horses_per_race, num_features // max_horses_per_race)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)

ValueError: cannot reshape array of size 3697044 into shape (14,3)

In [None]:
# 预测与评估
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype("int32")

# 计算准确率和F1分数
from sklearn.metrics import accuracy_score, f1_score
accuracy = accuracy_score(y_test.flatten(), y_pred.flatten())
f1 = f1_score(y_test.flatten(), y_pred.flatten())

print(f'Accuracy: {accuracy:.2f}')
print(f'F1 Score: {f1:.2f}')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# 绘制训练过程中的准确率和损失曲线
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.show()

# 混淆矩阵
conf_matrix = confusion_matrix(y_test.flatten(), y_pred.flatten())

sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()


In [None]:
# 保存模型
model.save('horse_racing_prediction_lstm_model.h5')