In [None]:
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier


# 忽略警告
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the race-level and run-level tables.
df_races = pd.read_csv('races.csv')
df_runs = pd.read_csv('runs.csv')

# Impute missing numeric values with the column mean.
# numeric_only=True is required: pandas >= 2.0 raises TypeError when asked to
# average non-numeric columns (presumably e.g. the still-unparsed `date`
# strings), whereas older versions silently skipped them.
# Assigning the result (instead of inplace=True) keeps the step explicit.
df_races = df_races.fillna(df_races.mean(numeric_only=True))
df_runs = df_runs.fillna(df_runs.mean(numeric_only=True))

# Parse the race date into a datetime column.
df_races['date'] = pd.to_datetime(df_races['date'])

# Per-jockey running totals, accumulated in the row order of runs.csv.
# NOTE(review): this assumes the rows are in chronological order — confirm.
# total_races counts the current run as well (known before the race starts).
df_runs['total_races'] = df_runs.groupby('jockey_id').cumcount() + 1
# total_wins counts only wins recorded *before* the current run. A plain
# cumulative sum would include this run's own `won` value, leaking the
# prediction target into a feature.
df_runs['total_wins'] = df_runs.groupby('jockey_id')['won'].cumsum() - df_runs['won']

In [None]:
# Gather the distinct values of each categorical column, keyed by column name.
unique_values = {
    name: frame[name].unique()
    for frame, names in (
        (df_races, ('venue', 'config', 'surface', 'distance')),
        (df_runs, ('horse_country', 'horse_type')),
    )
    for name in names
}

# Report one line per column.
for column, values in unique_values.items():
    print(f"Unique values in {column}: {values}")

In [None]:
# Categorical columns to integer-encode, grouped by the table they live in.
categorical_columns_races = ['venue', 'config', 'surface', 'distance']
categorical_columns_runs = ['horse_country', 'horse_type']

# Model input columns. Declared here because this cell selects them below and
# runs before the later cell that also declares `features`; without this a
# fresh kernel fails with NameError.
features = ['race_id', 'venue', 'config', 'surface', 'distance', 'horse_age', 'horse_country', 'horse_type',
            'horse_rating', 'declared_weight', 'actual_weight', 'total_races', 'total_wins']

# One fitted LabelEncoder per encoded column, keyed by column name, so the
# integer codes can be mapped back to the original values later.
label_encoders = {}

# Encode the categorical columns of df_races (values are cast to str first).
for column in categorical_columns_races:
    label_encoders[column] = LabelEncoder()
    df_races[column] = label_encoders[column].fit_transform(df_races[column].astype(str))

# Encode the categorical columns of df_runs.
for column in categorical_columns_runs:
    label_encoders[column] = LabelEncoder()
    df_runs[column] = label_encoders[column].fit_transform(df_runs[column].astype(str))

# Join each run with its race, then derive the binary target.
df_combined = pd.merge(df_runs, df_races, on='race_id')
# The target is derived from the `won` column of runs.csv; the `win` column
# does not exist until this line creates it, so it cannot be read here.
df_combined['win'] = (df_combined['won'] == 1).astype(int)
# Keep only the model inputs plus the target.
df_combined = df_combined[features + ['win']]

In [None]:
# Model input columns and the name of the target column.
features = ['race_id', 'venue', 'config', 'surface', 'distance', 'horse_age', 'horse_country', 'horse_type', 
            'horse_rating', 'declared_weight', 'actual_weight', 'total_races', 'total_wins']
label = 'win'

# Every race is padded up to this many rows.
max_horses_per_race = 14


def pad_race_data(group, max_horses):
    """Pad one race's rows with all-zero rows until it has `max_horses` rows.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single race; must contain a `race_id` column.
    max_horses : int
        Target number of rows for the race.

    Returns
    -------
    pandas.DataFrame
        `group` itself when it already has at least `max_horses` rows.
        Otherwise a new frame (index reset) with zero-filled rows appended;
        the padding rows carry the race's own `race_id`.
    """
    current_length = len(group)
    if current_length >= max_horses:
        return group

    padding_length = max_horses - current_length
    # Pad across the group's own columns rather than the module-level
    # `features` list, so columns outside that list (such as the target) are
    # zero-filled too instead of coming out of the concat as NaN.
    pad_df = pd.DataFrame(np.zeros((padding_length, group.shape[1])), columns=group.columns)
    pad_df['race_id'] = group['race_id'].iloc[0]  # keep the same race_id
    return pd.concat([group, pad_df], ignore_index=True)

# Pad every race to max_horses_per_race rows and stack the results in race_id
# order. The groups are iterated explicitly instead of going through
# groupby.apply, so each group is guaranteed to still contain the `race_id`
# column that pad_race_data reads.
df_padded = pd.concat(
    [
        pad_race_data(race_rows, max_horses=max_horses_per_race)
        for _, race_rows in df_combined.groupby('race_id')
    ],
    ignore_index=True,
)
# Padding rows may carry no label; treat them as non-winners. The result is
# assigned back because an inplace fillna on a column selection is not
# guaranteed to write through to the parent frame.
df_padded['win'] = df_padded['win'].fillna(0)

In [None]:
# Persist the padded table. The target directory is created first so a fresh
# checkout does not fail on a missing folder.
os.makedirs('output', exist_ok=True)
# index=False (the boolean) omits the row index; the string 'false' is truthy
# and therefore still wrote the index column.
df_padded.to_csv('output/combined.csv', index=False)

In [None]:
# Show the padded frame as this cell's output.
df_padded

In [None]:
# Separate features and labels, casting to float32 up front so the NaN check
# below always operates on a numeric array.
X = df_padded[features].values.astype(np.float32)
y = df_padded['win'].values.astype(np.float32)

# Check for NaN values in features and labels
print(f"NaN values in X: {np.isnan(X).sum()}")
print(f"NaN values in y: {np.isnan(y).sum()}")

# Hold out 20% of the rows for testing. The split is stratified on the label
# so the winner / non-winner ratio is the same in both partitions.
# NOTE(review): rows of one race can still land on both sides of the split —
# confirm whether a race-level (grouped) split is wanted instead.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features: statistics are fitted on the training rows only and then
# applied unchanged to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [36]:
# Initialize the XGBoost classifier with the histogram tree method.
# NOTE(review): `gpu_id=-1` is intended to keep training off CUDA, yet the
# output recorded for this cell shows an XGBoostError raised from a CUDA
# pointer check — confirm the right CPU-only setting for the installed
# xgboost build.
xgb_model = XGBClassifier(objective='binary:logistic', random_state=42, use_label_encoder=False, tree_method='hist', gpu_id=-1)

# Hyperparameter grid: 2 * 3 * 3 * 2 * 2 = 72 candidates.
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}
# Candidates are ranked by F1 rather than accuracy: winners are a small
# minority of the rows, so accuracy rewards predicting "no win" everywhere,
# and F1 is the metric this cell reports below.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='f1', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best model (refit on the full training set by GridSearchCV)
best_xgb_model = grid_search.best_estimator_

# Predictions on the held-out rows
y_pred_xgb = best_xgb_model.predict(X_test)

# Evaluation
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
f1_xgb = f1_score(y_test, y_pred_xgb)
confusion_xgb = confusion_matrix(y_test, y_pred_xgb)
report_xgb = classification_report(y_test, y_pred_xgb)

print(f"XGBoost Test Accuracy: {accuracy_xgb * 100:.2f}%")
print(f"XGBoost Test F1 Score: {f1_xgb:.2f}")
print("XGBoost Confusion Matrix:\n", confusion_xgb)
print("XGBoost Classification Report:\n", report_xgb)

Fitting 3 folds for each of 72 candidates, totalling 216 fits




XGBoostError: [12:30:30] /opt/rapids/src/xgboost/src/data/array_interface.cu:44: Check failed: err == cudaGetLastError() (0 vs. 2) : 
Stack trace:
  [bt] (0) /usr/lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x85) [0x7fc6ec061c35]
  [bt] (1) /usr/lib/libxgboost.so(xgboost::ArrayInterfaceHandler::IsCudaPtr(void const*)+0xe0) [0x7fc6ec721ed0]
  [bt] (2) /usr/lib/libxgboost.so(xgboost::MetaInfo::SetInfo(xgboost::Context const&, xgboost::StringView, xgboost::StringView)+0x140) [0x7fc6ec2684f0]
  [bt] (3) /usr/lib/libxgboost.so(XGDMatrixSetInfoFromInterface+0x11d) [0x7fc6ec0422ed]
  [bt] (4) /usr/lib/x86_64-linux-gnu/libffi.so.8(+0x7e2e) [0x7fc870c24e2e]
  [bt] (5) /usr/lib/x86_64-linux-gnu/libffi.so.8(+0x4493) [0x7fc870c21493]
  [bt] (6) /usr/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0xa3e9) [0x7fc870e983e9]
  [bt] (7) /usr/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0x9a00) [0x7fc870e97a00]
  [bt] (8) /usr/bin/python(_PyObject_MakeTpCall+0x25b) [0x55a0f5f19a7b]



[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=200, subsample=0.8; total time=  55.9s
[CV] END colsample_bytree=1.0, learning_rate=0.2, max_depth=5, n_estimators=100, subsample=1.0; total time=   0.6s
[CV] END colsample_bytree=1.0, learning_rate=0.2, max_depth=7, n_estimators=100, subsample=0.8; total time=   0.8s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=0.8; total time=  56.0s
[CV] END colsample_bytree=1.0, learning_rate=0.2, max_depth=5, n_estimators=100, subsample=1.0; total time=   0.6s
[CV] END colsample_bytree=1.0, learning_rate=0.2, max_depth=7, n_estimators=100, subsample=0.8; total time=   0.8s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1.0; total time=  56.5s
[CV] END colsample_bytree=1.0, learning_rate=0.2, max_depth=7, n_estimators=100, subsample=1.0; total time=   0.7s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=2

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# Balanced class weights, keyed by the actual class label rather than by
# array position, so the mapping stays correct whatever the label values are.
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train.flatten())
class_weight_dict = dict(zip(classes.tolist(), class_weights))

# Expand the class weights to one weight per training row. `classes` is sorted
# and contains every label in y_train, so searchsorted returns each label's
# exact index in `class_weights`.
sample_weight = class_weights[np.searchsorted(classes, y_train.flatten())]

# Train gradient-boosted trees; the class balancing is applied through
# per-sample weights passed to fit().
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
gb_model.fit(X_train, y_train, sample_weight=sample_weight)

# Predict on the held-out rows
y_pred_gb = gb_model.predict(X_test)

# Evaluate the model
accuracy_gb = accuracy_score(y_test, y_pred_gb)
f1_gb = f1_score(y_test, y_pred_gb)
confusion_gb = confusion_matrix(y_test, y_pred_gb)
report_gb = classification_report(y_test, y_pred_gb)

print(f"Gradient Boosting Test Accuracy: {accuracy_gb * 100:.2f}%")
print(f"Gradient Boosting Test F1 Score: {f1_gb:.2f}")
print("Gradient Boosting Confusion Matrix:\n", confusion_gb)
print("Gradient Boosting Classification Report:\n", report_gb)

In [None]:
# Overall accuracy and F1 score of the predictions.
# NOTE(review): `y_pred`, `plt` and `history` are not defined in any cell
# visible in this notebook — confirm where they are expected to come from.
accuracy = np.mean(y_pred == y_test)
f1 = f1_score(y_test.flatten(), y_pred.flatten())

# Side-by-side training curves: accuracy on the left, loss on the right.
plt.figure(figsize=(12, 4))
for slot, (key, caption) in enumerate((('accuracy', 'Accuracy'), ('loss', 'Loss')), start=1):
    plt.subplot(1, 2, slot)
    plt.plot(history.history[key], label=f'Train {caption}')
    plt.plot(history.history[f'val_{key}'], label=f'Validation {caption}')
    plt.title(f'Model {caption}')
    plt.xlabel('Epoch')
    plt.ylabel(caption)
    plt.legend()

plt.show()

In [None]:
# Save the model to an HDF5 file in the working directory.
# NOTE(review): `model` is not defined in any cell visible in this notebook —
# confirm which estimator is meant to be saved here.
model.save('horse_racing_prediction_model.h5')

# Report the accuracy and F1 score computed in the previous cell.
print(f'Accuracy: {accuracy:.2f}')
print(f'F1 Score: {f1:.2f}')