<a href="https://colab.research.google.com/github/ProEarth/602/blob/master/CS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/flyaflya/fsan830spring2025.git
%cd fsan830spring2025

fatal: destination path 'fsan830spring2025' already exists and is not an empty directory.
/content/fsan830spring2025


In [2]:
!pip install -U numpy pymc pymc-bart arviz xarray matplotlib scikit-learn

Collecting numpy
  Using cached numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting pymc
  Using cached pymc-5.22.0-py3-none-any.whl.metadata (16 kB)
Collecting pytensor<2.31,>=2.30.2 (from pymc)
  Using cached pytensor-2.30.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10.0 kB)


Collecting pandas==2.2.2
  Downloading pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Downloading pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m95.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 2.2.3
    Uninstalling pandas-2.2.3:
      Successfully uninstalled pandas-2.2.3
Successfully installed pandas-2.2.2


In [2]:
import pandas as pd
import numpy as np
import pymc as pm
from pymc_bart import BART
from glob import glob
import os
import xml.etree.ElementTree as ET


In [3]:
# ====== 1. 批量解析 pastPerformanceData 特征 ======
def parse_past_performance(xml_path):
    import xml.etree.ElementTree as ET
    tree = ET.parse(xml_path)
    root = tree.getroot()
    records = []
    for race in root.findall('Race'):
        race_number = race.findtext('RaceNumber')
        # 遍历所有 Starters 节点
        for starters in race.findall('Starters'):
            # 针对每个马（即每个Starters下的字段）
            record = {'RaceNumber': race_number}
            # 逐字段判断并记录
            for item in starters:
                if item.tag == 'Horse':
                    record['HorseName'] = item.findtext('HorseName')
                    record['YearOfBirth'] = item.findtext('YearOfBirth')
                    record['FoalingArea'] = item.findtext('FoalingArea')
                elif item.tag in ['PostPosition', 'ProgramNumber', 'WeightCarried', 'Odds']:
                    record[item.tag] = item.text
                elif item.tag == 'Trainer':
                    record['Trainer'] = (item.findtext('LastName') or '') + (item.findtext('FirstName') or '')
                elif item.tag == 'Jockey':
                    record['Jockey'] = (item.findtext('LastName') or '') + (item.findtext('FirstName') or '')
            # 最基本的字段必须有，否则跳过
            if 'HorseName' in record and record['HorseName']:
                for k in record:
                    if isinstance(record[k], str):
                        record[k] = record[k].strip()
                records.append(record)
    return pd.DataFrame(records)

# ====== 2. 批量解析 resultsData 赛果 ======
def parse_results(xml_path):
    tree = ET.parse(xml_path)
    root = tree.getroot()
    records = []
    for race in root.findall('.//RACE'):
        race_number = race.get('NUMBER')
        for entry in race.findall('ENTRY'):
            record = {
                'RaceNumber': race_number,
                'HorseName': entry.findtext('NAME'),
                'OfficialFinish': entry.findtext('OFFICIAL_FIN'),
                'FinishTime': entry.findtext('FINISH_TIME'),
                'SpeedRating': entry.findtext('SPEED_RATING'),
                'DollarOdds': entry.findtext('DOLLAR_ODDS'),
            }
            for k in record:
                if isinstance(record[k], str):
                    record[k] = record[k].strip()
            records.append(record)
    return pd.DataFrame(records)

# ====== 3. 读取所有XML并合并 ======
pp_dir = 'data/rawDataForTraining/pastPerformanceData'
res_dir = 'data/rawDataForTraining/resultsData'

pp_files = glob(os.path.join(pp_dir, '*.xml'))
res_files = glob(os.path.join(res_dir, '*.xml'))

print(f"共发现PastPerformance xml: {len(pp_files)} 份，Result xml: {len(res_files)} 份")

pp_dfs = [parse_past_performance(f) for f in pp_files]
res_dfs = [parse_results(f) for f in res_files]

df_feat = pd.concat(pp_dfs, ignore_index=True)
df_label = pd.concat(res_dfs, ignore_index=True)

print("特征集 shape:", df_feat.shape, "标签集 shape:", df_label.shape)

# 主键统一处理为字符串去空格
for df in [df_feat, df_label]:
    df['HorseName'] = df['HorseName'].astype(str).str.lower().str.strip()
    df['RaceNumber'] = df['RaceNumber'].astype(str).str.strip()

# 合并
df_full = pd.merge(df_feat, df_label, on=['RaceNumber', 'HorseName'], how='inner', suffixes=('_pp', '_res'))
print("最终训练集 shape:", df_full.shape)

# 赔率处理（如有“5/2”样式需转成小数）
def odds_str_to_float(odds):
    if pd.isna(odds):
        return None
    try:
        return float(odds)
    except:
        if '/' in str(odds):
            try:
                num, den = odds.split('/')
                return float(num) / float(den)
            except:
                return None
        return None

df_full['Odds_float'] = df_full['Odds'].apply(odds_str_to_float)
df_full['DollarOdds'] = pd.to_numeric(df_full['DollarOdds'], errors='coerce')
df_full['OfficialFinish'] = pd.to_numeric(df_full['OfficialFinish'], errors='coerce')
df_full['FinishTime'] = pd.to_numeric(df_full['FinishTime'], errors='coerce')
df_full['SpeedRating'] = pd.to_numeric(df_full['SpeedRating'], errors='coerce')
df_full['WeightCarried'] = pd.to_numeric(df_full['WeightCarried'], errors='coerce')

# 保存
df_full.to_csv('final_supervised_training_set.csv', index=False)
print(df_full.head())
print("可用于建模的特征：", df_full.columns.tolist())

共发现PastPerformance xml: 18 份，Result xml: 18 份
特征集 shape: (1757, 10) 标签集 shape: (1482, 6)
最终训练集 shape: (1509, 14)
  RaceNumber        HorseName YearOfBirth FoalingArea PostPosition  \
0          1    mr. commander        2020          AR            1   
1          1        supercool        2018          KY            2   
2          1       hail allen        2020          KY            3   
3          1  charlie whiskey        2020          KY            4   
4          1         chowmein        2020          KY            5   

  ProgramNumber  WeightCarried          Trainer            Jockey  Odds  \
0             1            118     GarciaGenaro   CorralesGerardo   5/2   
1             2            124     FoleyGregory  ArrietaFrancisco   3/1   
2             3            118      BlairJordan    GutierrezReylu  10/1   
3             4            118     HartmanChris   MurrillMitchell   4/1   
4             5            118  KordenbrockMatt    BejaranoRafael   8/1   

   OfficialFini

In [16]:
# 自选特征（你可以用更多特征，建议先用Odds_float和WeightCarried测试）
feature_cols = ['Odds_float', 'WeightCarried', 'SpeedRating']  # 你可以增删
'''
feature_cols = [
    'YearOfBirth',
    'FoalingArea',
    'PostPosition',
    'WeightCarried',
    'Trainer',
    'Jockey',
    'SpeedRating',
    'Odds_float'
]
'''
X = df_full[feature_cols].fillna(0).values
y = df_full['OfficialFinish'].values


In [18]:

# 字段名和BRISNET字段对应的csv索引
column_indices = [
    2,    # RaceNumber
    44,   # HorseName
    45,   # YearOfBirth
    56,   # FoalingArea
    3,    # PostPosition
    42,   # ProgramNumber
    50,   # WeightCarried
    27,   # Trainer
    32,   # Jockey
    515,  # Odds (第一场历史比赛的赔率)
    615,  # OfficialFinish (第一场历史比赛的名次)
    1035, # FinishTime (第一场历史比赛的完赛时间)
    845,  # SpeedRating (第一场历史比赛的评分)
    43,   # DollarOdds
    # Odds_float 需要你自己用Odds列加工
]
# 只保留你需要的三个特征
feature_cols = ['Odds_float', 'WeightCarried', 'SpeedRating']

# 对应在BRISNET字典中的索引
column_indices = [
    515,  # Odds
    50,   # WeightCarried
    845   # SpeedRating
]

# 对应的列名（顺序要和上面索引一致）
columns = ['Odds', 'WeightCarried', 'SpeedRating']

# 读取原始测试集
X_test = pd.read_csv('data/rawDataForPrediction/CDX0426.csv', header=None, usecols=column_indices)
X_test.columns = columns

# 处理 Odds_float，只保留这三列
def odds_str_to_float(s):
    try:
        if pd.isna(s):
            return None
        if '-' in str(s):
            a, b = str(s).split('-')
            return float(a) / float(b)
        return float(s)
    except:
        return None

X_test['Odds_float'] = X_test['Odds'].apply(odds_str_to_float)

# 最终只保留 feature_cols
X_test_final = X_test[feature_cols]

print(X_test_final)

    Odds_float  WeightCarried  SpeedRating
0        118.0          17.21         81.0
1        118.0          14.10         73.0
2        118.0          33.57         57.0
3        118.0          49.95         64.0
4        113.0           8.45         79.0
..         ...            ...          ...
89       118.0           6.71         76.0
90       118.0           9.60         76.0
91       118.0          10.87         79.0
92       118.0         104.12         80.0
93       113.0           1.70         64.0

[94 rows x 3 columns]


In [39]:

with pm.Model() as model:
  μ = BART("μ", X, y)
  σ = pm.HalfNormal("σ", sigma=1.0)
  y_obs = pm.Normal("y_obs", mu=μ, sigma=σ, observed=y)
  trace = pm.sample(1000, tune=1000, target_accept=0.95, cores=1, random_seed=42)
  print("模型训练完毕！")

  μ_test = BART("μ_test", X_test_final, None)
  ppc = pm.fast_sample_posterior_predictive(trace, var_names=["μ_test"])
  y_pred_samples = ppc["μ_test"]  # shape: (n_draws, n_test)
  y_pred_mean = y_pred_samples.mean(axis=0)
  y_pred_std = y_pred_samples.std(axis=0)


X_test_final['prediction'] = y_pred_mean
X_test_final['prediction_std'] = y_pred_std
X_test_final.to_csv('prediction_output.csv', index=False)
print(X_test_final)

Output()

模型训练完毕！


AttributeError: 'NoneType' object has no attribute 'astype'