In [15]:
import pandas as pd

# Load the per-game feature table from the working directory.
selected_game_features = pd.read_csv(filepath_or_buffer='selected_game_features.csv')

# A bare trailing expression renders the first five rows as the cell output.
selected_game_features.head(5)


Unnamed: 0,Game ID,Game Date,Home Days Since Last Game,Home Games Last 7 Days,Home Is Back-to-Back,Away Days Since Last Game,Away Games Last 7 Days,Away Is Back-to-Back,Away Team Recent 10 Win Rate,Home Team Recent 10 Win Rate,Home Team Recent 3 Months Win Rate,Away Team Recent 3 Months Win Rate
0,29900423,2000-01-02,5.0,1,False,3.0,1,False,0.4,0.7,0.769231,0.4375
1,29900427,2000-01-03,5.0,2,False,5.0,1,False,0.6,0.4,0.461538,0.666667
2,29900426,2000-01-03,4.0,1,False,8.0,0,False,0.8,0.4,0.357143,0.785714
3,29900429,2000-01-03,5.0,2,False,11.0,0,False,0.7,0.8,0.785714,0.75
4,29900428,2000-01-03,5.0,2,False,13.0,0,False,0.3,0.2,0.166667,0.25


In [16]:
# Load the table holding the home-team result (`wl_home`) for each Game ID,
# then preview its first five rows.
wl = pd.read_csv(filepath_or_buffer='game_with_home_wl.csv')
wl.head(5)

Unnamed: 0,Game ID,Game Date,wl_home
0,29900423,2000-01-02,W
1,29900427,2000-01-03,L
2,29900426,2000-01-03,W
3,29900429,2000-01-03,W
4,29900428,2000-01-03,L


In [17]:
# Base feature names (without the 'Home '/'Away ' prefix) for which a
# home-minus-away difference is computed.
features_to_diff = [
    'Days Since Last Game',
    'Games Last 7 Days',
    'Is Back-to-Back',
    'Team Recent 10 Win Rate',
    'Team Recent 3 Months Win Rate',
]

# Home and away column names, aligned element-wise with features_to_diff.
home_cols = [f'Home {name}' for name in features_to_diff]
away_cols = [f'Away {name}' for name in features_to_diff]

# Add one 'Diff <feature>' column per home/away pair.
for feat, home_col, away_col in zip(features_to_diff, home_cols, away_cols):
    diff_col = f'Diff {feat}'
    home_values = selected_game_features[home_col]
    away_values = selected_game_features[away_col]
    # Boolean pairs are cast to int first so the difference is an integer
    # (-1, 0 or 1); every other pair is subtracted as-is.
    if home_values.dtype == 'bool' and away_values.dtype == 'bool':
        home_values = home_values.astype(int)
        away_values = away_values.astype(int)
    selected_game_features[diff_col] = home_values - away_values

# Preview the frame with the new difference columns appended.
selected_game_features.head()

Unnamed: 0,Game ID,Game Date,Home Days Since Last Game,Home Games Last 7 Days,Home Is Back-to-Back,Away Days Since Last Game,Away Games Last 7 Days,Away Is Back-to-Back,Away Team Recent 10 Win Rate,Home Team Recent 10 Win Rate,Home Team Recent 3 Months Win Rate,Away Team Recent 3 Months Win Rate,Diff Days Since Last Game,Diff Games Last 7 Days,Diff Is Back-to-Back,Diff Team Recent 10 Win Rate,Diff Team Recent 3 Months Win Rate
0,29900423,2000-01-02,5.0,1,False,3.0,1,False,0.4,0.7,0.769231,0.4375,2.0,0,0,0.3,0.331731
1,29900427,2000-01-03,5.0,2,False,5.0,1,False,0.6,0.4,0.461538,0.666667,0.0,1,0,-0.2,-0.205128
2,29900426,2000-01-03,4.0,1,False,8.0,0,False,0.8,0.4,0.357143,0.785714,-4.0,1,0,-0.4,-0.428571
3,29900429,2000-01-03,5.0,2,False,11.0,0,False,0.7,0.8,0.785714,0.75,-6.0,2,0,0.1,0.035714
4,29900428,2000-01-03,5.0,2,False,13.0,0,False,0.3,0.2,0.166667,0.25,-8.0,2,0,-0.1,-0.083333


In [18]:
# Remove the 'Game Date' column from the feature table.
# errors='ignore' keeps this cell re-runnable: the result is assigned back to
# the same name, so on a second execution the column is already gone and a
# plain drop() would raise KeyError.
selected_game_features = selected_game_features.drop(columns=['Game Date'], errors='ignore')

In [19]:
from sklearn.model_selection import train_test_split

# Randomly hold out 20% of the rows as a test set; the fixed random_state
# makes the split repeatable across runs.
train_set, test_set = train_test_split(
    selected_game_features,
    test_size=0.2,
    random_state=42,
)

# Report (rows, columns) for each split.
print(f"Train set shape: {train_set.shape}")
print(f"Test set shape: {test_set.shape}")

Train set shape: (24351, 16)
Test set shape: (6088, 16)


In [None]:

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score


# Target variable: the home team's result for each game, encoded as
# 1 for a win ('W') and 0 for a loss ('L').
# The Game ID -> result lookup is built once and shared by both splits.
home_result_by_game = dict(zip(wl['Game ID'], wl['wl_home']))
result_to_label = {'W': 1, 'L': 0}
train_y = train_set['Game ID'].map(home_result_by_game).map(result_to_label)
test_y = test_set['Game ID'].map(home_result_by_game).map(result_to_label)

# A Game ID that is absent from `wl`, or a result other than 'W'/'L', maps to
# NaN. Stop here with an explicit message rather than passing NaN targets on
# to the estimator and the metrics.
for split_name, labels in (('train', train_y), ('test', test_y)):
    n_missing = int(labels.isna().sum())
    if n_missing:
        raise ValueError(
            f"{n_missing} {split_name} games have no 'W'/'L' label in wl"
        )

# Model inputs: the raw home/away features plus their home-minus-away diffs.
feature_cols = [
    'Home Days Since Last Game', 'Home Games Last 7 Days', 'Home Is Back-to-Back',
    'Away Days Since Last Game', 'Away Games Last 7 Days', 'Away Is Back-to-Back',
    'Away Team Recent 10 Win Rate', 'Home Team Recent 10 Win Rate',
    'Home Team Recent 3 Months Win Rate', 'Away Team Recent 3 Months Win Rate',
    'Diff Days Since Last Game', 'Diff Games Last 7 Days', 'Diff Is Back-to-Back',
    'Diff Team Recent 10 Win Rate', 'Diff Team Recent 3 Months Win Rate'
]

train_X = train_set[feature_cols]
test_X = test_set[feature_cols]

# Train the model.
# NOTE: despite the `_clf` suffix this is a regressor fitted on the 0/1
# target, so predict() returns continuous scores rather than class labels;
# a later cell thresholds them at 0.5.
rf_clf = RandomForestRegressor(random_state=42)
rf_clf.fit(train_X, train_y)

# Predict on the held-out games and report R2 of the scores against the
# 0/1 labels.
test_pred = rf_clf.predict(test_X)
test_r2 = r2_score(test_y, test_pred)
print(f"Test R2: {test_r2:.4f}")

In [None]:
from sklearn.metrics import accuracy_score

# Turn the regressor's continuous scores into class labels: a score above 0.5
# counts as a home win (1), anything else as a home loss (0).
pred_label = (test_pred > 0.5).astype(int)
true_label = test_y

# Model accuracy: share of test games whose thresholded prediction matches
# the actual result.
model_acc = accuracy_score(y_true=true_label, y_pred=pred_label)
print(f"RandomForestRegressor: {model_acc:.4f}")

# Baseline: accuracy of always predicting a single class (home loss = 0 or
# home win = 1) on the test labels, keeping whichever is higher.
baseline_acc = max((true_label == outcome).mean() for outcome in (0, 1))
print(f"Baseline正确率: {baseline_acc:.4f}")


RandomForestRegressor: 0.6082
Baseline正确率: 0.5931


In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score

# Create the gradient-boosting regressor and train it on the same training
# features and 0/1 target; fit() returns the estimator itself, so the two
# steps are chained.
gbr = GradientBoostingRegressor(random_state=42).fit(train_X, train_y)

# Continuous scores for the test-set games.
y_pred_gbr = gbr.predict(test_X)

# R2 on the test set (this reassigns `test_r2`, replacing the value computed
# for the random forest).
test_r2 = r2_score(y_true=test_y, y_pred=y_pred_gbr)
print(f"Test R2: {test_r2:.4f}")

Test R2: 0.0747


In [None]:
from sklearn.metrics import accuracy_score

# Class predictions from the GradientBoostingRegressor: threshold its scores
# at 0.5 (above -> 1, otherwise 0) and compare with the test labels.
gbr_pred_label = (y_pred_gbr > 0.5).astype(int)
gbr_acc = accuracy_score(y_true=true_label, y_pred=gbr_pred_label)

# Print the model accuracy next to the constant-class baseline computed earlier.
print(f"GradientBoosting: {gbr_acc:.4f}")
print(f"Baseline正确率: {baseline_acc:.4f}")



GradientBoosting: 0.6422
Baseline正确率: 0.5931
