# 一个简单的线性回归模型

In [8]:
import os
# 创建用于存储数据的文件夹
if not os.path.exists('Premier_League_data/predicted'):
    os.makedirs('Premier_League_data/predicted')

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# 读取之前存储的CSV文件
all_seasons_df = pd.read_csv("Premier_League_data/all_seasons_data_relevant.csv")
remaining_matches_df = pd.read_csv("Premier_League_data/remaining_matches_updated.csv")

# 计算每个球队的平均数据
team_stats = all_seasons_df.groupby("HomeTeam").mean()
team_stats = team_stats.reset_index()

# 准备剩余比赛的特征数据
unique_remaining_matches = remaining_matches_df.drop_duplicates(subset=["Home Team", "Away Team"])

remaining_matches_features = unique_remaining_matches.merge(
    team_stats, left_on="Home Team", right_on="HomeTeam", how="left").merge(
    team_stats, left_on="Away Team", right_on="HomeTeam", how="left", suffixes=("_home", "_away"))

# 只选取我们关心的特征
features = ["FTHG", "FTAG", "HS", "AS", "HF", "AF", "HC", "AC"]
remaining_matches_features = remaining_matches_features[[f"{feat}_home" for feat in features] + [f"{feat}_away" for feat in features]]
remaining_matches_features.columns = features * 2

# 填充剩余比赛特征数据的缺失值
remaining_matches_features.fillna(0, inplace=True)

# 训练线性回归模型
original_features = ['FTHG', 'FTAG', 'HS', 'AS', 'HF', 'AF', 'HC', 'AC',
                     'FTHG', 'FTAG', 'HS', 'AS', 'HF', 'AF', 'HC', 'AC']

# 使用均值填充 NaN 值
all_seasons_df_filled = all_seasons_df[original_features].fillna(all_seasons_df[original_features].mean())
target_filled = all_seasons_df[["FTHG", "FTAG"]].fillna(all_seasons_df[["FTHG", "FTAG"]].mean())

lr = LinearRegression()
lr.fit(all_seasons_df_filled, target_filled)

# 预测剩余比赛的比分
unique_remaining_matches_predictions = lr.predict(remaining_matches_features)

# 四舍五入预测结果并将负数设为0
rounded_predictions = np.round(unique_remaining_matches_predictions).clip(min=0)

# 将预测结果添加到剩余比赛数据中
unique_remaining_matches["Predicted Home Goals"] = rounded_predictions[:, 0]
unique_remaining_matches["Predicted Away Goals"] = rounded_predictions[:, 1]

# 将预测结果映射回原始剩余比赛数据
remaining_matches_df = remaining_matches_df.merge(
    unique_remaining_matches[["Home Team", "Away Team", "Predicted Home Goals", "Predicted Away Goals"]],
    on=["Home Team", "Away Team"], how="left")

# 保存包含预测比分的剩余比赛数据
remaining_matches_df.to_csv("Premier_League_data/predicted/predicted_scores.csv", index=False)


  team_stats = all_seasons_df.groupby("HomeTeam").mean()


# XGBoost回归模型进行预测

In [2]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# 加载数据
data_path = "Premier_League_data/all_seasons_data_relevant.csv"
all_seasons_df = pd.read_csv(data_path)

# 计算平均进球数
all_seasons_df['HomeTeam_avg_goals'] = all_seasons_df.groupby('HomeTeam')['FTHG'].transform('mean')
all_seasons_df['AwayTeam_avg_goals'] = all_seasons_df.groupby('AwayTeam')['FTAG'].transform('mean')

# 定义新特征
new_features = ['HomeTeam_avg_goals', 'AwayTeam_avg_goals']

# 删除包含NaN值的行
all_seasons_df = all_seasons_df.dropna(subset=new_features)

# 定义特征和目标变量
X_new = all_seasons_df[new_features]
y_new = all_seasons_df['FTHG'] - all_seasons_df['FTAG']

# 划分训练集和测试集
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X_new, y_new, test_size=0.2, random_state=42)

# 训练新模型
model_new = XGBRegressor(objective='reg:squarederror')
model_new.fit(X_train_new, y_train_new)

# 使用新模型对测试集进行预测并计算均方误差
y_pred_new = model_new.predict(X_test_new)
mse_new = mean_squared_error(y_test_new, y_pred_new)

print("Mean Squared Error: {:.4f}".format(mse_new))


Mean Squared Error: 3.0206


In [4]:
# 加载 remaining_matches_updated.csv 文件
remaining_matches_path = "Premier_League_data/remaining_matches_updated.csv"
remaining_matches_df = pd.read_csv(remaining_matches_path, skiprows=1, names=["Date", "Home Team", "Away Team"])

# 计算主队和客队的平均进球数
home_team_avg_goals = all_seasons_df.groupby('HomeTeam')['FTHG'].mean()
away_team_avg_goals = all_seasons_df.groupby('AwayTeam')['FTAG'].mean()

# 添加主队和客队的平均进球数到 remaining_matches_df
remaining_matches_df['HomeTeam_avg_goals'] = remaining_matches_df['Home Team'].map(home_team_avg_goals)
remaining_matches_df['AwayTeam_avg_goals'] = remaining_matches_df['Away Team'].map(away_team_avg_goals)

# 使用模型预测比分差值
remaining_matches_features = remaining_matches_df[['HomeTeam_avg_goals', 'AwayTeam_avg_goals']]
remaining_matches_df['score_difference'] = model_new.predict(remaining_matches_features)

# 计算预测的比分
remaining_matches_df['predicted_home_goals'] = round(remaining_matches_df['HomeTeam_avg_goals'] + remaining_matches_df['score_difference'])
remaining_matches_df['predicted_away_goals'] = round(remaining_matches_df['AwayTeam_avg_goals'])

# 保存预测结果到 CSV 文件
predicted_scores_path = "Premier_League_data/predicted/predicted_scores_XGBoost.csv"
columns_to_save = ["Date", "Home Team", "Away Team", "predicted_home_goals", "predicted_away_goals"]
remaining_matches_df[columns_to_save].to_csv(predicted_scores_path, index=False)


# 基于随机森林（Random Forest）回归模型的预测方法

In [5]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# 加载数据
data_path = "Premier_League_data/all_seasons_data_relevant.csv"
all_seasons_df = pd.read_csv(data_path)

# 计算平均进球数
all_seasons_df['HomeTeam_avg_goals'] = all_seasons_df.groupby('HomeTeam')['FTHG'].transform('mean')
all_seasons_df['AwayTeam_avg_goals'] = all_seasons_df.groupby('AwayTeam')['FTAG'].transform('mean')

# 定义新特征
new_features = ['HomeTeam_avg_goals', 'AwayTeam_avg_goals']

# 删除包含NaN值的行
all_seasons_df = all_seasons_df.dropna(subset=new_features)

# 定义特征和目标变量
X_new = all_seasons_df[new_features]
y_new = all_seasons_df['FTHG'] - all_seasons_df['FTAG']

# 划分训练集和测试集
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X_new, y_new, test_size=0.2, random_state=42)

# 训练新模型
model_new = RandomForestRegressor(n_estimators=100, random_state=42)
model_new.fit(X_train_new, y_train_new)

# 使用新模型对测试集进行预测并计算均方误差
y_pred_new = model_new.predict(X_test_new)
mse_new = mean_squared_error(y_test_new, y_pred_new)

print("Mean Squared Error: {:.4f}".format(mse_new))

# 加载 remaining_matches_updated.csv 文件
remaining_matches_path = "Premier_League_data/remaining_matches_updated.csv"
remaining_matches_df = pd.read_csv(remaining_matches_path, skiprows=1, names=["Date", "Home Team", "Away Team"])

# 计算主队和客队的平均进球数
home_team_avg_goals = all_seasons_df.groupby('HomeTeam')['FTHG'].mean()
away_team_avg_goals = all_seasons_df.groupby('AwayTeam')['FTAG'].mean()

# 添加主队和客队的平均进球数到 remaining_matches_df
remaining_matches_df['HomeTeam_avg_goals'] = remaining_matches_df['Home Team'].map(home_team_avg_goals)
remaining_matches_df['AwayTeam_avg_goals'] = remaining_matches_df['Away Team'].map(away_team_avg_goals)

# 使用模型预测比分差值
remaining_matches_features = remaining_matches_df[['HomeTeam_avg_goals', 'AwayTeam_avg_goals']]
remaining_matches_df['score_difference'] = model_new.predict(remaining_matches_features)

# 计算预测的比分
remaining_matches_df['predicted_home_goals'] = round(remaining_matches_df['HomeTeam_avg_goals'] + remaining_matches_df['score_difference'])
remaining_matches_df['predicted_away_goals'] = round(remaining_matches_df['AwayTeam_avg_goals'])

# 保存预测结果到 CSV 文件
predicted_scores_path = "Premier_League_data/predicted/predicted_scores_rf.csv"
columns_to_save = ["Date", "Home Team", "Away Team", "predicted_home_goals", "predicted_away_goals"]
remaining_matches_df[columns_to_save].to_csv(predicted_scores_path, index=False)


Mean Squared Error: 3.2074


# 基于神经网络的完整预测代码，使用了 Keras 和 TensorFlow 库

In [6]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [7]:
pip install --upgrade tensorflow

Note: you may need to restart the kernel to use updated packages.


In [8]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [9]:
pip install -U scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [10]:
import tensorflow as tf
print("TensorFlow version:", tf.__version__)

TensorFlow version: 2.11.1


In [25]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 加载数据
data_path = "Premier_League_data/all_seasons_data_relevant.csv"
all_seasons_df = pd.read_csv(data_path)

# 计算平均进球数
all_seasons_df['HomeTeam_avg_goals'] = all_seasons_df.groupby('HomeTeam')['FTHG'].transform('mean')
all_seasons_df['AwayTeam_avg_goals'] = all_seasons_df.groupby('AwayTeam')['FTAG'].transform('mean')

# 定义新特征
new_features = ['HomeTeam_avg_goals', 'AwayTeam_avg_goals']

# 删除包含 NaN 值的行
all_seasons_df = all_seasons_df.dropna(subset=new_features)

# 定义特征和目标变量
X_new = all_seasons_df[new_features]
y_new = all_seasons_df['FTHG'] - all_seasons_df['FTAG']

# 划分训练集和测试集
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X_new, y_new, test_size=0.2, random_state=42)

# 数据标准化
scaler = StandardScaler()
X_train_new_scaled = scaler.fit_transform(X_train_new)
X_test_new_scaled = scaler.transform(X_test_new)

# 构建神经网络模型
model = keras.Sequential([
    keras.layers.Dense(32, activation='relu', input_shape=(X_train_new_scaled.shape[1],)),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(1)
])

# 编译模型
model.compile(optimizer='adam', loss='mean_squared_error')

# 训练模型
model.fit(X_train_new_scaled, y_train_new, epochs=50, verbose=0)

# 预测
y_pred_new = model.predict(X_test_new_scaled)

# 计算均方误差
mse_new = np.mean((y_test_new - y_pred_new.reshape(-1))**2)
print("Mean Squared Error: {:.4f}".format(mse_new))

# 加载 remaining_matches_updated.csv 文件
remaining_matches_path = "Premier_League_data/remaining_matches_updated.csv"
remaining_matches_df = pd.read_csv(remaining_matches_path)

# 计算主队和客队的平均进球数
home_team_avg_goals = all_seasons_df.groupby('HomeTeam')['FTHG'].mean()
away_team_avg_goals = all_seasons_df.groupby('AwayTeam')['FTAG'].mean()

# 添加主队和客队的平均进球数到 remaining_matches_df
remaining_matches_df['HomeTeam_avg_goals'] = remaining_matches_df['Home Team'].map(home_team_avg_goals)
remaining_matches_df['AwayTeam_avg_goals'] = remaining_matches_df['Away Team'].map(away_team_avg_goals)

# 使用模型预测比分差值
remaining_matches_features = remaining_matches_df[['HomeTeam_avg_goals', 'AwayTeam_avg_goals']]
remaining_matches_features_scaled = scaler.transform(remaining_matches_features)
remaining_matches_df['score_difference'] = model.predict(remaining_matches_features_scaled).reshape(-1)

remaining_matches_df['predicted_home_goals'] = (remaining_matches_df['HomeTeam_avg_goals'] + remaining_matches_df['score_difference'] / 2).round()
remaining_matches_df['predicted_away_goals'] = (remaining_matches_df['AwayTeam_avg_goals'] - remaining_matches_df['score_difference'] / 2).round()

# 将负数进球数修正为0
remaining_matches_df['predicted_home_goals'] = np.maximum(remaining_matches_df['predicted_home_goals'], 0)
remaining_matches_df['predicted_away_goals'] = np.maximum(remaining_matches_df['predicted_away_goals'], 0)

# 添加预测结果
remaining_matches_df['predicted_result'] = np.where(remaining_matches_df['predicted_home_goals'] > remaining_matches_df['predicted_away_goals'], 'H',
                                                     np.where(remaining_matches_df['predicted_home_goals'] < remaining_matches_df['predicted_away_goals'], 'A', 'D'))

# 仅显示需要的列
final_predictions = remaining_matches_df[['Date', 'Home Team', 'Away Team', 'predicted_home_goals', 'predicted_away_goals', 'predicted_result']]

# 打印预测结果
# print(final_predictions)

# 保存预测结果到 CSV 文件
final_predictions.to_csv("Premier_League_data/predicted/predicted_results_k_t.csv", index=False)


Mean Squared Error: 2.6634


In [19]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 加载数据
data_path = "Premier_League_data/all_seasons_data_relevant.csv"
all_seasons_df = pd.read_csv(data_path)

# 计算平均进球数
all_seasons_df['HomeTeam_avg_goals'] = all_seasons_df.groupby('HomeTeam')['FTHG'].transform('mean')
all_seasons_df['AwayTeam_avg_goals'] = all_seasons_df.groupby('AwayTeam')['FTAG'].transform('mean')

# 添加主场优势特征
all_seasons_df['HomeTeam_advantage'] = all_seasons_df['HomeTeam_avg_goals'] - all_seasons_df['AwayTeam_avg_goals']

# 添加近期表现特征（过去5场比赛）
all_seasons_df['HomeTeam_recent_performance'] = all_seasons_df.groupby('HomeTeam')['FTHG'].rolling(5).mean().reset_index(0, drop=True)
all_seasons_df['AwayTeam_recent_performance'] = all_seasons_df.groupby('AwayTeam')['FTAG'].rolling(5).mean().reset_index(0, drop=True)

# 定义新特征列表
new_features = ['HomeTeam_avg_goals', 'AwayTeam_avg_goals', 'HomeTeam_advantage', 'HomeTeam_recent_performance', 'AwayTeam_recent_performance']

# 填充 NaN 值为平均值
all_seasons_df[new_features] = all_seasons_df[new_features].fillna(all_seasons_df[new_features].mean())

# 定义特征和目标变量
X_new = all_seasons_df[new_features]
y_new = all_seasons_df['FTHG'] - all_seasons_df['FTAG']

# 划分训练集和测试集
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X_new, y_new, test_size=0.2, random_state=42)

# 数据标准化
scaler = StandardScaler()
X_train_new_scaled = scaler.fit_transform(X_train_new)
X_test_new_scaled = scaler.transform(X_test_new)

# 构建神经网络模型
model = keras.Sequential([
    keras.layers.Dense(64, activation='relu', input_shape=(X_train_new_scaled.shape[1],)),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(1)
])

# 编译模型
optimizer = keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='mean_squared_error')

# 训练模型
model.fit(X_train_new_scaled, y_train_new, epochs=100, batch_size=32, verbose=0)

# 预测
y_pred_new = model.predict(X_test_new_scaled)

# 计算均方误差
mse_new = np.mean((y_test_new - y_pred_new.reshape(-1))**2)
print("Mean Squared Error: {:.4f}".format(mse_new))

# 加载 remaining_matches_updated.csv 文件
remaining_matches_path = "Premier_League_data/remaining_matches_updated.csv"
remaining_matches_df = pd.read_csv(remaining_matches_path)

# 计算主队和客队的平均进球数
home_team_avg_goals = all_seasons_df.groupby('HomeTeam')['FTHG'].mean()
away_team_avg_goals = all_seasons_df.groupby('AwayTeam')['FTAG'].mean()

# 计算近期表现（过去5场比赛）
home_team_recent_performance = all_seasons_df.groupby('HomeTeam')['FTHG'].rolling(5).mean().groupby(level=0).last().reset_index(0, drop=True)
away_team_recent_performance = all_seasons_df.groupby('AwayTeam')['FTAG'].rolling(5).mean().groupby(level=0).last().reset_index(0, drop=True)

# 使用提供的列名
home_team_column_name = 'Home Team'
away_team_column_name = 'Away Team'

# 使用新的列名
remaining_matches_df['HomeTeam_avg_goals'] = remaining_matches_df[home_team_column_name].map(home_team_avg_goals).fillna(home_team_avg_goals.mean())
remaining_matches_df['AwayTeam_avg_goals'] = remaining_matches_df[away_team_column_name].map(away_team_avg_goals).fillna(away_team_avg_goals.mean())
remaining_matches_df['HomeTeam_advantage'] = remaining_matches_df['HomeTeam_avg_goals'] - remaining_matches_df['AwayTeam_avg_goals']

remaining_matches_df['HomeTeam_recent_performance'] = remaining_matches_df[home_team_column_name].map(home_team_recent_performance.to_dict()).fillna(home_team_recent_performance.mean())
remaining_matches_df['AwayTeam_recent_performance'] = remaining_matches_df[away_team_column_name].map(away_team_recent_performance.to_dict()).fillna(away_team_recent_performance.mean())
remaining_matches_df['Performance_difference'] = remaining_matches_df['HomeTeam_recent_performance'] - remaining_matches_df['AwayTeam_recent_performance']

# 填充 NaN 值为平均值
remaining_matches_df[new_features] = remaining_matches_df[new_features].fillna(remaining_matches_df[new_features].mean())

# 提取特征
X_remaining = remaining_matches_df[new_features]

# 标准化数据
X_remaining_scaled = scaler.transform(X_remaining)

# 预测
y_remaining_pred = model.predict(X_remaining_scaled)

# 将预测结果添加到 remaining_matches_df
remaining_matches_df['predicted_outcome'] = y_remaining_pred

# 计算预测后的主队和客队进球数
remaining_matches_df['predicted_home_goals'] = np.round(remaining_matches_df['HomeTeam_avg_goals'] + y_remaining_pred.flatten() / 2)
remaining_matches_df['predicted_away_goals'] = np.round(remaining_matches_df['AwayTeam_avg_goals'] - y_remaining_pred.flatten() / 2)

# 将负数预测值修正为 0
remaining_matches_df['predicted_home_goals'] = remaining_matches_df['predicted_home_goals'].apply(lambda x: max(0, x))
remaining_matches_df['predicted_away_goals'] = remaining_matches_df['predicted_away_goals'].apply(lambda x: max(0, x))

# 将预测结果保存到文件
remaining_matches_df[['Home Team', 'Away Team', 'predicted_home_goals', 'predicted_away_goals']].to_csv("Premier_League_data/predicted/predicted_results_k_t_new_1.csv", index=False)

Mean Squared Error: 2.0747


# PyTorch构建一个神经网络模型来预测英超联赛比赛的结果