In [10]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from ipywidgets import Button, Output, VBox, Dropdown, Text
from IPython.display import display

In [11]:
# Columns removed from both CSV exports before they are stacked.
common_drop_cols = ['Unnamed: 0', 'notes', 'match report', 'formation', 'pk', 'pkatt']

df1 = pd.read_csv('matches2.csv')
df2 = pd.read_csv('matches.csv')

# Only matches2.csv additionally has 'opp formation' dropped
# (presumably that column is absent from matches.csv -- confirm).
df1 = df1.drop(columns=common_drop_cols + ['opp formation'])
df2 = df2.drop(columns=common_drop_cols)

# Stack the two exports row-wise; each frame keeps its own original index labels.
matches = pd.concat((df1, df2), axis=0)

In [12]:
# Parse match dates; values that cannot be parsed become NaT instead of raising.
matches['date'] = pd.to_datetime(matches['date'], errors='coerce')

# dropna returns a new frame, so the result has to be assigned back. Without the
# assignment the rows with missing dates stay in `matches` for every later cell.
matches = matches.dropna(subset=['date'])

# Last expression of the cell: display the cleaned frame.
matches


Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,poss,attendance,captain,referee,sh,sot,dist,fk,season,team
0,2024-08-17,12:30,Premier League,Matchweek 1,Sat,Away,W,2.0,0.0,Ipswich Town,...,62.0,30014.0,Virgil van Dijk,Tim Robinson,18.0,5.0,14.8,0.0,2025.0,Liverpool
1,2024-08-25,16:30,Premier League,Matchweek 2,Sun,Home,W,2.0,0.0,Brentford,...,62.0,60017.0,Virgil van Dijk,Stuart Attwell,19.0,8.0,13.6,1.0,2025.0,Liverpool
2,2024-09-01,16:00,Premier League,Matchweek 3,Sun,Away,W,3.0,0.0,Manchester Utd,...,47.0,73738.0,Virgil van Dijk,Anthony Taylor,11.0,3.0,13.4,0.0,2025.0,Liverpool
3,2024-09-14,15:00,Premier League,Matchweek 4,Sat,Home,L,0.0,1.0,Nott'ham Forest,...,68.0,60344.0,Virgil van Dijk,Michael Oliver,14.0,5.0,14.9,0.0,2025.0,Liverpool
4,2024-09-21,15:00,Premier League,Matchweek 5,Sat,Home,W,3.0,0.0,Bournemouth,...,58.0,60347.0,Virgil van Dijk,Tony Harrington,19.0,12.0,16.6,0.0,2025.0,Liverpool
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1384,2021-05-02,19:15,Premier League,Matchweek 34,Sun,Away,L,0.0,4.0,Tottenham,...,34.0,,John Egan,Andre Marriner,8.0,1.0,17.4,0.0,2021.0,Sheffield United
1385,2021-05-08,15:00,Premier League,Matchweek 35,Sat,Home,L,0.0,2.0,Crystal Palace,...,50.0,,John Egan,Simon Hooper,7.0,0.0,11.4,1.0,2021.0,Sheffield United
1386,2021-05-16,19:00,Premier League,Matchweek 36,Sun,Away,W,1.0,0.0,Everton,...,38.0,,John Egan,Jonathan Moss,10.0,3.0,17.0,0.0,2021.0,Sheffield United
1387,2021-05-19,18:00,Premier League,Matchweek 37,Wed,Away,L,0.0,1.0,Newcastle Utd,...,50.0,10000.0,John Egan,Robert Jones,11.0,1.0,16.0,1.0,2021.0,Sheffield United


In [13]:

# Binary target encoding: a win is 0, a loss or a draw is 1.
result_codes = {'W': 0, 'L': 1, 'D': 1}
# Venue encoding: home is 0, away is 1.
venue_codes = {'Home': 0, 'Away': 1}

matches['result_encoded'] = matches['result'].map(result_codes)
matches['venue_encoded'] = matches['venue'].map(venue_codes)

# Replace the referee column with integer labels; `le` keeps the fitted mapping.
le = LabelEncoder()
matches['referee'] = le.fit_transform(matches['referee'])


In [14]:
def calculate_last_5_stats(team, date):
    """
    Summarise a team's form over its most recent matches played before ``date``.

    Reads the module-level ``matches`` frame (columns ``team``, ``date``,
    ``result``, ``sh``, ``sot``, ``gf``, ``ga``) and considers up to five of the
    team's rows dated strictly earlier than ``date``. When fewer than five such
    rows exist, the statistics cover only the rows that do exist.

    Parameters
    ----------
    team : value compared against ``matches['team']``
    date : value compared against ``matches['date']``

    Returns
    -------
    tuple
        ``(wins, avg_shots_for, avg_shots_on_target_for, avg_gf, avg_ga)``.
        All five values are 0 when the team has no earlier match.
    """
    earlier = matches[(matches['team'] == team) & (matches['date'] < date)]

    # Order by date here instead of relying on the caller having sorted
    # ``matches``: tail(5) only yields the five most recent matches when the
    # rows are in chronological order.
    past_matches = earlier.sort_values('date', kind='stable').tail(5)

    total_matches = len(past_matches)
    if total_matches == 0:
        return 0, 0, 0, 0, 0

    wins = (past_matches['result'] == 'W').sum()

    # Shots and shots on target. Sums skip missing values, while the divisor
    # counts every match in the window.
    avg_shots_for = past_matches['sh'].sum() / total_matches
    avg_shots_on_target_for = past_matches['sot'].sum() / total_matches

    # Goals for and goals against.
    avg_gf = past_matches['gf'].sum() / total_matches
    avg_ga = past_matches['ga'].sum() / total_matches

    return wins, avg_shots_for, avg_shots_on_target_for, avg_gf, avg_ga

# Names of the rolling-form columns, in the order calculate_last_5_stats returns them.
last5_columns = ['Last5Wins', 'Last5AvgSh', 'Last5AvgSot', 'Last5AvgGf', 'Last5AvgGa']


def _last5_row_stats(row):
    """Return the form statistics for one match row as a Series."""
    return pd.Series(calculate_last_5_stats(row['team'], row['date']))


# Sort by team, then date, and renumber the index from zero.
matches = matches.sort_values(by=['team', 'date']).reset_index(drop=True)

# One calculate_last_5_stats call per row; the resulting frame fills the five columns.
matches[last5_columns] = matches.apply(_last5_row_stats, axis=1)



In [15]:
# Split the per-team rows by venue, then pair a home row with an away row when
# they share date, time, referee, competition and weekday.
home_side = matches[matches["venue"] == "Home"]
away_side = matches[matches["venue"] == "Away"]
merged_df = home_side.merge(
    away_side,
    on=["date", "time", "referee", "comp", "day"],
    suffixes=("_home", "_away"),
)

# Per-side columns to keep; each one is selected with the _home and the _away suffix.
per_side_cols = [
    "gf", "ga", "sh", "sot",
    "Last5Wins", "Last5AvgSh", "Last5AvgSot", "Last5AvgGf", "Last5AvgGa",
]
kept_cols = ["date", "referee", "team_home", "team_away", "result_encoded_home"]
kept_cols += [f"{col}_home" for col in per_side_cols]
kept_cols += [f"{col}_away" for col in per_side_cols]

merged_df = merged_df[kept_cols]

In [16]:


merged_df['date'] = pd.to_datetime(merged_df['date'])

# For every team_home, keep the row with the most recent date.
latest_row_labels = merged_df.groupby('team_home')['date'].idxmax()
latest_rows = merged_df.loc[latest_row_labels]

# The team name columns are removed from merged_df from here on. Re-running this
# cell on its own therefore fails: the groupby above needs team_home.
merged_df = merged_df.drop(columns=["team_home", "team_away"])


In [17]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Model inputs: the encoded referee plus each side's statistics over its previous matches.
feature_cols = [
    "referee",
    'Last5Wins_home', 'Last5AvgSh_home', 'Last5AvgSot_home', 'Last5AvgGf_home', 'Last5AvgGa_home',
    'Last5Wins_away', 'Last5AvgSh_away', 'Last5AvgSot_away', 'Last5AvgGf_away', 'Last5AvgGa_away'
]

# Target column: the home side's encoded result (0 = win, 1 = loss or draw).
target_col = "result_encoded_home"

x = merged_df[feature_cols]
y = merged_df[target_col]

# 80/20 random split with a fixed seed. (A bare `x_test` expression used to sit
# here; it displayed nothing because only a cell's last expression is shown.)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10)

# Initialise the random forest model.
rf = RandomForestClassifier(n_estimators=200, min_samples_leaf=4, min_samples_split=10, random_state=10, max_depth=10)

# Train the model.
rf.fit(x_train, y_train)

y_pred = rf.predict(x_test)

# Evaluate the model on the held-out split.
accuracy = accuracy_score(y_test, y_pred)
print(f"测试集准确率: {accuracy:.2f}")

print("分类报告:")
print(classification_report(y_test, y_pred))

测试集准确率: 0.64
分类报告:
              precision    recall  f1-score   support

         0.0       0.58      0.44      0.50       102
         1.0       0.67      0.78      0.72       148

    accuracy                           0.64       250
   macro avg       0.63      0.61      0.61       250
weighted avg       0.64      0.64      0.63       250



In [20]:


# Build the single-row model input for a chosen fixture
def save_team_data(home_team_name, away_team_name, referee_name):
    """
    Assemble a one-row feature frame for a fixture from ``latest_rows``.

    Both teams are looked up in ``latest_rows`` by its ``team_home`` column, so
    the away side's ``*_away`` features are copied from the ``*_home`` columns
    of that team's own latest home fixture.

    Parameters
    ----------
    home_team_name, away_team_name : values compared against ``latest_rows['team_home']``
    referee_name : stored unchanged in the ``referee`` column

    Returns
    -------
    pandas.DataFrame
        One row with ``referee``, the five ``Last5*_home`` columns and the five
        ``Last5*_away`` columns, or an empty frame when either team has no row.
    """
    home_rows = latest_rows[latest_rows['team_home'] == home_team_name]
    away_rows = latest_rows[latest_rows['team_home'] == away_team_name]

    if home_rows.empty or away_rows.empty:
        return pd.DataFrame()

    home_latest = home_rows.iloc[0]
    away_latest = away_rows.iloc[0]

    stat_names = ("Last5Wins", "Last5AvgSh", "Last5AvgSot", "Last5AvgGf", "Last5AvgGa")

    features = {"referee": referee_name}
    for stat in stat_names:
        features[f"{stat}_home"] = home_latest[f"{stat}_home"]
    for stat in stat_names:
        # Away-side values are read from the away team's *_home columns.
        features[f"{stat}_away"] = away_latest[f"{stat}_home"]

    return pd.DataFrame([features])


# Output widget that on_button_click clears and prints into; it is also placed
# in the VBox displayed at the end of this cell.
output = Output()

# Button click event handler
def on_button_click(home_team_name, away_team_name, referee_name):
    """
    Build the feature row for the selected fixture, run the model, and print the outcome.

    Everything is written into the module-level ``output`` widget, which is
    cleared first.

    Parameters
    ----------
    home_team_name, away_team_name : str
        Team names passed on to ``save_team_data``.
    referee_name : str
        Referee name as typed by the user. It is shown as typed in the
        "Matched data" table and converted to its integer label with the
        fitted encoder ``le`` before prediction, because the model was trained
        on the encoded referee column and cannot accept the raw string.

    Returns
    -------
    None
    """
    output.clear_output()
    with output:
        if not (home_team_name and away_team_name and referee_name):
            # Say why nothing happens instead of silently ignoring the click.
            print("Please select both teams and enter a referee name.")
            return

        result_df = save_team_data(home_team_name, away_team_name, referee_name)
        if result_df.empty:
            print(f"No rows found for home team '{home_team_name}' or away team '{away_team_name}'.")
            return

        print("Matched data")
        display(result_df)

        # The encoder raises ValueError for a label it was not fitted on.
        try:
            referee_code = le.transform([referee_name])[0]
        except ValueError:
            print(f"Unknown referee '{referee_name}'; enter a referee that appears in the match data.")
            return

        # Predict on a copy so the displayed frame keeps the readable name.
        model_input = result_df.copy()
        model_input["referee"] = referee_code

        # Use the model to predict the result (class 0 is a home win).
        predicted_result = rf.predict(model_input)
        result_label = "w" if predicted_result[0] == 0 else "D/L"
        print(f"Prediction result: {result_label}")


# Both dropdowns offer the same choices: every team present in latest_rows['team_home'].
team_options = latest_rows['team_home'].unique()
home_team_dropdown = Dropdown(options=team_options, description="Home Team:")
away_team_dropdown = Dropdown(options=team_options, description="Away Team:")

# Free-text field for the referee name
referee_input = Text(description="Referee:", placeholder="Enter referee name")


def _handle_predict_click(_button):
    """Forward the current widget values to on_button_click."""
    on_button_click(home_team_dropdown.value, away_team_dropdown.value, referee_input.value)


# Button that triggers the prediction
predict_button = Button(description="Predict Result")
predict_button.on_click(_handle_predict_click)

# Show the controls stacked vertically, with the output area last
display(VBox([home_team_dropdown, away_team_dropdown, referee_input, predict_button, output]))


VBox(children=(Dropdown(description='Home Team:', options=('Arsenal', 'Aston Villa', 'Bournemouth', 'Brentford…