In [29]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score

In [30]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [31]:
matches = pd.read_csv("premierleague_team_data.csv")

In [32]:
test_matches = pd.read_csv("premierleague_test_team_data.csv")

In [33]:
matches["Team"].value_counts()

Team
ManchesterCity           190
ManchesterUnited         190
TottenhamHotspur         190
Liverpool                190
Chelsea                  190
Arsenal                  190
Everton                  190
CrystalPalace            190
Southampton              190
SwanseaCity              190
StokeCity                190
WestHamUnited            190
WestBromwichAlbion       190
NewcastleUnited          152
LeicesterCity            152
Burnley                  114
Bournemouth              114
Watford                  114
Sunderland               114
HullCity                 114
AstonVilla               114
NorwichCity               76
BrightonandHoveAlbion     38
HuddersfieldTown          38
Middlesbrough             38
QueensParkRangers         38
Fulham                    38
CardiffCity               38
Name: count, dtype: int64

In [34]:
matches["Date"] = pd.to_datetime(matches["Date"])

In [35]:
test_matches["Date"] = pd.to_datetime(test_matches["Date"])

In [36]:
 matches["Venue_code"] = matches["Venue"].astype("category").cat.codes

In [37]:
test_matches["Venue_code"] = test_matches["Venue"].astype("category").cat.codes

In [38]:
matches["Opp_code"] = matches["Opponent"].astype("category").cat.codes 

In [39]:
test_matches["Opp_code"] = test_matches["Opponent"].astype("category").cat.codes 

In [40]:
matches["Hour"] = matches["Time"].str.replace(":.+", "", regex=True).fillna("0").astype("int")

In [41]:
test_matches["Hour"] = test_matches["Time"].str.replace(":.+", "", regex=True).fillna("0").astype("int")

In [42]:
matches["Day_code"] = matches["Date"].dt.dayofweek

In [43]:
test_matches["Day_code"] = test_matches["Date"].dt.dayofweek

In [44]:
matches["Target"]=(matches["Result"] == "W").astype("int")

In [45]:
test_matches["Target"]=(test_matches["Result"] == "W").astype("int")

In [48]:
# Load training and testing data
train = matches
test = test_matches

# Sort data by date
train = train.sort_values(by='Date')
test = test.sort_values(by='Date')

# Model training
predictors = ["Venue_code", "Opp_code", "Hour", "Day_code"]
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)
rf.fit(train[predictors], train["Target"])

# Make predictions
preds = rf.predict(test[predictors])
    
# Test the model on each yearly split
results = []

 # Calculate metrics
accuracy = accuracy_score(test["Target"], preds)
precision = precision_score(test["Target"], preds, average="weighted")
print(f"Metrics: Accuracy = {accuracy:.4f}, Precision = {precision:.4f}")


Metrics: Accuracy = 0.5875, Precision = 0.5730


In [61]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score

# Load training and testing data
train = matches
test = test_matches


# Sort data by date
train = train.sort_values(by='Date')
test = test.sort_values(by='Date')

# Create yearly splits for testing
test_splits = {}
for year in range(test['Date'].dt.year.min(), test['Date'].dt.year.max() + 1):
    yearly_data = test[test['Date'].dt.year == year]
    if not yearly_data.empty:
        test_splits[year] = yearly_data

# Output available years
print("Available years for testing data:", list(test_splits.keys()))

# Model training
predictors = ["Venue_code", "Opp_code", "Hour", "Day_code"]
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

rf.fit(train[predictors], train["Target"])

# Test the model on each yearly split
results = []
for year, test_data in test_splits.items():
    # Make predictions
    preds = rf.predict(test_data[predictors])
    
    # Calculate metrics
    accuracy = accuracy_score(test_data["Target"], preds)
    precision = precision_score(test_data["Target"], preds, average="weighted")
    
    # Store results
    results.append({"Year": year, "Accuracy": accuracy, "Precision": precision})

    print(f"Year {year}: Accuracy = {accuracy:.4f}, Precision = {precision:.4f}")

# Convert results to DataFrame for better visualization
results_df = pd.DataFrame(results)
print("\nOverall Results:")
print(results_df)

Available years for testing data: [2019, 2020, 2021, 2022, 2023]
Year 2019: Accuracy = 0.5980, Precision = 0.5796
Year 2020: Accuracy = 0.5952, Precision = 0.5831
Year 2021: Accuracy = 0.5784, Precision = 0.5616
Year 2022: Accuracy = 0.5859, Precision = 0.5747
Year 2023: Accuracy = 0.5856, Precision = 0.5699

Overall Results:
   Year  Accuracy  Precision
0  2019  0.597990   0.579615
1  2020  0.595238   0.583052
2  2021  0.578431   0.561561
3  2022  0.585873   0.574678
4  2023  0.585648   0.569919


In [62]:
grouped_matches = matches.groupby("Team")

In [63]:
grouped_test_matches = test_matches.groupby("Team")

In [64]:
def rolling_averages(group, cols, new_cols):
    # Sort the group by the "Date" column to ensure chronological order
    group = group.sort_values("Date")
    
    # Calculate rolling averages over a window of 3 rows, excluding the current row
    # (e.g., for row N, it computes the average of rows N-1, N-2, and N-3)
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    
    # Assign the calculated rolling averages to new columns in the group
    group[new_cols] = rolling_stats
    
    # Drop rows where the new rolling average columns contain NaN values
    # (occurs when there aren't enough previous rows to calculate the average)
    group = group.dropna(subset=new_cols)
    
    return group

In [65]:
cols = [ "GF","GA","Sh", "SoT", "PK", "PKatt"]
new_cols = [f"{c}_rolling" for c in cols]

In [66]:
new_cols

['GF_rolling',
 'GA_rolling',
 'Sh_rolling',
 'SoT_rolling',
 'PK_rolling',
 'PKatt_rolling']

In [67]:
results = []
for team, group in matches.groupby("Team"):
    # Apply rolling averages to each group
    result = rolling_averages(group, cols, new_cols)
    results.append(result)

# Concatenate all results into one DataFrame
matches_rolling = pd.concat(results)

In [68]:
test_results = []
for team, group in test_matches.groupby("Team"):
    # Apply rolling averages to each group
    result = rolling_averages(group, cols, new_cols)
    results.append(result)

# Concatenate all results into one DataFrame
test_matches_rolling = pd.concat(results)

In [69]:
matches_rolling.index = range(matches_rolling.shape[0])

In [70]:
test_matches_rolling.index = range(test_matches_rolling.shape[0])

In [71]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, accuracy_score
import pandas as pd

def calculate_rolling_features(df, columns, window_size):
    for col in columns:
        df[f"{col}_rolling"] = df[col].rolling(window=window_size, min_periods=1).mean()
    return df

def make_yearly_predictions_with_rolling(train, test, predictors, rolling_predictors):
    train['Date'] = pd.to_datetime(train['Date'], errors='coerce')
    test['Date'] = pd.to_datetime(test['Date'], errors='coerce')
    train = train.dropna(subset=['Date']).sort_values(by='Date')
    test = test.dropna(subset=['Date']).sort_values(by='Date')

    all_predictors = predictors + rolling_predictors

    yearly_results = []
    for year in range(test['Date'].dt.year.min(), test['Date'].dt.year.max() + 1):
        test_year = test[test['Date'].dt.year == year]
        if not test_year.empty:
            print(f"\nTesting on year: {year} with {len(test_year)} matches.")
            rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)
            rf.fit(train[all_predictors], train["Target"])
            preds = rf.predict(test_year[all_predictors])
            precision = precision_score(test_year["Target"], preds, average="weighted")
            accuracy = accuracy_score(test_year["Target"], preds)
            combined = pd.DataFrame({
                "Year": year,
                "Actual": test_year["Target"],
                "Prediction": preds,
                "Precision": [precision] * len(test_year),
                "Accuracy": [accuracy] * len(test_year)
            })
            yearly_results.append(combined)
            print(f"Year {year}: Precision = {precision:.4f}, Accuracy = {accuracy:.4f}")

    results = pd.concat(yearly_results, ignore_index=True)
    return results


train = matches
test = test_matches

# Define predictors
static_predictors = ["Venue_code", "Opp_code", "Hour", "Day_code"]
rolling_predictors = ["GF_rolling", "GA_rolling", "Sh_rolling", "SoT_rolling", "PK_rolling", "PKatt_rolling"]

# Generate rolling features
feature_columns = ["GF", "GA", "Sh", "SoT", "PK", "PKatt"]
train = calculate_rolling_features(train, feature_columns, window_size=5)
test = calculate_rolling_features(test, feature_columns, window_size=5)

# Run the prediction function
results = make_yearly_predictions_with_rolling(train, test, static_predictors, rolling_predictors)

# Display results
print("\nYearly Predictions and Metrics:")
print(results.groupby("Year")[["Precision", "Accuracy"]].mean())




Testing on year: 2019 with 398 matches.
Year 2019: Precision = 0.7096, Accuracy = 0.7161

Testing on year: 2020 with 672 matches.
Year 2020: Precision = 0.6878, Accuracy = 0.6964

Testing on year: 2021 with 816 matches.
Year 2021: Precision = 0.6987, Accuracy = 0.7047

Testing on year: 2022 with 722 matches.
Year 2022: Precision = 0.6952, Accuracy = 0.7008

Testing on year: 2023 with 432 matches.
Year 2023: Precision = 0.7043, Accuracy = 0.7106

Yearly Predictions and Metrics:
      Precision  Accuracy
Year                     
2019   0.709591  0.716080
2020   0.687821  0.696429
2021   0.698709  0.704657
2022   0.695169  0.700831
2023   0.704331  0.710648
