In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score

In [2]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [3]:
matches = pd.read_csv("premierleague_team_data.csv")

In [4]:
test_matches = pd.read_csv("premierleague_test_team_data.csv")

In [5]:
matches["Team"].value_counts()

Team
ManchesterCity           190
ManchesterUnited         190
TottenhamHotspur         190
Liverpool                190
Chelsea                  190
Arsenal                  190
Everton                  190
CrystalPalace            190
Southampton              190
SwanseaCity              190
StokeCity                190
WestHamUnited            190
WestBromwichAlbion       190
Sunderland               152
LeicesterCity            152
NewcastleUnited          152
Bournemouth              114
Burnley                  114
HullCity                 114
Watford                  114
AstonVilla               114
NorwichCity               76
BrightonandHoveAlbion     38
HuddersfieldTown          38
Middlesbrough             38
QueensParkRangers         38
Fulham                    38
CardiffCity               38
Name: count, dtype: int64

In [6]:
matches["Date"] = pd.to_datetime(matches["Date"])

In [7]:
test_matches["Date"] = pd.to_datetime(test_matches["Date"])

In [8]:
 matches["Venue_code"] = matches["Venue"].astype("category").cat.codes

In [9]:
test_matches["Venue_code"] = test_matches["Venue"].astype("category").cat.codes

In [10]:
matches["Opp_code"] = matches["Opponent"].astype("category").cat.codes 

In [11]:
test_matches["Opp_code"] = test_matches["Opponent"].astype("category").cat.codes 

In [12]:
matches["Hour"] = matches["Time"].str.replace(":.+", "", regex=True).fillna("0").astype("int")

In [13]:
test_matches["Hour"] = test_matches["Time"].str.replace(":.+", "", regex=True).fillna("0").astype("int")

In [14]:
matches["Day_code"] = matches["Date"].dt.dayofweek

In [15]:
test_matches["Day_code"] = test_matches["Date"].dt.dayofweek

In [16]:
matches["Target"]=(matches["Result"] == "W").astype("int")

In [17]:
test_matches["Target"]=(test_matches["Result"] == "W").astype("int")

In [18]:
# Load training and testing data
train = matches
test = test_matches

# Sort data by date
train = train.sort_values(by='Date')
test = test.sort_values(by='Date')

# Model training
predictors = ["Venue_code", "Opp_code", "Hour", "Day_code"]
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)
rf.fit(train[predictors], train["Target"])

# Make predictions
preds = rf.predict(test[predictors])
    
# Test the model on each yearly split
results = []

 # Calculate metrics
accuracy = accuracy_score(test["Target"], preds)
precision = precision_score(test["Target"], preds, average="weighted")
print(f"Metrics: Accuracy = {accuracy:.4f}, Precision = {precision:.4f}")


Metrics: Accuracy = 0.5882, Precision = 0.5740


In [19]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score

# Load training and testing data
train = matches
test = test_matches


# Sort data by date
train = train.sort_values(by='Date')
test = test.sort_values(by='Date')

# Create yearly splits for testing
test_splits = {}
for year in range(test['Date'].dt.year.min(), test['Date'].dt.year.max() + 1):
    yearly_data = test[test['Date'].dt.year == year]
    if not yearly_data.empty:
        test_splits[year] = yearly_data

# Output available years
print("Available years for testing data:", list(test_splits.keys()))

# Model training
predictors = ["Venue_code", "Opp_code", "Hour", "Day_code"]
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

rf.fit(train[predictors], train["Target"])

# Test the model on each yearly split
results = []
for year, test_data in test_splits.items():
    # Make predictions
    preds = rf.predict(test_data[predictors])
    
    # Calculate metrics
    accuracy = accuracy_score(test_data["Target"], preds)
    precision = precision_score(test_data["Target"], preds, average="weighted")
    
    # Store results
    results.append({"Year": year, "Accuracy": accuracy, "Precision": precision})

    print(f"Year {year}: Accuracy = {accuracy:.4f}, Precision = {precision:.4f}")

# Convert results to DataFrame for better visualization
results_df = pd.DataFrame(results)
print("\nOverall Results:")
print(results_df)

Available years for testing data: [2019, 2020, 2021, 2022, 2023]
Year 2019: Accuracy = 0.6055, Precision = 0.5884
Year 2020: Accuracy = 0.6027, Precision = 0.5917
Year 2021: Accuracy = 0.5760, Precision = 0.5594
Year 2022: Accuracy = 0.5956, Precision = 0.5830
Year 2023: Accuracy = 0.5602, Precision = 0.5462

Overall Results:
   Year  Accuracy  Precision
0  2019  0.605528   0.588351
1  2020  0.602679   0.591718
2  2021  0.575980   0.559401
3  2022  0.595568   0.582976
4  2023  0.560185   0.546172


In [20]:
grouped_matches = matches.groupby("Team")

In [21]:
grouped_test_matches = test_matches.groupby("Team")

In [22]:
def rolling_averages(group, cols, new_cols):
    # Sort the group by the "Date" column to ensure chronological order
    group = group.sort_values("Date")
    
    # Calculate rolling averages over a window of 3 rows, excluding the current row
    # (e.g., for row N, it computes the average of rows N-1, N-2, and N-3)
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    
    # Assign the calculated rolling averages to new columns in the group
    group[new_cols] = rolling_stats
    
    # Drop rows where the new rolling average columns contain NaN values
    # (occurs when there aren't enough previous rows to calculate the average)
    group = group.dropna(subset=new_cols)
    
    return group

In [23]:
cols = [ "GF","GA","Sh", "SoT", "PK", "PKatt"]
new_cols = [f"{c}_rolling" for c in cols]

In [24]:
new_cols

['GF_rolling',
 'GA_rolling',
 'Sh_rolling',
 'SoT_rolling',
 'PK_rolling',
 'PKatt_rolling']

In [25]:
results = []
for team, group in matches.groupby("Team"):
    # Apply rolling averages to each group
    result = rolling_averages(group, cols, new_cols)
    results.append(result)

# Concatenate all results into one DataFrame
matches_rolling = pd.concat(results)

In [26]:
test_results = []
for team, group in test_matches.groupby("Team"):
    # Apply rolling averages to each group
    result = rolling_averages(group, cols, new_cols)
    results.append(result)

# Concatenate all results into one DataFrame
test_matches_rolling = pd.concat(results)

In [27]:
matches_rolling.index = range(matches_rolling.shape[0])

In [28]:
test_matches_rolling.index = range(test_matches_rolling.shape[0])

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, accuracy_score
import pandas as pd

def calculate_rolling_features(df, columns, window_size):
    for col in columns:
        df[f"{col}_rolling"] = df[col].rolling(window=window_size, min_periods=1).mean()
    return df

def make_yearly_predictions_with_rolling(train, test, predictors, rolling_predictors):
    train['Date'] = pd.to_datetime(train['Date'], errors='coerce')
    test['Date'] = pd.to_datetime(test['Date'], errors='coerce')
    train = train.dropna(subset=['Date']).sort_values(by='Date')
    test = test.dropna(subset=['Date']).sort_values(by='Date')

    all_predictors = predictors + rolling_predictors

    yearly_results = []
    for year in range(test['Date'].dt.year.min(), test['Date'].dt.year.max() + 1):
        test_year = test[test['Date'].dt.year == year]
        if not test_year.empty:
            print(f"\nTesting on year: {year} with {len(test_year)} matches.")
            rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)
            rf.fit(train[all_predictors], train["Target"])
            preds = rf.predict(test_year[all_predictors])
            precision = precision_score(test_year["Target"], preds, average="weighted")
            accuracy = accuracy_score(test_year["Target"], preds)
            combined = pd.DataFrame({
                "Year": year,
                "Actual": test_year["Target"],
                "Prediction": preds,
                "Precision": [precision] * len(test_year),
                "Accuracy": [accuracy] * len(test_year)
            })
            yearly_results.append(combined)
            print(f"Year {year}: Precision = {precision:.4f}, Accuracy = {accuracy:.4f}")

    results = pd.concat(yearly_results, ignore_index=True)
    return results


train = matches
test = test_matches

# Define predictors
static_predictors = ["Venue_code", "Opp_code", "Hour", "Day_code"]
rolling_predictors = ["GF_rolling", "GA_rolling", "Sh_rolling", "SoT_rolling", "PK_rolling", "PKatt_rolling"]

# Generate rolling features
feature_columns = ["GF", "GA", "Sh", "SoT", "PK", "PKatt"]
train = calculate_rolling_features(train, feature_columns, window_size=5)
test = calculate_rolling_features(test, feature_columns, window_size=5)

# Run the prediction function
results = make_yearly_predictions_with_rolling(train, test, static_predictors, rolling_predictors)

# Display results
print("\nYearly Predictions and Metrics:")
print(results.groupby("Year")[["Precision", "Accuracy"]].mean())




Testing on year: 2019 with 398 matches.
Year 2019: Precision = 0.6827, Accuracy = 0.6910

Testing on year: 2020 with 672 matches.
Year 2020: Precision = 0.6863, Accuracy = 0.6949

Testing on year: 2021 with 816 matches.
Year 2021: Precision = 0.7106, Accuracy = 0.7145

Testing on year: 2022 with 722 matches.
Year 2022: Precision = 0.7025, Accuracy = 0.7078

Testing on year: 2023 with 432 matches.
Year 2023: Precision = 0.7045, Accuracy = 0.7106

Yearly Predictions and Metrics:
      Precision  Accuracy
Year                     
2019   0.682671  0.690955
2020   0.686260  0.694940
2021   0.710623  0.714461
2022   0.702541  0.707756
2023   0.704528  0.710648


In [37]:
new_matches = pd.read_csv("premierleague_rank_team_data.csv")

In [38]:
new_test_matches = pd.read_csv("premierleague_rank_test_team_data.csv")

In [39]:
new_matches["Date"] = pd.to_datetime(new_matches["Date"])

In [40]:
new_test_matches["Date"] = pd.to_datetime(new_test_matches["Date"])

In [41]:
new_matches["Venue_code"] = new_matches["Venue"].astype("category").cat.codes

In [42]:
new_test_matches["Venue_code"] = new_test_matches["Venue"].astype("category").cat.codes

In [43]:
new_matches["Opp_code"] = new_matches["Opponent"].astype("category").cat.codes 

In [44]:
new_test_matches["Opp_code"] = new_test_matches["Opponent"].astype("category").cat.codes 

In [45]:
new_matches["Hour"] = new_matches["Time"].str.replace(":.+", "", regex=True).fillna("0").astype("int")

In [46]:
new_test_matches["Hour"] = new_test_matches["Time"].str.replace(":.+", "", regex=True).fillna("0").astype("int")

In [47]:
new_matches["Day_code"] = new_matches["Date"].dt.dayofweek

In [48]:
new_test_matches["Day_code"] = new_test_matches["Date"].dt.dayofweek

In [49]:
new_matches["Target"]=(new_matches["Result"] == "W").astype("int")

In [50]:
new_test_matches["Target"]=(new_test_matches["Result"] == "W").astype("int")

In [51]:
# Load training and testing data
train = new_matches
test = new_test_matches

# Sort data by date
train = train.sort_values(by='Date')
test = test.sort_values(by='Date')

# Model training
predictors = ["Venue_code", "Opp_code", "Hour", "Day_code","Rank","IsRanked"]
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)
rf.fit(train[predictors], train["Target"])

# Make predictions
preds = rf.predict(test[predictors])
    
# Test the model on each yearly split
results = []

 # Calculate metrics
accuracy = accuracy_score(test["Target"], preds)
precision = precision_score(test["Target"], preds, average="weighted")
print(f"Metrics: Accuracy = {accuracy:.4f}, Precision = {precision:.4f}")


Metrics: Accuracy = 0.6237, Precision = 0.6071


In [52]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score

# Load training and testing data
train = new_matches
test = new_test_matches


# Sort data by date
train = train.sort_values(by='Date')
test = test.sort_values(by='Date')

# Create yearly splits for testing
test_splits = {}
for year in range(test['Date'].dt.year.min(), test['Date'].dt.year.max() + 1):
    yearly_data = test[test['Date'].dt.year == year]
    if not yearly_data.empty:
        test_splits[year] = yearly_data

# Output available years
print("Available years for testing data:", list(test_splits.keys()))

# Model training
predictors = ["Venue_code", "Opp_code", "Hour", "Day_code","Rank","IsRanked"]
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

rf.fit(train[predictors], train["Target"])

# Test the model on each yearly split
results = []
for year, test_data in test_splits.items():
    # Make predictions
    preds = rf.predict(test_data[predictors])
    
    # Calculate metrics
    accuracy = accuracy_score(test_data["Target"], preds)
    precision = precision_score(test_data["Target"], preds, average="weighted")
    
    # Store results
    results.append({"Year": year, "Accuracy": accuracy, "Precision": precision})

    print(f"Year {year}: Accuracy = {accuracy:.4f}, Precision = {precision:.4f}")

# Convert results to DataFrame for better visualization
results_df = pd.DataFrame(results)
print("\nOverall Results:")
print(results_df)

Available years for testing data: [2019, 2020, 2021, 2022, 2023]
Year 2019: Accuracy = 0.6206, Precision = 0.5967
Year 2020: Accuracy = 0.6369, Precision = 0.6247
Year 2021: Accuracy = 0.6213, Precision = 0.6053
Year 2022: Accuracy = 0.6205, Precision = 0.6055
Year 2023: Accuracy = 0.6157, Precision = 0.5956

Overall Results:
   Year  Accuracy  Precision
0  2019  0.620603   0.596717
1  2020  0.636905   0.624673
2  2021  0.621324   0.605319
3  2022  0.620499   0.605483
4  2023  0.615741   0.595612


In [54]:
new_results = []
for team, group in new_matches.groupby("Team"):
    # Apply rolling averages to each group
    new_result = rolling_averages(group, cols, new_cols)
    new_results.append(new_result)

# Concatenate all results into one DataFrame
new_matches_rolling = pd.concat(new_results)

In [55]:
new_test_results = []
for team, group in new_test_matches.groupby("Team"):
    # Apply rolling averages to each group
    new_result = rolling_averages(group, cols, new_cols)
    new_results.append(new_result)

# Concatenate all results into one DataFrame
new_test_matches_rolling = pd.concat(new_results)

In [56]:
new_matches_rolling.index = range(new_matches_rolling.shape[0])

In [57]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, accuracy_score
import pandas as pd

def calculate_rolling_features(df, columns, window_size):
    for col in columns:
        df[f"{col}_rolling"] = df[col].rolling(window=window_size, min_periods=1).mean()
    return df

def make_yearly_predictions_with_rolling(train, test, predictors, rolling_predictors):
    train['Date'] = pd.to_datetime(train['Date'], errors='coerce')
    test['Date'] = pd.to_datetime(test['Date'], errors='coerce')
    train = train.dropna(subset=['Date']).sort_values(by='Date')
    test = test.dropna(subset=['Date']).sort_values(by='Date')

    all_predictors = predictors + rolling_predictors

    yearly_results = []
    for year in range(test['Date'].dt.year.min(), test['Date'].dt.year.max() + 1):
        test_year = test[test['Date'].dt.year == year]
        if not test_year.empty:
            print(f"\nTesting on year: {year} with {len(test_year)} matches.")
            rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)
            rf.fit(train[all_predictors], train["Target"])
            preds = rf.predict(test_year[all_predictors])
            precision = precision_score(test_year["Target"], preds, average="weighted")
            accuracy = accuracy_score(test_year["Target"], preds)
            combined = pd.DataFrame({
                "Year": year,
                "Actual": test_year["Target"],
                "Prediction": preds,
                "Precision": [precision] * len(test_year),
                "Accuracy": [accuracy] * len(test_year)
            })
            yearly_results.append(combined)
            print(f"Year {year}: Precision = {precision:.4f}, Accuracy = {accuracy:.4f}")

    results = pd.concat(yearly_results, ignore_index=True)
    return results


train = new_matches
test = new_test_matches

# Define predictors
static_predictors = ["Venue_code", "Opp_code", "Hour", "Day_code"]
rolling_predictors = ["GF_rolling", "GA_rolling", "Sh_rolling", "SoT_rolling", "PK_rolling", "PKatt_rolling","Rank", "IsRanked"]

# Generate rolling features
feature_columns = ["GF", "GA", "Sh", "SoT", "PK", "PKatt"]
train = calculate_rolling_features(train, feature_columns, window_size=5)
test = calculate_rolling_features(test, feature_columns, window_size=5)

# Run the prediction function
results = make_yearly_predictions_with_rolling(train, test, static_predictors, rolling_predictors)

# Display results
print("\nYearly Predictions and Metrics:")
print(results.groupby("Year")[["Precision", "Accuracy"]].mean())



Testing on year: 2019 with 398 matches.
Year 2019: Precision = 0.7152, Accuracy = 0.7211

Testing on year: 2020 with 672 matches.
Year 2020: Precision = 0.7170, Accuracy = 0.7232

Testing on year: 2021 with 816 matches.
Year 2021: Precision = 0.7021, Accuracy = 0.7083

Testing on year: 2022 with 722 matches.
Year 2022: Precision = 0.7172, Accuracy = 0.7216

Testing on year: 2023 with 432 matches.
Year 2023: Precision = 0.6558, Accuracy = 0.6667

Yearly Predictions and Metrics:
      Precision  Accuracy
Year                     
2019   0.715211  0.721106
2020   0.716971  0.723214
2021   0.702081  0.708333
2022   0.717218  0.721607
2023   0.655808  0.666667
