
# Premier League Match Prediction Project  
### **Task A — Regression (Predicting Final Scores)**  
### **Task B — Classification (Predicting Match Outcome)**

This notebook follows the updated workflow:

**STEP 1 — Load & Merge Dataset**  
**STEP 2 — Feature Engineering**  
**STEP 3 — Split Dataset into X and y**  
- Regression: `X_reg`, `y_reg_home`, `y_reg_away`  
- Classification: `X_clf`, `y_clf`  

**Task A — Regression (Predicting Final Scores)**  
**Task B — Classification (Predicting Match Outcome)**  

**STEP 4 — Split into Train/Test**  
**STEP 5 — Preprocessing Pipelines**  
**STEP 6 — Train Each Pipeline SEPARATELY**  
**STEP 7 — Evaluate Models**  
**STEP 9 — Compare Model Performance**  
**STEP 10 — Write Analysis + Conclusion**


In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Classification Models
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Regression Models
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix,
    mean_squared_error, r2_score
)

# Let pandas display up to 100 columns so wide frames are not truncated in cell output.
pd.set_option("display.max_columns", 100)



# **STEP 1 — Load & Merge Dataset**
Update dataset paths as needed.


In [None]:

# TODO: Update dataset paths
path_2019, path_2020, path_2021 = (
    "data/epl_2019_2020.csv",
    "data/epl_2020_2021.csv",
    "data/epl_2021_2022.csv",
)

# Read one frame per season file, in chronological order.
df_19, df_20, df_21 = (
    pd.read_csv(csv_path) for csv_path in (path_2019, path_2020, path_2021)
)

# Stack the seasons into a single frame with a fresh 0..n-1 index.
season_frames = [df_19, df_20, df_21]
df = pd.concat(season_frames, ignore_index=True)
df.head()



# **STEP 2 — Feature Engineering**
Define the classification and regression targets, and map the textual match result (`H`, `A`, `D`) to numeric classes (`0`, `1`, `2`).


In [None]:

# Column names as used in the raw match frames.
home_team_col = "HomeTeam"
away_team_col = "AwayTeam"
home_goals_col = "FTHG"
away_goals_col = "FTAG"
result_col = "FTR"  # 'H','A','D'

# Classification target: integer-encode the textual result.
# Any value outside the mapping becomes NaN under Series.map.
result_mapping = {"H": 0, "A": 1, "D": 2}
df["outcome"] = df[result_col].map(result_mapping)

# Regression targets: goals for each side, copied from the raw goal columns.
df["home_score"] = df[home_goals_col]
df["away_score"] = df[away_goals_col]

categorical_features = [home_team_col, away_team_col]

# Columns that must never be used as predictors. The raw goal columns are
# exact copies of the regression targets (and fully determine the result), so
# leaving them in the feature set would leak the answer into X.
# NOTE(review): if the CSVs carry other in-match statistics (half-time score,
# shots, cards, ...) those are also only known after kick-off -- confirm the
# schema and extend this set if so.
non_feature_columns = set(categorical_features) | {
    result_col,
    home_goals_col,
    away_goals_col,
    "outcome",
    "home_score",
    "away_score",
}

numeric_features = [
    col for col in df.columns
    if col not in non_feature_columns
    and pd.api.types.is_numeric_dtype(df[col])
]

features = categorical_features + numeric_features

print("Features used:", features)



# **STEP 3 — Split Dataset into X and y**  
## Task A — Regression  
## Task B — Classification  


In [None]:

# Both tasks use the same predictor columns; only the targets differ.
X_reg, X_clf = df[features], df[features]

# Task A targets: final goals for the home and away side.
y_reg_home, y_reg_away = df["home_score"], df["away_score"]

# Task B target: encoded match outcome.
y_clf = df["outcome"]

(X_reg.shape, X_clf.shape)



# **STEP 4 — Split into Train/Test**
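Hold out 20% of the matches as a test set (80/20 split, `random_state=42`). The classification split is stratified on the outcome so both sets keep the same class proportions.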


In [None]:

# Hold-out settings shared by both tasks: 20% test size, fixed seed.
split_options = {"test_size": 0.2, "random_state": 42}

# Regression: split the features and both score targets together so rows stay aligned.
(
    X_reg_train, X_reg_test,
    y_home_train, y_home_test,
    y_away_train, y_away_test,
) = train_test_split(X_reg, y_reg_home, y_reg_away, **split_options)

# Classification: additionally stratify on the outcome label.
(
    X_clf_train, X_clf_test,
    y_clf_train, y_clf_test,
) = train_test_split(X_clf, y_clf, stratify=y_clf, **split_options)

(X_reg_train.shape, X_clf_train.shape)



# **STEP 5 — Preprocessing Pipelines**
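Build separate preprocessing pipelines: numeric features are mean-imputed and standardized, while categorical features (the team names) are imputed with the most frequent value and one-hot encoded.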


In [None]:

# Numeric branch: fill gaps with the column mean, then standardize.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group through its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)
preprocessor



# **Task A — Regression (Predicting Final Scores)**  
# **STEP 6A — Train Regression Pipelines (SEPARATELY)**
Models:
- Ridge  
- Lasso  
- RandomForestRegressor  
- GradientBoostingRegressor  


In [None]:

def run_regression(model, params, y_train, y_test):
    """Grid-search one regression pipeline and score it on the held-out set.

    The estimator is wrapped in a pipeline behind the shared ``preprocessor``,
    tuned with 5-fold cross-validation (scored by negative RMSE) on
    ``X_reg_train``, and then evaluated on ``X_reg_test``.

    Parameters
    ----------
    model : estimator
        Unfitted regressor placed in the pipeline's ``"model"`` step.
    params : dict
        Parameter grid for ``GridSearchCV``; keys use the ``model__`` prefix.
    y_train, y_test : array-like
        Targets corresponding to ``X_reg_train`` and ``X_reg_test``.

    Returns
    -------
    tuple
        ``(best_params, rmse, r2, y_pred)`` where ``rmse`` and ``r2`` are
        computed on the test set and ``y_pred`` are the test predictions.
    """
    pipe = Pipeline([("preprocess", preprocessor), ("model", model)])

    grid = GridSearchCV(pipe, params, cv=5, scoring="neg_root_mean_squared_error", n_jobs=-1)
    grid.fit(X_reg_train, y_train)

    y_pred = grid.predict(X_reg_test)
    # Take the square root explicitly: the ``squared=False`` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    return grid.best_params_, rmse, r2, y_pred

# Per-model results for the home-score and away-score regressions.
reg_results_home = {}
reg_results_away = {}


In [None]:

# Hyper-parameter grids; the "model__" prefix addresses the pipeline's "model" step.
ridge_params = {"model__alpha": [0.1, 1, 10]}
lasso_params = {"model__alpha": [0.001, 0.01, 0.1, 1]}
rf_params = {"model__n_estimators": [100, 200], "model__max_depth": [None, 10, 20]}
gb_params = {
    "model__n_estimators": [100, 200],
    "model__learning_rate": [0.05, 0.1],
    "model__max_depth": [2, 3],
}

# The tree ensembles are stochastic; seed them (same seed as the train/test
# split) so a Restart & Run All reproduces the reported scores.
models = {
    "Ridge": (Ridge(), ridge_params),
    "Lasso": (Lasso(max_iter=10000), lasso_params),
    "RandomForestRegressor": (RandomForestRegressor(random_state=42), rf_params),
    "GradientBoostingRegressor": (GradientBoostingRegressor(random_state=42), gb_params)
}

# Each model is tuned and evaluated twice: once per score target.
for name, (model, params) in models.items():
    best_params, rmse, r2, pred = run_regression(model, params, y_home_train, y_home_test)
    reg_results_home[name] = {"params": best_params, "rmse": rmse, "r2": r2}

    best_params, rmse, r2, pred = run_regression(model, params, y_away_train, y_away_test)
    reg_results_away[name] = {"params": best_params, "rmse": rmse, "r2": r2}

reg_results_home, reg_results_away



# **Task B — Classification (Predicting Match Outcome)**  
# **STEP 6B — Train Classification Models (SEPARATELY)**
Models:
- SVM  
- Random Forest  
- Logistic Regression  


In [None]:

def run_classification(model, params):
    """Grid-search one classification pipeline and score it on the held-out set.

    The classifier sits behind the shared ``preprocessor`` in a pipeline, is
    tuned by 5-fold cross-validated accuracy on ``X_clf_train`` /
    ``y_clf_train``, and is then evaluated on ``X_clf_test`` / ``y_clf_test``.

    Returns ``(best_params, accuracy, weighted_f1, y_pred)``.
    """
    search = GridSearchCV(
        estimator=Pipeline(steps=[("preprocess", preprocessor), ("model", model)]),
        param_grid=params,
        scoring="accuracy",
        cv=5,
        n_jobs=-1,
    )
    search.fit(X_clf_train, y_clf_train)

    predictions = search.predict(X_clf_test)
    test_accuracy = accuracy_score(y_clf_test, predictions)
    test_f1 = f1_score(y_clf_test, predictions, average="weighted")
    return search.best_params_, test_accuracy, test_f1, predictions

# Per-model result tuples, keyed by model name.
clf_results = {}


In [None]:

# Hyper-parameter grids; the "model__" prefix addresses the pipeline's "model" step.
svm_params = {"model__kernel": ["rbf", "linear"], "model__C": [0.1,1,10], "model__gamma": ["scale","auto"]}
rf_params = {"model__n_estimators": [100,200], "model__max_depth": [None,10,20]}
logreg_params = {"model__penalty": ["l1","l2"], "model__C": [0.1,1,10], "model__solver":["liblinear"]}

# Seed every estimator with a stochastic component (forest bootstrapping,
# SVC probability calibration, liblinear data shuffling) so results are
# reproducible across kernel restarts; the seed matches the train/test split.
class_models = {
    "SVM": (SVC(probability=True, random_state=42), svm_params),
    "RandomForestClassifier": (RandomForestClassifier(random_state=42), rf_params),
    "LogisticRegression": (LogisticRegression(max_iter=1000, random_state=42), logreg_params)
}

# Each entry stores the tuple returned by run_classification:
# (best_params, accuracy, weighted F1, test predictions).
for name, (model, params) in class_models.items():
    clf_results[name] = run_classification(model, params)

clf_results



# **STEP 7 — Evaluate Models**


In [None]:

# Home-score regression: one column per model, rows are params / rmse / r2.
pd.DataFrame(reg_results_home)


In [None]:

# Away-score regression: one column per model, rows are params / rmse / r2.
pd.DataFrame(reg_results_away)


In [None]:

# One column per classifier; positions 1 and 2 of each stored result tuple
# hold the test accuracy and the weighted F1 score.
pd.DataFrame(
    {
        model_name: {"Accuracy": outcome[1], "F1-score": outcome[2]}
        for model_name, outcome in clf_results.items()
    }
)



# **STEP 10 — Analysis + Conclusion**

Write your analysis here:
- Which regression model performed the best?  
- Which classification algorithm performed the best?  
- Was the dataset balanced?  
- Which features were most important?  
- What improvements could be made in the next iteration?  
