In [190]:
# Import all the necessary dependencies
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score, precision_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [191]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [192]:
# Execute the data-formatting notebook (path is relative to the working directory).
# Presumably this is where process_data, called further down, is defined — verify.
%run ../Data/Data_Formatting.ipynb

In [193]:
# Execute the Ultimate_Hyperparameters notebook.
# Presumably supplies tuned hyperparameter settings used by the model notebooks — TODO confirm.
%run ../Data/Ultimate_Hyperparameters.ipynb

In [194]:
# Execute the Parameters notebook.
# Presumably supplies shared parameter/predictor definitions — TODO confirm.
%run ../Data/Parameters.ipynb

In [195]:
# Execute the KNN notebook from Basic_Models.
# Presumably provides the K-Nearest Neighbors model code used by the selection helpers — verify.
%run ../Basic_Models/KNN.ipynb

In [196]:
# Execute the LDA notebook from Basic_Models.
# Presumably provides the Linear Discriminant Analysis model code — verify.
%run ../Basic_Models/LDA.ipynb

In [197]:
# Execute the QDA notebook from Basic_Models.
# Presumably provides the Quadratic Discriminant Analysis model code — verify.
%run ../Basic_Models/QDA.ipynb

In [198]:
# Execute the Logistic_Regression notebook from Basic_Models.
# Presumably provides the logistic-regression model code — verify.
%run ../Basic_Models/Logistic_Regression.ipynb

In [199]:
# Execute the Bagging notebook from Tree_Models.
# Presumably provides the bagging model code — verify.
%run ../Tree_Models/Bagging.ipynb

In [200]:
# Execute the Boosting notebook from Tree_Models.
# Presumably provides the boosting model code — verify.
%run ../Tree_Models/Boosting.ipynb

In [201]:
# Execute the Classification_Tree notebook from Tree_Models.
# Presumably provides the classification-tree model code — verify.
%run ../Tree_Models/Classification_Tree.ipynb

In [202]:
# Execute the Random_Forest notebook from Tree_Models.
# Presumably provides the random-forest model code — verify.
%run ../Tree_Models/Random_Forest.ipynb

In [203]:
# Execute the tree-model selection notebook.
# Presumably defines the best_tree_model_* helpers called later in this notebook — verify.
# NOTE(review): the pip note printed under this cell suggests the notebook runs a pip install — confirm.
%run ../Tree_Models/Model_Selection.ipynb

Note: you may need to restart the kernel to use updated packages.


In [204]:
# Execute the basic-model selection notebook.
# Presumably defines the best_basic_model_* helpers called later in this notebook — verify.
# NOTE(review): another Model_Selection.ipynb (under Tree_Models) is also run by this notebook;
# if the two share any names, whichever runs last wins — confirm there is no clash.
%run ../Basic_Models/Model_Selection.ipynb

Note: you may need to restart the kernel to use updated packages.


In [205]:
# Locations of the training and testing CSVs
train_path = Path("../Data/premierleague_team_data.csv")
test_path = Path("../Data/premierleague_test_team_data.csv")

# Read the training dataset and the testing dataset into DataFrames
matches = pd.read_csv(filepath_or_buffer=train_path)
test_matches = pd.read_csv(filepath_or_buffer=test_path)

In [206]:
# Locations of the training and testing CSVs that include rank.
# Note: this rebinds train_path and test_path, which an earlier cell in this
# notebook already set to the CSVs without rank.
train_path = Path("../Data/premierleague_rank_team_data.csv")
test_path = Path("../Data/premierleague_rank_test_team_data.csv")

# Read the rank-augmented training and testing datasets into DataFrames
new_matches = pd.read_csv(filepath_or_buffer=train_path)
new_test_matches = pd.read_csv(filepath_or_buffer=test_path)

In [207]:
 # Pass the baseline training and testing frames to process_data (defined outside this notebook).
 # NOTE(review): the return value is discarded, so this presumably modifies the frames in place — confirm.
 process_data(matches, test_matches)

In [208]:
 # Pass the rank-augmented training and testing frames to process_data (defined outside this notebook).
 # NOTE(review): the return value is discarded, so this presumably modifies the frames in place — confirm.
 process_data(new_matches,new_test_matches)

# Importance of Training Accuracy 

Training accuracy measures how well a machine learning model fits the training data. It is important to check training accuracy for the following reasons:

1. **Detecting Underfitting**  
   - If the training accuracy is **too low**, it means the model is **not learning enough** patterns from the data.  
   - This could be due to an **overly simple model**, insufficient features, or poor hyperparameters.

2. **Ensuring Model Competency**  
   - A model with **reasonable training accuracy** ensures that it has successfully learned meaningful patterns from the dataset.  
   - If the model cannot achieve high accuracy on the training data, it is unlikely to perform well on new data.

3. **Providing a Baseline for Comparison**  
   - Training accuracy helps us **compare** with testing accuracy to detect **overfitting**.  
   - If training accuracy is significantly higher than testing accuracy, the model might be **memorizing** rather than **generalizing**.

💡 **Key Insight**: While high training accuracy is desirable, it should not be the sole indicator of a good model. We must also check testing accuracy to ensure real-world performance.

# Models with Baseline Predictors (Evaluated on Training Data)

In [157]:
# Training-set evaluation: the same frame is supplied for both arguments
# (presumably training data first, evaluation data second — verify against the helper).
best_basic_model_baseline(matches, matches)

Best C: 1.274275 with Accuracy: 0.6187
Best Model Per Year (by Accuracy):
                 Model  Year  Precision  Accuracy
0  K-Nearest Neighbors  2013   0.681445  0.689474
1  K-Nearest Neighbors  2014   0.626296  0.642105
2  K-Nearest Neighbors  2015   0.637908  0.655263
3  K-Nearest Neighbors  2016   0.664583  0.674603
4  K-Nearest Neighbors  2017   0.676626  0.678304
5  K-Nearest Neighbors  2018   0.643849  0.657895

Best Model Per Year (by Precision):
                           Model  Year  Precision  Accuracy
6   Linear Discriminant Analysis  2013   0.764024  0.618421
7   Linear Discriminant Analysis  2014   0.762509  0.611842
8   Linear Discriminant Analysis  2015   0.765625  0.625000
9   Linear Discriminant Analysis  2016   0.765791  0.625661
10  Linear Discriminant Analysis  2017   0.764931  0.622195
11  Linear Discriminant Analysis  2018   0.765808  0.625731

Overall Best Model (by Accuracy):
Model: K-Nearest Neighbors, Avg Precision: 0.6551, Avg Accuracy: 0.6663

Overall Bes

In [158]:
# Training-set evaluation of the tree-based models: the same frame is supplied for both arguments
# (presumably training data first, evaluation data second — verify against the helper).
best_tree_model_baseline(matches, matches)

Best ccp_alpha: 0.000865 with Accuracy: 0.6453
Best ccp_alpha: 0.000865 with Accuracy: 0.6453
Best ccp_alpha: 0.000865 with Accuracy: 0.6453
Best ccp_alpha: 0.000865 with Accuracy: 0.6453
Best Model Per Year (by Accuracy):
            Model  Year  Precision  Accuracy
18  Random Forest  2013   0.748812  0.752632
19  Random Forest  2014   0.701010  0.706579
20  Random Forest  2015   0.695327  0.701316
21  Random Forest  2016   0.722408  0.723545
22  Random Forest  2017   0.729307  0.726933
23  Random Forest  2018   0.693971  0.695906

Best Model Per Year (by Precision):
            Model  Year  Precision  Accuracy
18  Random Forest  2013   0.748812  0.752632
19  Random Forest  2014   0.701010  0.706579
20  Random Forest  2015   0.695327  0.701316
21  Random Forest  2016   0.722408  0.723545
22  Random Forest  2017   0.729307  0.726933
23  Random Forest  2018   0.693971  0.695906

Overall Best Model (by Accuracy):
Model: Random Forest, Avg Precision: 0.7151, Avg Accuracy: 0.7178

Overall 

# Models with Baseline Predictors + Rolling Predictors (Evaluated on Training Data)

In [209]:
# Training-set evaluation with the rolling variant: the same frame is supplied for both arguments
# (presumably training data first, evaluation data second — verify against the helper).
best_basic_model_rolling(matches, matches)

Best C: 0.033598 with Accuracy: 0.6471
Best Model Per Year (by Accuracy):
                 Model  Year  Precision  Accuracy
0  K-Nearest Neighbors  2013   0.769179  0.742105
1  K-Nearest Neighbors  2014   0.779157  0.753947
2  K-Nearest Neighbors  2015   0.767172  0.743421
3  K-Nearest Neighbors  2016   0.760372  0.747354
4  K-Nearest Neighbors  2017   0.751179  0.729426
5  K-Nearest Neighbors  2018   0.756499  0.733918

Best Model Per Year (by Precision):
                 Model  Year  Precision  Accuracy
0  K-Nearest Neighbors  2013   0.769179  0.742105
1  K-Nearest Neighbors  2014   0.779157  0.753947
2  K-Nearest Neighbors  2015   0.767172  0.743421
3  K-Nearest Neighbors  2016   0.760372  0.747354
4  K-Nearest Neighbors  2017   0.751179  0.729426
5  K-Nearest Neighbors  2018   0.756499  0.733918

Overall Best Model (by Accuracy):
Model: K-Nearest Neighbors, Avg Precision: 0.7639, Avg Accuracy: 0.7417

Overall Best Model (by Precision):
Model: K-Nearest Neighbors, Avg Precision: 0.7

In [162]:
# Training-set evaluation of the tree-based models with the rolling variant.
# NOTE(review): "rollling" (three l's) is the helper's actual name as called here; it is
# defined outside this notebook, so any spelling fix must be made there and here together.
best_tree_model_rollling(matches, matches)

Best ccp_alpha: 0.001632 with Accuracy: 0.6397
Best ccp_alpha: 0.001632 with Accuracy: 0.6397
Best ccp_alpha: 0.001632 with Accuracy: 0.6397
Best Model Per Year (by Accuracy):
     Model  Year  Precision  Accuracy  Samples
0  Bagging  2013   0.668365  0.663158      NaN
1  Bagging  2014   0.674302  0.672368      NaN
2  Bagging  2015   0.669445  0.678947      NaN
3  Bagging  2016   0.666513  0.677249      NaN
4  Bagging  2017   0.717578  0.713217      NaN
5  Bagging  2018   0.698057  0.690058      NaN

Best Model Per Year (by Precision):
                  Model  Year  Precision  Accuracy  Samples
17        Random Forest  2013   0.676058  0.634211      NaN
12  Classification Tree  2014   0.687369  0.667105      NaN
2               Bagging  2015   0.669445  0.678947      NaN
3               Bagging  2016   0.666513  0.677249      NaN
4               Bagging  2017   0.717578  0.713217      NaN
5               Bagging  2018   0.698057  0.690058      NaN

Overall Best Model (by Accuracy):
Mod

# Models with Full Set Predictors (Evaluated on Training Data)

In [166]:
 # Training-set evaluation on the rank-augmented frame: the same frame is supplied for both arguments
 # (presumably training data first, evaluation data second — verify against the helper).
 best_basic_model_full(new_matches, new_matches)

Best C: 2.335721 with Accuracy: 0.6711
Best Model Per Year (by Accuracy):
                              Model  Year  Precision  Accuracy
0               K-Nearest Neighbors  2013   0.688614  0.694737
1               K-Nearest Neighbors  2014   0.693901  0.694737
2               K-Nearest Neighbors  2015   0.676749  0.681579
21  Quadratic Discriminant Analysis  2016   0.696251  0.702381
4               K-Nearest Neighbors  2017   0.710862  0.706983
11     Linear Discriminant Analysis  2018   0.673066  0.684211

Best Model Per Year (by Precision):
                              Model  Year  Precision  Accuracy
0               K-Nearest Neighbors  2013   0.688614  0.694737
1               K-Nearest Neighbors  2014   0.693901  0.694737
2               K-Nearest Neighbors  2015   0.676749  0.681579
21  Quadratic Discriminant Analysis  2016   0.696251  0.702381
4               K-Nearest Neighbors  2017   0.710862  0.706983
11     Linear Discriminant Analysis  2018   0.673066  0.684211

Overal

In [164]:
# Training-set evaluation of the tree-based models on the rank-augmented frame:
# the same frame is supplied for both arguments
# (presumably training data first, evaluation data second — verify against the helper).
best_tree_model_full(new_matches, new_matches)

Best ccp_alpha: 0.001282 with Accuracy: 0.6576
Best ccp_alpha: 0.001282 with Accuracy: 0.6576
Best ccp_alpha: 0.001282 with Accuracy: 0.6576
Best Model Per Year (by Accuracy):
     Model  Year  Precision  Accuracy  Samples
0  Bagging  2013   0.666423  0.676316      NaN
1  Bagging  2014   0.727299  0.730263      NaN
2  Bagging  2015   0.710227  0.717105      NaN
3  Bagging  2016   0.740196  0.744709      NaN
4  Bagging  2017   0.742344  0.744389      NaN
5  Bagging  2018   0.720639  0.725146      NaN

Best Model Per Year (by Precision):
     Model  Year  Precision  Accuracy  Samples
0  Bagging  2013   0.666423  0.676316      NaN
1  Bagging  2014   0.727299  0.730263      NaN
2  Bagging  2015   0.710227  0.717105      NaN
3  Bagging  2016   0.740196  0.744709      NaN
4  Bagging  2017   0.742344  0.744389      NaN
5  Bagging  2018   0.720639  0.725146      NaN

Overall Best Model (by Accuracy):
Model: Bagging, Avg Precision: 0.7179, Avg Accuracy: 0.7230

Overall Best Model (by Precision)

# Importance of Checking Testing Accuracy

Testing accuracy measures how well a machine learning model performs on **unseen** data. It is crucial to check testing accuracy for the following reasons:

1. **Evaluating Generalization**  
   - The primary goal of machine learning is to create models that **generalize well** to new data.  
   - A high testing accuracy indicates that the model can make reliable predictions on unseen samples.

2. **Detecting Overfitting**  
   - If the training accuracy is high but the testing accuracy is low, it suggests **overfitting**.  
   - Overfitting occurs when the model learns **specific details** of the training data rather than general patterns, making it unreliable for new data.

3. **Validating Model Performance**  
   - A model is only useful if it performs well on real-world data.  
   - Testing accuracy gives us a **realistic expectation** of how the model will behave when deployed.

4. **Comparing Different Models**  
   - By evaluating testing accuracy across different models, we can select the best model for **real-world applications**.  
   - The model with the highest **testing accuracy and precision** is often the best choice.

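💡 **Key Insight**: A good model should have **both high training and testing accuracy**, with only a small gap between the two. Low accuracy on both suggests the model is too simple (underfitting), while high training accuracy paired with much lower testing accuracy suggests it is too complex (overfitting).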


# Models with Baseline Predictors (Evaluated on Testing Data)

In [210]:
# Test-set evaluation: the training frame and the separate testing frame are supplied
# (presumably training data first, evaluation data second — verify against the helper).
best_basic_model_baseline(matches, test_matches)

Best C: 1.274275 with Accuracy: 0.6187
Best Model Per Year (by Accuracy):
                              Model  Year  Precision  Accuracy
5      Linear Discriminant Analysis  2019   0.764545  0.620603
16  Quadratic Discriminant Analysis  2020   0.641389  0.626488
2               K-Nearest Neighbors  2021   0.596386  0.621324
3               K-Nearest Neighbors  2022   0.608177  0.626039
4               K-Nearest Neighbors  2023   0.617145  0.636574

Best Model Per Year (by Precision):
                              Model  Year  Precision  Accuracy
5      Linear Discriminant Analysis  2019   0.764545  0.620603
6      Linear Discriminant Analysis  2020   0.765255  0.623512
7      Linear Discriminant Analysis  2021   0.762989  0.613971
18  Quadratic Discriminant Analysis  2022   0.762098  0.608033
9      Linear Discriminant Analysis  2023   0.763396  0.615741

Overall Best Model (by Accuracy):
Model: Quadratic Discriminant Analysis, Avg Precision: 0.6049, Avg Accuracy: 0.6160

Overall Best 

In [211]:
# Test-set evaluation of the tree-based models: the training frame and the separate testing frame
# are supplied (presumably training data first, evaluation data second — verify against the helper).
best_tree_model_baseline(matches, test_matches)

Best ccp_alpha: 0.000865 with Accuracy: 0.6453
Best ccp_alpha: 0.000865 with Accuracy: 0.6453
Best ccp_alpha: 0.000865 with Accuracy: 0.6453
Best ccp_alpha: 0.000865 with Accuracy: 0.6453
Best Model Per Year (by Accuracy):
      Model  Year  Precision  Accuracy
5  Boosting  2019   0.537949  0.605528
6  Boosting  2020   0.596896  0.627976
7  Boosting  2021   0.670648  0.637255
8  Boosting  2022   0.649986  0.624654
9  Boosting  2023   0.624398  0.627315

Best Model Per Year (by Precision):
      Model  Year  Precision  Accuracy
0   Bagging  2019   0.579199  0.600503
6  Boosting  2020   0.596896  0.627976
7  Boosting  2021   0.670648  0.637255
8  Boosting  2022   0.649986  0.624654
9  Boosting  2023   0.624398  0.627315

Overall Best Model (by Accuracy):
Model: Boosting, Avg Precision: 0.6160, Avg Accuracy: 0.6245

Overall Best Model (by Precision):
Model: Boosting, Avg Precision: 0.6160, Avg Accuracy: 0.6245


# Models with Baseline Predictors + Rolling Predictors (Evaluated on Testing Data)

In [213]:
# Test-set evaluation with the rolling variant: the training frame and the separate testing frame
# are supplied (presumably training data first, evaluation data second — verify against the helper).
best_basic_model_rolling(matches, test_matches)

Best C: 0.033598 with Accuracy: 0.6471
Best Model Per Year (by Accuracy):
                              Model  Year  Precision  Accuracy
5      Linear Discriminant Analysis  2019   0.635905  0.650754
1               K-Nearest Neighbors  2020   0.602017  0.630952
17  Quadratic Discriminant Analysis  2021   0.611925  0.632353
13              Logistic Regression  2022   0.633038  0.624654
9      Linear Discriminant Analysis  2023   0.664998  0.664352

Best Model Per Year (by Precision):
                              Model  Year  Precision  Accuracy
5      Linear Discriminant Analysis  2019   0.635905  0.650754
1               K-Nearest Neighbors  2020   0.602017  0.630952
17  Quadratic Discriminant Analysis  2021   0.611925  0.632353
13              Logistic Regression  2022   0.633038  0.624654
9      Linear Discriminant Analysis  2023   0.664998  0.664352

Overall Best Model (by Accuracy):
Model: Linear Discriminant Analysis, Avg Precision: 0.6185, Avg Accuracy: 0.6372

Overall Best Mod

In [214]:
# Test-set evaluation of the tree-based models with the rolling variant.
# NOTE(review): "rollling" (three l's) is the helper's actual name as called here; it is
# defined outside this notebook, so any spelling fix must be made there and here together.
best_tree_model_rollling(matches, test_matches)

Best ccp_alpha: 0.001632 with Accuracy: 0.6397
Best ccp_alpha: 0.001632 with Accuracy: 0.6397
Best ccp_alpha: 0.001632 with Accuracy: 0.6397
Best Model Per Year (by Accuracy):
            Model  Year  Precision  Accuracy  Samples
15  Random Forest  2019   0.638640  0.650754      NaN
16  Random Forest  2020   0.592831  0.626488      NaN
2         Bagging  2021   0.625986  0.642157      NaN
3         Bagging  2022   0.630162  0.642659      NaN
4         Bagging  2023   0.650182  0.662037      NaN

Best Model Per Year (by Precision):
            Model  Year  Precision  Accuracy  Samples
15  Random Forest  2019   0.638640  0.650754      NaN
6        Boosting  2020   0.609598  0.619048    672.0
2         Bagging  2021   0.625986  0.642157      NaN
3         Bagging  2022   0.630162  0.642659      NaN
19  Random Forest  2023   0.653098  0.652778      NaN

Overall Best Model (by Accuracy):
Model: Bagging, Avg Precision: 0.6241, Avg Accuracy: 0.6416

Overall Best Model (by Precision):
Model: B

# Models with Full Set Predictors (Evaluated on Testing Data)

In [216]:
 # Test-set evaluation on the rank-augmented data: the training frame and the separate testing frame
 # are supplied (presumably training data first, evaluation data second — verify against the helper).
 best_basic_model_full(new_matches, new_test_matches)

Best C: 2.335721 with Accuracy: 0.6711
Best Model Per Year (by Accuracy):
                              Model  Year  Precision  Accuracy
5      Linear Discriminant Analysis  2019   0.664671  0.675879
16  Quadratic Discriminant Analysis  2020   0.645934  0.659226
17  Quadratic Discriminant Analysis  2021   0.651375  0.662990
18  Quadratic Discriminant Analysis  2022   0.657750  0.666205
4               K-Nearest Neighbors  2023   0.639220  0.652778

Best Model Per Year (by Precision):
                              Model  Year  Precision  Accuracy
5      Linear Discriminant Analysis  2019   0.664671  0.675879
16  Quadratic Discriminant Analysis  2020   0.645934  0.659226
17  Quadratic Discriminant Analysis  2021   0.651375  0.662990
13              Logistic Regression  2022   0.660314  0.657895
4               K-Nearest Neighbors  2023   0.639220  0.652778

Overall Best Model (by Accuracy):
Model: Quadratic Discriminant Analysis, Avg Precision: 0.6437, Avg Accuracy: 0.6558

Overall Best 

In [215]:
# Test-set evaluation of the tree-based models on the rank-augmented data: the training frame and
# the separate testing frame are supplied
# (presumably training data first, evaluation data second — verify against the helper).
best_tree_model_full(new_matches, new_test_matches)

Best ccp_alpha: 0.001282 with Accuracy: 0.6576
Best ccp_alpha: 0.001282 with Accuracy: 0.6576
Best ccp_alpha: 0.001282 with Accuracy: 0.6576
Best Model Per Year (by Accuracy):
                  Model  Year  Precision  Accuracy  Samples
15        Random Forest  2019   0.673836  0.683417      NaN
16        Random Forest  2020   0.649283  0.663690      NaN
17        Random Forest  2021   0.632249  0.647059      NaN
3               Bagging  2022   0.671236  0.678670      NaN
14  Classification Tree  2023   0.630183  0.641204      NaN

Best Model Per Year (by Precision):
                  Model  Year  Precision  Accuracy  Samples
15        Random Forest  2019   0.673836  0.683417      NaN
16        Random Forest  2020   0.649283  0.663690      NaN
2               Bagging  2021   0.632512  0.645833      NaN
3               Bagging  2022   0.671236  0.678670      NaN
14  Classification Tree  2023   0.630183  0.641204      NaN

Overall Best Model (by Accuracy):
Model: Random Forest, Avg Precis