# Import Relevant Libraries

In [133]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import PoissonRegressor
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import itertools
from tqdm import tqdm
import time

# Initial Data Preparation

In [128]:
matches_engineered = pd.read_csv('matches_rolling.csv',index_col=[0])
matches_engineered_grouped = matches_engineered.groupby('team')

#Getting list of teams in the current season
matches_2023 = matches_engineered[matches_engineered['season'] == 2022]
teams = matches_2023['team'].unique().tolist()

#Creating dataframe to hold the feature variables for teams
columns = ['team','features','accuracy','precision']
df_features = pd.DataFrame(columns=columns)

#Predictors for match results
general_predictors = ["venue_code", "opp_code"]
cols_form = ["gf", "ga", "sh", "sot", "dist", "form", "xg", "xga", "poss", "sota", "save%", "cs", "psxg", "cmp", "cmp%", "prgdist",
            "ast", "ppa", "prgp", "sca", "gca", "tklw", "int", "tkl+int", "err", "succ", "succ%", "crdy", "fls", "won%"]
cols_avg = ["gf", "ga", "form", "xg", "xga", "poss", "cs"]
form_predictors = [f"{c}_rolling_3" for c in cols_form]
overall_predictors = [f"{c}_rolling_365" for c in cols_avg]
all_predictors = general_predictors + form_predictors + overall_predictors

# Find Best Feature Combination for Each Team's Model

In [159]:
for team in teams:
    print(f"Finding best features for {team}...")
    df = matches_engineered_grouped.get_group(team)
    df = df.sort_values(by='date', ascending=True)
    if len(df) > 100:
        test_n = int(len(df) * 0.2)
    else:
        test_n = int(len(df) * 0.3)
        
    train = df.iloc[:-test_n]
    test = df.iloc[-test_n:]
    print(f"Split {team} into {round((len(train)/(len(train)+len(test))),2)}:{round((len(test)/(len(train)+len(test))),2)} ratio")
    
    best_features = get_best_features(team,train,test,all_predictors)
    df_features = df_features.append(pd.Series(best_features, index=df_features.columns), ignore_index=True)
    df_features.to_csv('best_features.csv')

Finding best features for Arsenal...
Split Arsenal into 0.8:0.2 ratio

---- All Variables Model ----
Accuracy : 0.5151515151515151
Precision: 0.6666666666666666
Average  : 0.26666666666666666
-----------------------------

Starting best feature finding for 14 combinations...


Training: 100%|██████████| 16383/16383 [35:04<00:00,  7.78it/s]                                                        
  df_features = df_features.append(pd.Series(best_features, index=df_features.columns), ignore_index=True)



Model Training Complete!
Best feature variables: ['dist_rolling_3', 'sh_rolling_3']
---- Best Variables Model ----
Accuracy score: 0.696969696969697
Precision score: 0.7142857142857143
------------------------------

Finding best features for Aston Villa...
Split Aston Villa into 0.71:0.29 ratio

---- All Variables Model ----
Accuracy : 0.48148148148148145
Precision: 0.6666666666666666
Average  : 0.26666666666666666
-----------------------------

Starting best feature finding for 14 combinations...


Training: 100%|██████████| 16383/16383 [32:56<00:00,  8.29it/s]                                                        
  df_features = df_features.append(pd.Series(best_features, index=df_features.columns), ignore_index=True)



Model Training Complete!
Best feature variables: ['tklw_rolling_3', 'gca_rolling_3', 'sota_rolling_3']
---- Best Variables Model ----
Accuracy score: 0.8148148148148148
Precision score: 0.8666666666666667
------------------------------

Finding best features for Bournemouth...
Split Bournemouth into 0.71:0.29 ratio

---- All Variables Model ----
Accuracy : 0.38461538461538464
Precision: 0.6666666666666666
Average  : 0.26666666666666666
-----------------------------

Starting best feature finding for 14 combinations...


  _warn_prf(average, modifier, msg_start, len(result))
Training: 100%|██████████| 16383/16383 [32:43<00:00,  8.34it/s]                                                        
  df_features = df_features.append(pd.Series(best_features, index=df_features.columns), ignore_index=True)



Model Training Complete!
Best feature variables: ['save%_rolling_3', 'xga_rolling_3', 'xg_rolling_365', 'sot_rolling_3', 'gf_rolling_3', 'poss_rolling_365']
---- Best Variables Model ----
Accuracy score: 0.7692307692307693
Precision score: 0.8333333333333334
------------------------------

Finding best features for Brentford...
Split Brentford into 0.73:0.27 ratio

---- All Variables Model ----
Accuracy : 0.5
Precision: 0.6666666666666666
Average  : 0.26666666666666666
-----------------------------

Starting best feature finding for 14 combinations...


Training: 100%|██████████| 16383/16383 [32:42<00:00,  8.35it/s]                                                        
  df_features = df_features.append(pd.Series(best_features, index=df_features.columns), ignore_index=True)



Model Training Complete!
Best feature variables: ['psxg_rolling_3']
---- Best Variables Model ----
Accuracy score: 0.5
Precision score: 0.5
------------------------------

Finding best features for Brighton and Hove Albion...
Split Brighton and Hove Albion into 0.8:0.2 ratio

---- All Variables Model ----
Accuracy : 0.5333333333333333
Precision: 0.6666666666666666
Average  : 0.26666666666666666
-----------------------------

Starting best feature finding for 14 combinations...


Training: 100%|██████████| 16383/16383 [25:28<00:00, 10.71it/s]                                                        
  df_features = df_features.append(pd.Series(best_features, index=df_features.columns), ignore_index=True)



Model Training Complete!
Best feature variables: ['fls_rolling_3', 'xga_rolling_3', 'int_rolling_3', 'dist_rolling_3']
---- Best Variables Model ----
Accuracy score: 0.8
Precision score: 0.7692307692307693
------------------------------

Finding best features for Chelsea...
Split Chelsea into 0.81:0.19 ratio

---- All Variables Model ----
Accuracy : 0.5555555555555556
Precision: 0.6666666666666666
Average  : 0.26666666666666666
-----------------------------

Starting best feature finding for 14 combinations...


Training: 100%|██████████| 16383/16383 [23:02<00:00, 11.85it/s]                                                        
  df_features = df_features.append(pd.Series(best_features, index=df_features.columns), ignore_index=True)



Model Training Complete!
Best feature variables: ['prgdist_rolling_3', 'cmp_rolling_3', 'xg_rolling_365', 'crdy_rolling_3', 'cmp%_rolling_3', 'dist_rolling_3', 'fls_rolling_3']
---- Best Variables Model ----
Accuracy score: 0.7777777777777778
Precision score: 0.75
------------------------------

Finding best features for Crystal Palace...
Split Crystal Palace into 0.81:0.19 ratio

---- All Variables Model ----
Accuracy : 0.6129032258064516
Precision: 0.6666666666666666
Average  : 0.26666666666666666
-----------------------------

Starting best feature finding for 14 combinations...


Training: 100%|██████████| 16383/16383 [25:08<00:00, 10.86it/s]                                                        
  df_features = df_features.append(pd.Series(best_features, index=df_features.columns), ignore_index=True)



Model Training Complete!
Best feature variables: ['ppa_rolling_3', 'cmp%_rolling_3', 'tklw_rolling_3', 'succ_rolling_3', 'ga_rolling_365']
---- Best Variables Model ----
Accuracy score: 0.7741935483870968
Precision score: 0.75
------------------------------

Finding best features for Everton...
Split Everton into 0.8:0.2 ratio

---- All Variables Model ----
Accuracy : 0.5714285714285714
Precision: 0.6666666666666666
Average  : 0.26666666666666666
-----------------------------

Starting best feature finding for 14 combinations...


Training: 100%|██████████| 16383/16383 [23:15<00:00, 11.74it/s]                                                        
  df_features = df_features.append(pd.Series(best_features, index=df_features.columns), ignore_index=True)



Model Training Complete!
Best feature variables: ['sh_rolling_3', 'succ%_rolling_3', 'fls_rolling_3', 'dist_rolling_3', 'sca_rolling_3', 'cmp_rolling_3']
---- Best Variables Model ----
Accuracy score: 0.7857142857142857
Precision score: 0.875
------------------------------

Finding best features for Fulham...
Split Fulham into 0.71:0.29 ratio

---- All Variables Model ----
Accuracy : 0.4666666666666667
Precision: 0.6666666666666666
Average  : 0.26666666666666666
-----------------------------

Starting best feature finding for 14 combinations...


Training: 100%|██████████| 16383/16383 [22:14<00:00, 12.28it/s]                                                        
  df_features = df_features.append(pd.Series(best_features, index=df_features.columns), ignore_index=True)



Model Training Complete!
Best feature variables: ['won%_rolling_3']
---- Best Variables Model ----
Accuracy score: 0.8
Precision score: 0.7
------------------------------

Finding best features for Leeds United...
Split Leeds United into 0.7:0.3 ratio

---- All Variables Model ----
Accuracy : 0.375
Precision: 0.6666666666666666
Average  : 0.26666666666666666
-----------------------------

Starting best feature finding for 14 combinations...


Training: 100%|██████████| 16383/16383 [22:06<00:00, 12.35it/s]                                                        
  df_features = df_features.append(pd.Series(best_features, index=df_features.columns), ignore_index=True)



Model Training Complete!
Best feature variables: ['psxg_rolling_3', 'int_rolling_3', 'opp_code', 'xga_rolling_3']
---- Best Variables Model ----
Accuracy score: 0.75
Precision score: 0.6923076923076923
------------------------------

Finding best features for Leicester City...
Split Leicester City into 0.81:0.19 ratio

---- All Variables Model ----
Accuracy : 0.5806451612903226
Precision: 0.6666666666666666
Average  : 0.26666666666666666
-----------------------------

Starting best feature finding for 14 combinations...


Training: 100%|██████████| 16383/16383 [23:29<00:00, 11.63it/s]                                                        
  df_features = df_features.append(pd.Series(best_features, index=df_features.columns), ignore_index=True)



Model Training Complete!
Best feature variables: ['xg_rolling_365', 'sot_rolling_3', 'fls_rolling_3', 'succ_rolling_3']
---- Best Variables Model ----
Accuracy score: 0.8387096774193549
Precision score: 0.8235294117647058
------------------------------

Finding best features for Liverpool...
Split Liverpool into 0.8:0.2 ratio

---- All Variables Model ----
Accuracy : 0.5161290322580645
Precision: 0.6666666666666666
Average  : 0.26666666666666666
-----------------------------

Starting best feature finding for 14 combinations...


Training: 100%|██████████| 16383/16383 [25:07<00:00, 10.87it/s]                                                        
  df_features = df_features.append(pd.Series(best_features, index=df_features.columns), ignore_index=True)



Model Training Complete!
Best feature variables: ['form_rolling_365', 'poss_rolling_365', 'xg_rolling_365', 'opp_code', 'sh_rolling_3']
---- Best Variables Model ----
Accuracy score: 0.8387096774193549
Precision score: 1.0
------------------------------

Finding best features for Manchester City...
Split Manchester City into 0.81:0.19 ratio

---- All Variables Model ----
Accuracy : 0.4583333333333333
Precision: 0.6666666666666666
Average  : 0.26666666666666666
-----------------------------

Starting best feature finding for 14 combinations...


Training: 100%|██████████| 16383/16383 [29:20<00:00,  9.31it/s]                                                        
  df_features = df_features.append(pd.Series(best_features, index=df_features.columns), ignore_index=True)



Model Training Complete!
Best feature variables: ['xga_rolling_3', 'succ_rolling_3']
---- Best Variables Model ----
Accuracy score: 0.7916666666666666
Precision score: 0.7894736842105263
------------------------------

Finding best features for Manchester United...
Split Manchester United into 0.8:0.2 ratio

---- All Variables Model ----
Accuracy : 0.40625
Precision: 0.6666666666666666
Average  : 0.26666666666666666
-----------------------------

Starting best feature finding for 14 combinations...


Training: 100%|██████████| 16383/16383 [26:20<00:00, 10.36it/s]                                                        
  df_features = df_features.append(pd.Series(best_features, index=df_features.columns), ignore_index=True)



Model Training Complete!
Best feature variables: ['tklw_rolling_3', 'xg_rolling_365', 'psxg_rolling_3', 'gca_rolling_3']
---- Best Variables Model ----
Accuracy score: 0.6875
Precision score: 0.65
------------------------------

Finding best features for Newcastle United...
Split Newcastle United into 0.8:0.2 ratio

---- All Variables Model ----
Accuracy : 0.43333333333333335
Precision: 0.6666666666666666
Average  : 0.26666666666666666
-----------------------------

Starting best feature finding for 14 combinations...


Training: 100%|██████████| 16383/16383 [30:32<00:00,  8.94it/s]                                                        
  df_features = df_features.append(pd.Series(best_features, index=df_features.columns), ignore_index=True)



Model Training Complete!
Best feature variables: ['opp_code', 'prgp_rolling_3', 'save%_rolling_3', 'cmp%_rolling_3', 'succ%_rolling_3', 'ga_rolling_365', 'dist_rolling_3']
---- Best Variables Model ----
Accuracy score: 0.7666666666666667
Precision score: 0.625
------------------------------

Finding best features for Southampton...
Split Southampton into 0.8:0.2 ratio

---- All Variables Model ----
Accuracy : 0.5
Precision: 0.6666666666666666
Average  : 0.26666666666666666
-----------------------------

Starting best feature finding for 14 combinations...


Training: 100%|██████████| 16383/16383 [34:49<00:00,  7.84it/s]                                                        
  df_features = df_features.append(pd.Series(best_features, index=df_features.columns), ignore_index=True)



Model Training Complete!
Best feature variables: ['xga_rolling_3', 'dist_rolling_3']
---- Best Variables Model ----
Accuracy score: 0.7666666666666667
Precision score: 0.75
------------------------------

Finding best features for Tottenham Hotspur...
Split Tottenham Hotspur into 0.8:0.2 ratio

---- All Variables Model ----
Accuracy : 0.5806451612903226
Precision: 0.6666666666666666
Average  : 0.26666666666666666
-----------------------------

Starting best feature finding for 14 combinations...


Training: 100%|██████████| 16383/16383 [28:43<00:00,  9.51it/s]                                                        
  df_features = df_features.append(pd.Series(best_features, index=df_features.columns), ignore_index=True)



Model Training Complete!
Best feature variables: ['won%_rolling_3', 'xg_rolling_365', 'xg_rolling_3']
---- Best Variables Model ----
Accuracy score: 0.7096774193548387
Precision score: 0.8
------------------------------

Finding best features for West Ham United...
Split West Ham United into 0.8:0.2 ratio

---- All Variables Model ----
Accuracy : 0.45454545454545453
Precision: 0.6666666666666666
Average  : 0.26666666666666666
-----------------------------

Starting best feature finding for 14 combinations...


Training: 100%|██████████| 16383/16383 [27:27<00:00,  9.94it/s]                                                        
  df_features = df_features.append(pd.Series(best_features, index=df_features.columns), ignore_index=True)



Model Training Complete!
Best feature variables: ['dist_rolling_3', 'tkl+int_rolling_3', 'won%_rolling_3', 'ast_rolling_3', 'succ_rolling_3', 'save%_rolling_3']
---- Best Variables Model ----
Accuracy score: 0.7575757575757576
Precision score: 0.6666666666666666
------------------------------

Finding best features for Wolverhampton Wanderers...
Split Wolverhampton Wanderers into 0.8:0.2 ratio

---- All Variables Model ----
Accuracy : 0.64
Precision: 0.6666666666666666
Average  : 0.26666666666666666
-----------------------------

Starting best feature finding for 14 combinations...


  _warn_prf(average, modifier, msg_start, len(result))
Training: 100%|██████████| 16383/16383 [25:22<00:00, 10.76it/s]                                                        


Model Training Complete!
Best feature variables: ['xga_rolling_3', 'poss_rolling_365', 'prgdist_rolling_3', 'dist_rolling_3', 'xg_rolling_365', 'save%_rolling_3']
---- Best Variables Model ----
Accuracy score: 0.88
Precision score: 1.0
------------------------------




  df_features = df_features.append(pd.Series(best_features, index=df_features.columns), ignore_index=True)


In [158]:
def get_best_features(team,train,test,predictors):
    rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)
    rf.fit(train[predictors], train["btts"])
    preds = rf.predict(test[predictors])

    acc = accuracy_score(test["btts"], preds)
    print("")
    print("---- All Variables Model ----")
    print("Accuracy :", acc)
    print("Precision:", prec)
    print("Average  :", avg)
    print("-----------------------------")
    print("")

    importances = rf.feature_importances_
    forest_importances = pd.Series(importances, index=predictors)
    forest_importances = forest_importances.sort_values(ascending=False)
    important_vars = forest_importances.iloc[:14]
    important_vars = important_vars.index.tolist()
    feature_vars = important_vars
    target_var = 'btts'

    # create a list of all possible feature variable combinations
    feature_combinations = []
    for i in range(1, len(feature_vars) + 1):
        for combo in itertools.combinations(feature_vars, i):
            feature_combinations.append(list(combo))

    # train and test models for each feature variable combination
    best_accuracy = 0
    best_feature_combo = None
    print(f"Starting best feature finding for {len(important_vars)} combinations...")
    for feature_combo in tqdm(feature_combinations, desc='Training', bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}', position=0):
        rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)
        rf.fit(train[feature_combo], train[target_var])
        preds = rf.predict(test[feature_combo])
        accuracy = accuracy_score(test['btts'], preds)
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_precision = precision_score(test['btts'], preds)
            best_feature_combo = feature_combo

    # print the best feature variable combination and its accuracy
    print("")
    print("Model Training Complete!")
    print('Best feature variables:', best_feature_combo)
    print('---- Best Variables Model ----')
    print('Accuracy score:', best_accuracy)
    print('Precision score:', best_precision)
    print('------------------------------')
    print('')
    
    # return the best features and data in a list
    best_features = [team,best_feature_combo,round(best_accuracy,2),round(best_precision,2)]
    return best_features