In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import model_selection
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [4]:
import os
import glob
from pathlib import Path

# Locate the cleaned result files under ./Data/Results_Cleaned (relative to
# the current working directory).
path = os.getcwd()+'/Data'
furtherpath = path+'/Results_Cleaned'
# Collect every CSV below that folder. The paths are sorted so the row order
# of the combined frame (and therefore the seeded train/test split later on)
# does not depend on the order the filesystem happens to list files in.
# The directory string in `path` is no longer overwritten by a loop variable.
file_list = sorted(Path(furtherpath).rglob('*.csv'))
if not file_list:
    # Fail with a clear message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found under {furtherpath}")
#combine all files in the list
combined_csv = pd.concat([pd.read_csv(f) for f in file_list])
#export to csv
combined_csv.to_csv( "combined_clean.csv", index=False, encoding='utf-8-sig')

In [5]:
# Display the combined frame as the cell's output.
combined_csv

Unnamed: 0,Home_Team,Away_Team,Result,Link,Season,Round,League,Number_Teams,Total_Rounds,Label,...,Total_Streak_Away,Wins_When_Away,Draw_When_Away,Lose_When_Away,Goals_For_When_Away,Goals_Against_When_Away,Streak_When_Home,Streak_When_Away,Elo_Home,Elo_Away
0,Birmingham City,Fulham,2-2,https://www.besoccer.com/match/birmingham-city...,2000,1,championship,24,46,1.0,...,0.0,0,0,0,0,0,0.0,0.0,58.0,48.0
1,Blackburn Rovers,Port Vale,0-0,https://www.besoccer.com/match/blackburn-rover...,2000,1,championship,24,46,1.0,...,0.0,0,0,0,0,0,0.0,0.0,69.0,53.0
2,Charlton Athletic,Barnsley,3-1,https://www.besoccer.com/match/charlton-athlet...,2000,1,championship,24,46,0.0,...,0.0,0,0,0,0,0,0.0,0.0,59.0,58.0
3,Crystal Palace,Crewe Alexandra,1-1,https://www.besoccer.com/match/crystal-palace-...,2000,1,championship,24,46,1.0,...,0.0,0,0,0,0,0,0.0,0.0,61.0,46.0
4,Grimsby Town,Stockport County,0-1,https://www.besoccer.com/match/grimsby-town/st...,2000,1,championship,24,46,2.0,...,0.0,0,0,0,0,0,0.0,0.0,53.0,54.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,Sampdoria,AlzanoCene,3-2,https://www.besoccer.com/match/sampdoria/alzan...,2000,38,serie_b,20,38,0.0,...,3.8,1,7,10,10,23,7.2,0.6,,
376,Vicenza,FC Savoia 1908,3-2,https://www.besoccer.com/match/vicenza-calcio/...,2000,38,serie_b,20,38,0.0,...,0.0,1,3,14,14,38,9.0,1.0,,
377,Cosenza Calcio,Brescia,2-2,https://www.besoccer.com/match/fortitudo-cosen...,2000,38,serie_b,20,38,1.0,...,5.4,5,9,4,21,24,7.8,1.4,,
378,Pescara,AC Monza,3-3,https://www.besoccer.com/match/pescara-calcio/...,2000,38,serie_b,20,38,1.0,...,4.8,2,7,9,19,29,7.8,3.2,,


In [6]:
# List the column labels of the combined frame.
combined_csv.columns

Index(['Home_Team', 'Away_Team', 'Result', 'Link', 'Season', 'Round', 'League',
       'Number_Teams', 'Total_Rounds', 'Label', 'Goals_For_Home',
       'Goals_For_Away', 'Position_Home', 'Points_Home', 'Total_Wins_Home',
       'Total_Draw_Home', 'Total_Lose_Home', 'Total_Goals_For_Home_Team',
       'Total_Goals_Against_Home_Team', 'Total_Streak_Home', 'Wins_When_Home',
       'Draw_When_Home', 'Lose_When_Home', 'Goals_For_When_Home',
       'Goals_Against_When_Home', 'Position_Away', 'Points_Away',
       'Total_Wins_Away', 'Total_Draw_Away', 'Total_Lose_Away',
       'Total_Goals_For_Away_Team', 'Total_Goals_Against_Away_Team',
       'Total_Streak_Away', 'Wins_When_Away', 'Draw_When_Away',
       'Lose_When_Away', 'Goals_For_When_Away', 'Goals_Against_When_Away',
       'Streak_When_Home', 'Streak_When_Away', 'Elo_Home', 'Elo_Away'],
      dtype='object')

In [17]:
# Columns used as model inputs, in the order they appear in the data.
features = [
    # goals columns for the match itself
    'Goals_For_Home',
    'Goals_For_Away',
    # home-side columns
    'Position_Home',
    'Points_Home',
    'Total_Wins_Home',
    'Total_Draw_Home',
    'Total_Lose_Home',
    'Total_Goals_For_Home_Team',
    'Total_Goals_Against_Home_Team',
    'Total_Streak_Home',
    'Wins_When_Home',
    'Draw_When_Home',
    'Lose_When_Home',
    'Goals_For_When_Home',
    'Goals_Against_When_Home',
    # away-side columns
    'Position_Away',
    'Points_Away',
    'Total_Wins_Away',
    'Total_Draw_Away',
    'Total_Lose_Away',
    'Total_Goals_For_Away_Team',
    'Total_Goals_Against_Away_Team',
    'Total_Streak_Away',
    'Wins_When_Away',
    'Draw_When_Away',
    'Lose_When_Away',
    'Goals_For_When_Away',
    'Goals_Against_When_Away',
    # venue-specific streak columns
    'Streak_When_Home',
    'Streak_When_Away',
]

In [18]:
# Display only the columns named in `features`.
combined_csv[features]

Unnamed: 0,Goals_For_Home,Goals_For_Away,Position_Home,Points_Home,Total_Wins_Home,Total_Draw_Home,Total_Lose_Home,Total_Goals_For_Home_Team,Total_Goals_Against_Home_Team,Total_Streak_Home,...,Total_Goals_For_Away_Team,Total_Goals_Against_Away_Team,Total_Streak_Away,Wins_When_Away,Draw_When_Away,Lose_When_Away,Goals_For_When_Away,Goals_Against_When_Away,Streak_When_Home,Streak_When_Away
0,2.0,2.0,0,0,0,0,0,0,0,0.0,...,0,0,0.0,0,0,0,0,0,0.0,0.0
1,0.0,0.0,0,0,0,0,0,0,0,0.0,...,0,0,0.0,0,0,0,0,0,0.0,0.0
2,3.0,1.0,0,0,0,0,0,0,0,0.0,...,0,0,0.0,0,0,0,0,0,0.0,0.0
3,1.0,1.0,0,0,0,0,0,0,0,0.0,...,0,0,0.0,0,0,0,0,0,0.0,0.0
4,0.0,1.0,0,0,0,0,0,0,0,0.0,...,0,0,0.0,0,0,0,0,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,3.0,2.0,5,59,16,11,10,42,38,7.8,...,37,48,3.8,1,7,10,10,23,7.2,0.6
376,3.0,2.0,1,64,19,7,11,66,43,4.2,...,34,59,0.0,1,3,14,14,38,9.0,1.0
377,2.0,2.0,11,47,11,14,12,34,39,4.6,...,52,36,5.4,5,9,4,21,24,7.8,1.4
378,3.0,3.0,16,46,10,16,11,59,52,3.6,...,42,43,4.8,2,7,9,19,29,7.8,3.2


In [19]:
def eval_result(result:str):
    """Convert a score string such as ``"3-1"`` into a goal difference.

    Parameters
    ----------
    result : str
        Score text whose first two ``-``-separated fields are integers.

    Returns
    -------
    int or str
        ``int(first) - int(second)`` when both fields parse, otherwise the
        sentinel string ``"Unknown"`` (non-numeric fields, a missing second
        field, or a non-string value such as a NaN cell).
    """
    try:
        # The split is inside the try so that a non-string value (no
        # ``.split`` attribute) yields the sentinel instead of raising.
        result_list = result.split("-")
        return int(result_list[0])-int(result_list[1])
    except (AttributeError, IndexError, ValueError):
        # Only parsing failures are swallowed; a bare ``except`` would also
        # hide KeyboardInterrupt and unrelated programming errors.
        return "Unknown"
    
# Goal difference for every match, in row order. Only the 'Result' column is
# read, so iterate over that column directly rather than materialising a
# Series per row with iterrows().
differenceList = [eval_result(result=score) for score in combined_csv['Result']]

# A plain list is assigned by position, so the new column lines up with the
# rows regardless of the frame's index labels.
combined_csv['difference'] = differenceList

In [20]:
# Feature matrix (one column per name in `features`) and target vector.
# NOTE(review): eval_result returns the string "Unknown" for unparsable
# results; if any such rows exist, y will not be purely numeric — confirm.
X = np.array(combined_csv[features])
y = np.array(combined_csv['difference'])

In [21]:
# Dimensions of the feature matrix: (rows, columns).
X.shape

(146487, 30)

In [22]:
# Dimensions of the target vector; its length should match the rows of X.
y.shape

(146487,)

In [23]:
# Hold out 20% of the rows for testing; random_state=4 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

In [24]:
# Fit a linear regression of goal difference on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [26]:
# Fitted intercept (b) and coefficients (m, one per column of X), printed
# coefficients first.
b = model.intercept_
m = model.coef_
print(m, b)

[ 1.00000000e+00 -1.00000000e+00  5.06773898e-17 -1.24667754e-14
  3.86209596e-14  1.31957930e-14  2.10412789e-16 -2.36697697e-16
  6.04305127e-17 -1.42992227e-16 -9.55123617e-16 -7.66265855e-16
 -5.34970882e-16  7.96177647e-17 -2.38670947e-17 -3.49472719e-16
 -2.00904076e-15  5.70377079e-15  1.91513472e-15  0.00000000e+00
 -5.55111512e-17 -2.22044605e-16 -1.87350135e-16  1.01307851e-15
  9.57567359e-16  3.74700271e-16 -2.91433544e-16  1.11022302e-16
 -8.32667268e-17 -5.55111512e-17] 4.6074255521944e-15


In [27]:
# R-squared value: the closer to one, the better.
# NOTE(review): this is scored on the training split; the held-out
# X_test / y_test are not evaluated in this cell.
model.score(X_train, y_train)

1.0

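At least we know that the pipeline works... This feature set includes Goals_For_Home and Goals_For_Away, which receive weights of 1 and -1 respectively, because the target (goal difference) is exactly their difference. The perfect score therefore comes from the target leaking into the features, not from any predictive skill. Now to do the same, but with these two features removed.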

In [28]:
# Model inputs with the two match-goal columns left out.
features = [
    # home-side columns
    'Position_Home',
    'Points_Home',
    'Total_Wins_Home',
    'Total_Draw_Home',
    'Total_Lose_Home',
    'Total_Goals_For_Home_Team',
    'Total_Goals_Against_Home_Team',
    'Total_Streak_Home',
    'Wins_When_Home',
    'Draw_When_Home',
    'Lose_When_Home',
    'Goals_For_When_Home',
    'Goals_Against_When_Home',
    # away-side columns
    'Position_Away',
    'Points_Away',
    'Total_Wins_Away',
    'Total_Draw_Away',
    'Total_Lose_Away',
    'Total_Goals_For_Away_Team',
    'Total_Goals_Against_Away_Team',
    'Total_Streak_Away',
    'Wins_When_Away',
    'Draw_When_Away',
    'Lose_When_Away',
    'Goals_For_When_Away',
    'Goals_Against_When_Away',
    # venue-specific streak columns
    'Streak_When_Home',
    'Streak_When_Away',
]

In [29]:
# Rebuild the feature matrix from the current `features` list; the target is
# the same 'difference' column as before.
X = np.array(combined_csv[features])
y = np.array(combined_csv['difference'])

In [30]:
# Same 80/20 split settings (test_size=0.2, random_state=4) as the earlier run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

In [31]:
# Refit a fresh linear regression on the current training split
# (this rebinds `model`).
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [32]:
# Intercept (b) and coefficients (m) of the refitted model; `m` is reused
# below when pairing weights with feature names.
b = model.intercept_
m = model.coef_
print(m, b)

[-0.0169137  -0.00777828 -0.00564078  0.00914408  0.02304197  0.03266904
 -0.02295493  0.03266082 -0.01339504 -0.01252053 -0.02369427 -0.00627958
  0.00114657  0.01447244  0.0103409   0.00356592 -0.00035685 -0.01213202
 -0.03000784  0.02753704 -0.01808117 -0.00969982 -0.01411911 -0.00126563
  0.00655419 -0.00856458  0.03724002 -0.05610376] 0.40417084269616227


In [33]:
# R-squared of the refitted model.
# NOTE(review): evaluated on the training split, not on X_test / y_test.
model.score(X_train, y_train)

0.1658780890813737

Assessing the features and their weights, as determined by the model.

In [36]:
# Pair each feature name with the coefficient at the same position in `m`.
weights = list(zip(features, m))

In [38]:
# Tabulate the (feature, weight) pairs as a two-column frame.
# NOTE(review): the chained assignment also binds the same frame to `df`;
# confirm whether that alias is intentional.
weights_df = df = pd.DataFrame(weights, columns = ['Feature', 'Weight'])

In [39]:
# Display the feature/weight table.
weights_df

Unnamed: 0,Feature,Weight
0,Position_Home,-0.016914
1,Points_Home,-0.007778
2,Total_Wins_Home,-0.005641
3,Total_Draw_Home,0.009144
4,Total_Lose_Home,0.023042
5,Total_Goals_For_Home_Team,0.032669
6,Total_Goals_Against_Home_Team,-0.022955
7,Total_Streak_Home,0.032661
8,Wins_When_Home,-0.013395
9,Draw_When_Home,-0.012521
