In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import model_selection
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [2]:
import os
import glob
from pathlib import Path

path = os.getcwd()+'/Data'
furtherpath = path+'/Results_Cleaned'
file_list = []
for path in Path(furtherpath).rglob('*.csv'):
    file_list.append(path)
#combine all files in the list
combined_csv = pd.concat([pd.read_csv(f) for f in file_list])
#export to csv
combined_csv.to_csv( "combined_clean.csv", index=False, encoding='utf-8-sig')

In [3]:
combined_csv

Unnamed: 0,Home_Team,Away_Team,Result,Link,Season,Round,League,Number_Teams,Total_Rounds,Label,...,Total_Streak_Away,Wins_When_Away,Draw_When_Away,Lose_When_Away,Goals_For_When_Away,Goals_Against_When_Away,Streak_When_Home,Streak_When_Away,Elo_Home,Elo_Away
0,Birmingham City,Fulham,2-2,https://www.besoccer.com/match/birmingham-city...,2000,1,championship,24,46,1.0,...,0.0,0,0,0,0,0,0.0,0.0,58.0,48.0
1,Blackburn Rovers,Port Vale,0-0,https://www.besoccer.com/match/blackburn-rover...,2000,1,championship,24,46,1.0,...,0.0,0,0,0,0,0,0.0,0.0,69.0,53.0
2,Charlton Athletic,Barnsley,3-1,https://www.besoccer.com/match/charlton-athlet...,2000,1,championship,24,46,0.0,...,0.0,0,0,0,0,0,0.0,0.0,59.0,58.0
3,Crystal Palace,Crewe Alexandra,1-1,https://www.besoccer.com/match/crystal-palace-...,2000,1,championship,24,46,1.0,...,0.0,0,0,0,0,0,0.0,0.0,61.0,46.0
4,Grimsby Town,Stockport County,0-1,https://www.besoccer.com/match/grimsby-town/st...,2000,1,championship,24,46,2.0,...,0.0,0,0,0,0,0,0.0,0.0,53.0,54.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,Sampdoria,AlzanoCene,3-2,https://www.besoccer.com/match/sampdoria/alzan...,2000,38,serie_b,20,38,0.0,...,3.8,1,7,10,10,23,7.2,0.6,,
376,Vicenza,FC Savoia 1908,3-2,https://www.besoccer.com/match/vicenza-calcio/...,2000,38,serie_b,20,38,0.0,...,0.0,1,3,14,14,38,9.0,1.0,,
377,Cosenza Calcio,Brescia,2-2,https://www.besoccer.com/match/fortitudo-cosen...,2000,38,serie_b,20,38,1.0,...,5.4,5,9,4,21,24,7.8,1.4,,
378,Pescara,AC Monza,3-3,https://www.besoccer.com/match/pescara-calcio/...,2000,38,serie_b,20,38,1.0,...,4.8,2,7,9,19,29,7.8,3.2,,


In [4]:
combined_csv.columns

Index(['Home_Team', 'Away_Team', 'Result', 'Link', 'Season', 'Round', 'League',
       'Number_Teams', 'Total_Rounds', 'Label', 'Goals_For_Home',
       'Goals_For_Away', 'Position_Home', 'Points_Home', 'Total_Wins_Home',
       'Total_Draw_Home', 'Total_Lose_Home', 'Total_Goals_For_Home_Team',
       'Total_Goals_Against_Home_Team', 'Total_Streak_Home', 'Wins_When_Home',
       'Draw_When_Home', 'Lose_When_Home', 'Goals_For_When_Home',
       'Goals_Against_When_Home', 'Position_Away', 'Points_Away',
       'Total_Wins_Away', 'Total_Draw_Away', 'Total_Lose_Away',
       'Total_Goals_For_Away_Team', 'Total_Goals_Against_Away_Team',
       'Total_Streak_Away', 'Wins_When_Away', 'Draw_When_Away',
       'Lose_When_Away', 'Goals_For_When_Away', 'Goals_Against_When_Away',
       'Streak_When_Home', 'Streak_When_Away', 'Elo_Home', 'Elo_Away'],
      dtype='object')

In [5]:
features = ['Goals_For_Home',
       'Goals_For_Away', 'Position_Home', 'Points_Home', 'Total_Wins_Home',
       'Total_Draw_Home', 'Total_Lose_Home', 'Total_Goals_For_Home_Team',
       'Total_Goals_Against_Home_Team', 'Total_Streak_Home', 'Wins_When_Home',
       'Draw_When_Home', 'Lose_When_Home', 'Goals_For_When_Home',
       'Goals_Against_When_Home', 'Position_Away', 'Points_Away',
       'Total_Wins_Away', 'Total_Draw_Away', 'Total_Lose_Away',
       'Total_Goals_For_Away_Team', 'Total_Goals_Against_Away_Team',
       'Total_Streak_Away', 'Wins_When_Away', 'Draw_When_Away',
       'Lose_When_Away', 'Goals_For_When_Away', 'Goals_Against_When_Away',
       'Streak_When_Home', 'Streak_When_Away',]

In [6]:
combined_csv[features]

Unnamed: 0,Goals_For_Home,Goals_For_Away,Position_Home,Points_Home,Total_Wins_Home,Total_Draw_Home,Total_Lose_Home,Total_Goals_For_Home_Team,Total_Goals_Against_Home_Team,Total_Streak_Home,...,Total_Goals_For_Away_Team,Total_Goals_Against_Away_Team,Total_Streak_Away,Wins_When_Away,Draw_When_Away,Lose_When_Away,Goals_For_When_Away,Goals_Against_When_Away,Streak_When_Home,Streak_When_Away
0,2.0,2.0,0,0,0,0,0,0,0,0.0,...,0,0,0.0,0,0,0,0,0,0.0,0.0
1,0.0,0.0,0,0,0,0,0,0,0,0.0,...,0,0,0.0,0,0,0,0,0,0.0,0.0
2,3.0,1.0,0,0,0,0,0,0,0,0.0,...,0,0,0.0,0,0,0,0,0,0.0,0.0
3,1.0,1.0,0,0,0,0,0,0,0,0.0,...,0,0,0.0,0,0,0,0,0,0.0,0.0
4,0.0,1.0,0,0,0,0,0,0,0,0.0,...,0,0,0.0,0,0,0,0,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,3.0,2.0,5,59,16,11,10,42,38,7.8,...,37,48,3.8,1,7,10,10,23,7.2,0.6
376,3.0,2.0,1,64,19,7,11,66,43,4.2,...,34,59,0.0,1,3,14,14,38,9.0,1.0
377,2.0,2.0,11,47,11,14,12,34,39,4.6,...,52,36,5.4,5,9,4,21,24,7.8,1.4
378,3.0,3.0,16,46,10,16,11,59,52,3.6,...,42,43,4.8,2,7,9,19,29,7.8,3.2


In [7]:
def eval_result(result:str):
    result_list = result.split("-")
    try:
        return int(result_list[0])-int(result_list[1])
    except:
        return "Unknown"
    
differenceList = []
for index, row in combined_csv.iterrows():
    result = row['Result']
    difference=eval_result(result=result)
    differenceList.append(difference)
    
combined_csv['difference'] = differenceList

In [8]:
X = np.array(combined_csv[features])
y = np.array(combined_csv['difference'])

In [9]:
X.shape

(146487, 30)

In [10]:
y.shape

(146487,)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

In [12]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [13]:
b = model.intercept_
m = model.coef_
print(m, b)

[ 1.00000000e+00 -1.00000000e+00  5.06773898e-17 -1.24667754e-14
  3.86209596e-14  1.31957930e-14  2.10412789e-16 -2.36697697e-16
  6.04305127e-17 -1.42992227e-16 -9.55123617e-16 -7.66265855e-16
 -5.34970882e-16  7.96177647e-17 -2.38670947e-17 -3.49472719e-16
 -2.00904076e-15  5.70377079e-15  1.91513472e-15  0.00000000e+00
 -5.55111512e-17 -2.22044605e-16 -1.87350135e-16  1.01307851e-15
  9.57567359e-16  3.74700271e-16 -2.91433544e-16  1.11022302e-16
 -8.32667268e-17 -5.55111512e-17] 4.6074255521944e-15


In [14]:
#r squred value. the closer to one the better
model.score(X_train, y_train)

1.0

At least we know that the model works... This feature set includes Goals for Home and Goals for Away which respectively have a weighting of 1 and -1 for the end result of Goal Difference. Now to do the same but remove these features.

In [15]:
features = ['Position_Home', 'Points_Home', 'Total_Wins_Home',
       'Total_Draw_Home', 'Total_Lose_Home', 'Total_Goals_For_Home_Team',
       'Total_Goals_Against_Home_Team', 'Total_Streak_Home', 'Wins_When_Home',
       'Draw_When_Home', 'Lose_When_Home', 'Goals_For_When_Home',
       'Goals_Against_When_Home', 'Position_Away', 'Points_Away',
       'Total_Wins_Away', 'Total_Draw_Away', 'Total_Lose_Away',
       'Total_Goals_For_Away_Team', 'Total_Goals_Against_Away_Team',
       'Total_Streak_Away', 'Wins_When_Away', 'Draw_When_Away',
       'Lose_When_Away', 'Goals_For_When_Away', 'Goals_Against_When_Away',
       'Streak_When_Home', 'Streak_When_Away',]

In [16]:
X = np.array(combined_csv[features])
y = np.array(combined_csv['difference'])

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

In [18]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [19]:
b = model.intercept_
m = model.coef_
print(m, b)

[-0.0169137  -0.00777828 -0.00564078  0.00914408  0.02304197  0.03266904
 -0.02295493  0.03266082 -0.01339504 -0.01252053 -0.02369427 -0.00627958
  0.00114657  0.01447244  0.0103409   0.00356592 -0.00035685 -0.01213202
 -0.03000784  0.02753704 -0.01808117 -0.00969982 -0.01411911 -0.00126563
  0.00655419 -0.00856458  0.03724002 -0.05610376] 0.40417084269616227


In [20]:
model.score(X_train, y_train)

0.1658780890813737

Assessing the features and their weights, as determined by the model.

In [21]:
weights = list(zip(features, m**2))

In [22]:
weights_df = df = pd.DataFrame(weights, columns = ['Feature', 'Weight_sq'])

In [23]:
weights_df.sort_values(by=['Weight_sq'])

Unnamed: 0,Feature,Weight_sq
16,Total_Draw_Away,1.273437e-07
12,Goals_Against_When_Home,1.314619e-06
23,Lose_When_Away,1.601831e-06
15,Total_Wins_Away,1.271578e-05
2,Total_Wins_Home,3.181845e-05
11,Goals_For_When_Home,3.943308e-05
24,Goals_For_When_Away,4.295747e-05
1,Points_Home,6.050163e-05
25,Goals_Against_When_Away,7.335195e-05
3,Total_Draw_Home,8.361411e-05


Let us try to normalize the data beforehand and view the results.

In [24]:
from sklearn import preprocessing
X_normalized = preprocessing.normalize(X, axis=0)

In [25]:
X

array([[ 0. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       ...,
       [11. , 47. , 11. , ..., 24. ,  7.8,  1.4],
       [16. , 46. , 10. , ..., 29. ,  7.8,  3.2],
       [14. , 46. , 10. , ..., 39. ,  5.8,  0.4]])

In [26]:
X_normalized[1100]

array([0.0048496 , 0.00389322, 0.00384342, 0.00293932, 0.00708718,
       0.00505261, 0.00734015, 0.00165169, 0.00438664, 0.00222897,
       0.00405521, 0.00524017, 0.00573972, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.00287113, 0.        ])

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.2, random_state=4)
model = LinearRegression()
model.fit(X_train, y_train)
b = model.intercept_
m = model.coef_
print(m, b)

[-7.67283045e+01  3.11897299e+09 -2.62161847e+09 -7.03178741e+08
  7.47783305e+01  3.68549394e+02 -2.50185374e+02  5.53678803e+01
 -3.35896169e+01 -2.24686525e+01 -4.09007008e+01 -4.79342939e+01
  6.59230961e+00  6.39817218e+01  1.05686151e+10 -8.77877104e+09
 -2.44957007e+09 -4.03431164e+01 -3.23727079e+02  3.03487237e+02
 -3.10045863e+01 -1.66382937e+01 -2.50678226e+01 -3.13777751e+00
  3.74034757e+01 -6.42861843e+01  7.26349837e+01 -7.57123484e+01] 0.40417632164785233


In [28]:
model.score(X_train, y_train)

0.16587808904218093

In [42]:
X_mod = np.round(model.predict(X_test))
result_mod = X_mod - y_test
len(result_mod[np.where(result_mod == 0)])/len(X_mod)


0.33032971533893096

In [43]:
for i, num in enumerate(result_mod):
    if num > 1:
        result_mod[i] = 1
    elif num < -1:
        result_mod[i] = -1
len(result_mod[np.where(result_mod == 0)])/len(X_mod)
        

0.33032971533893096

In [30]:
model.predict(X_test)

array([0.59824519, 0.59351662, 0.69781264, ..., 0.45592991, 0.62520075,
       0.40417632])

In [31]:
model.score(X_mod, y_test)

ValueError: Expected 2D array, got 1D array instead:
array=[1. 1. 1. ... 0. 1. 0.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [32]:
X_train[12]

array([0.00066131, 0.0026507 , 0.00266083, 0.00183708, 0.00092441,
       0.00177285, 0.00110102, 0.00306742, 0.00239271, 0.00111449,
       0.00057932, 0.00183406, 0.00139145, 0.00113098, 0.0021579 ,
       0.0021822 , 0.00148963, 0.00180433, 0.0018539 , 0.00136103,
       0.00139962, 0.00116597, 0.00112647, 0.00201663, 0.00070092,
       0.00133226, 0.00317875, 0.00222304])

In [33]:
weights = list(zip(features, m**2))
weights_df = df = pd.DataFrame(weights, columns = ['Feature', 'Weight_sq'])
weights_df.sort_values(by=['Weight_sq'])

Unnamed: 0,Feature,Weight_sq
23,Lose_When_Away,9.845648
12,Goals_Against_When_Home,43.45855
21,Wins_When_Away,276.8328
9,Draw_When_Home,504.8403
22,Draw_When_Away,628.3957
20,Total_Streak_Away,961.2844
8,Wins_When_Home,1128.262
24,Goals_For_When_Away,1399.02
17,Total_Lose_Away,1627.567
10,Lose_When_Home,1672.867


In [34]:
X[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [35]:
combined_csv.to_csv( "combined_clean.csv", index=False, encoding='utf-8-sig')

In [36]:
combined_csv[combined_csv["Round"]>20][features]

Unnamed: 0,Position_Home,Points_Home,Total_Wins_Home,Total_Draw_Home,Total_Lose_Home,Total_Goals_For_Home_Team,Total_Goals_Against_Home_Team,Total_Streak_Home,Wins_When_Home,Draw_When_Home,...,Total_Goals_For_Away_Team,Total_Goals_Against_Away_Team,Total_Streak_Away,Wins_When_Away,Draw_When_Away,Lose_When_Away,Goals_For_When_Away,Goals_Against_When_Away,Streak_When_Home,Streak_When_Away
240,22,18,4,6,10,17,30,3.8,2,4,...,21,17,1.2,2,6,2,10,10,5.0,1.2
241,5,33,9,6,5,32,22,5.8,7,2,...,14,32,1.4,1,2,7,4,15,7.8,1.0
242,12,26,6,8,6,24,21,3.8,4,4,...,25,27,4.0,3,2,5,8,16,4.4,3.0
243,2,41,13,2,5,38,23,6.0,7,1,...,24,29,5.6,1,4,5,13,17,4.6,4.4
244,19,22,5,7,8,28,34,5.6,4,4,...,23,24,3.0,1,2,7,7,15,6.0,1.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,5,59,16,11,10,42,38,7.8,9,6,...,37,48,3.8,1,7,10,10,23,7.2,0.6
376,1,64,19,7,11,66,43,4.2,15,2,...,34,59,0.0,1,3,14,14,38,9.0,1.0
377,11,47,11,14,12,34,39,4.6,10,5,...,52,36,5.4,5,9,4,21,24,7.8,1.4
378,16,46,10,16,11,59,52,3.6,8,6,...,42,43,4.8,2,7,9,19,29,7.8,3.2


In [37]:
def lin_reg(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)
    model = LinearRegression()
    model.fit(X_train, y_train)
    b = model.intercept_
    m = model.coef_
    return (m, b, model.score(X_train, y_train))

linear_reg_dict = {}

for i in range(30):
    X = np.array(combined_csv[combined_csv["Round"]>i][features])
    y = np.array(combined_csv[combined_csv["Round"]>i]['difference'])
    linear_reg_dict[i] = lin_reg(X, y)

In [38]:
linear_reg_dict

{0: (array([-0.0169137 , -0.00777828, -0.00564078,  0.00914408,  0.02304197,
          0.03266904, -0.02295493,  0.03266082, -0.01339504, -0.01252053,
         -0.02369427, -0.00627958,  0.00114657,  0.01447244,  0.0103409 ,
          0.00356592, -0.00035685, -0.01213202, -0.03000784,  0.02753704,
         -0.01808117, -0.00969982, -0.01411911, -0.00126563,  0.00655419,
         -0.00856458,  0.03724002, -0.05610376]),
  0.40417084269616227,
  0.1658780890813737),
 1: (array([-1.84010888e-02, -7.76621441e-03, -5.61589822e-03,  9.08148024e-03,
          1.79837971e-02,  3.09565836e-02, -2.17636894e-02,  3.24050757e-02,
         -1.81651647e-02, -1.11639706e-02, -1.44482005e-02, -3.88011725e-03,
         -7.69892072e-04,  1.36602506e-02,  1.08815285e-02,  2.94359856e-03,
          2.05073285e-03, -9.45632946e-03, -2.86849359e-02,  2.62516136e-02,
         -2.35566755e-02, -1.45173304e-02, -1.81897317e-02,  9.36703672e-05,
          6.23974046e-03, -8.19961376e-03,  3.89298235e-02, -5.138

In [39]:
feature_weights_dict = {}
m_values = []
model_score = []
for feat in features:
    feature_weights_dict[feat] = []

for j, _ in enumerate(list(linear_reg_dict.values())):
    for i, feat in enumerate(list(linear_reg_dict.values())[j][0]):
        feature_weights_dict[features[i]].append(feat)
    m_values.append(list(linear_reg_dict.values())[j][1])
    model_score.append(list(linear_reg_dict.values())[j][2])
    
print(feature_weights_dict)    
print(m_values)
print(mode_score)

{'Position_Home': [-0.016913697189088948, -0.018401088757142647, -0.017300479570629777, -0.01642816745722308, -0.015744780277271945, -0.01819213168693474, -0.017205344234898022, -0.017476058814359065, -0.016275906853612403, -0.015059180928335025, -0.013824832361792312, -0.01435325435857346, -0.014329053322870534, -0.013912771613310664, -0.012068327327720398, -0.01324939075980268, -0.010349918619103582, -0.011450993173541636, -0.009078539515320148, -0.008975976832263437, -0.011286953811771997, -0.010138290371134347, -0.01017303078453845, -0.010551635595969304, -0.006294415630353424, -0.009606380791792034, -0.011072640171353424, -0.00845139321191677, -0.010372742360437734, -0.0105089257754317], 'Points_Home': [-0.007778279161564573, -0.00776621440595552, -0.008043386869601767, -0.006000832933057986, -0.007014920218619479, -0.007127676282797306, -0.007960678020577185, -0.0059285108606316, -0.00526347781185622, -0.006730353299644776, -0.004098102247826595, -0.006446075226600803, -0.0040806

NameError: name 'mode_score' is not defined

In [None]:
rounds = [x+1 for x in range(30)]

In [None]:
rounds

In [None]:
feat_df = pd.DataFrame(feature_weights_dict)

In [None]:
feat_df

In [None]:
feat_df["model_score"] = model_score
feat_df["m_value"] = m_values

In [None]:
feat_df

In [None]:
feat_df[feat_df.columns[0]]

In [None]:
%matplotlib inline
fig, axs = plt.subplots(len(feat_df.columns),figsize=(10, 150))
fig.suptitle('Varying Scores when squeezing the data towards Season end')
for i, ax in enumerate(axs):
    ax.plot(rounds, feat_df[feat_df.columns[i]])
    ax.set_title(feat_df.columns[i])
    
plt.show()



In [None]:
features_plus = ['Round','Position_Home', 'Points_Home', 'Total_Wins_Home',
       'Total_Draw_Home', 'Total_Lose_Home', 'Total_Goals_For_Home_Team',
       'Total_Goals_Against_Home_Team', 'Total_Streak_Home', 'Wins_When_Home',
       'Draw_When_Home', 'Lose_When_Home', 'Goals_For_When_Home',
       'Goals_Against_When_Home', 'Position_Away', 'Points_Away',
       'Total_Wins_Away', 'Total_Draw_Away', 'Total_Lose_Away',
       'Total_Goals_For_Away_Team', 'Total_Goals_Against_Away_Team',
       'Total_Streak_Away', 'Wins_When_Away', 'Draw_When_Away',
       'Lose_When_Away', 'Goals_For_When_Away', 'Goals_Against_When_Away',
       'Streak_When_Home', 'Streak_When_Away',]
X = np.array(combined_csv[features_plus])
y = np.array(combined_csv['difference'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)
model = LinearRegression()
model.fit(X_train, y_train)
b = model.intercept_
m = model.coef_
print (m, b, model.score(X_train, y_train))

In [None]:
from sklearn.linear_model import SGDRegressor

features = ['Position_Home', 'Points_Home', 'Total_Wins_Home',
       'Total_Draw_Home', 'Total_Lose_Home', 'Total_Goals_For_Home_Team',
       'Total_Goals_Against_Home_Team', 'Total_Streak_Home', 'Wins_When_Home',
       'Draw_When_Home', 'Lose_When_Home', 'Goals_For_When_Home',
       'Goals_Against_When_Home', 'Position_Away', 'Points_Away',
       'Total_Wins_Away', 'Total_Draw_Away', 'Total_Lose_Away',
       'Total_Goals_For_Away_Team', 'Total_Goals_Against_Away_Team',
       'Total_Streak_Away', 'Wins_When_Away', 'Draw_When_Away',
       'Lose_When_Away', 'Goals_For_When_Away', 'Goals_Against_When_Away',
       'Streak_When_Home', 'Streak_When_Away',]
X = np.array(combined_csv[features])
y = np.array(combined_csv['difference'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)
model = SGDRegressor(max_iter=1000, tol=1e-3)
model.fit(X_train, y_train)
b = model.intercept_
m = model.coef_
print (m, b, model.score(X_train, y_train))