# 1.0 Library install and import

In [None]:
!pip install lazypredict
!pip install numpy
!pip install pandas
!pip install sklearn

In [2]:
import pandas as pd
import numpy as np
import numpy.ma
import os 

from google.colab import drive
from google.colab import files
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


##1.1 Google drive file creation

In [3]:
sport = 'Soccer'
file_name_1 = "/content/gdrive/MyDrive/Colab/" + sport + '/'
file_name_2 = '/content/gdrive/MyDrive/Colab/' + sport + '/Training/'
file_name_3 = '/content/gdrive/MyDrive/Colab/' + sport + '/Predictions/'
!mkdir $file_name_1
!mkdir $file_name_2
!mkdir $file_name_3

mkdir: cannot create directory ‘/content/gdrive/MyDrive/Colab/Soccer/’: File exists
mkdir: cannot create directory ‘/content/gdrive/MyDrive/Colab/Soccer/Training/’: File exists
mkdir: cannot create directory ‘/content/gdrive/MyDrive/Colab/Soccer/Predictions/’: File exists


# 2.0 Data cleaning and processing

## 2.2 Loading the data

In [5]:
selection = ['Home Team', 'Away Team', 'Result']

df_1 = pd.read_csv('/content/champions-league-2020-UTC.csv')
df_2 = pd.read_csv('/content/champions-league-2021-UTC.csv')
df_3 = pd.read_csv('/content/champions-league-2022-UTC.csv')

df_1 = df_1[selection]
df_2 = df_2[selection]
df_3 = df_3[selection]

## 2.3 Data cleaning

### 2.3.1 Joining 2020, 2021, 2022 Data cleaning

In [6]:
df_4 = df_1.append(df_2)
df_4 = df_4.append(df_3)
df_4 = df_4.dropna()
df_4.head()

Unnamed: 0,Home Team,Away Team,Result
0,Zenit,Club Brugge,1 - 2
1,Dynamo Kyiv,Juventus,0 - 2
2,Rennes,Krasnodar,1 - 1
3,Chelsea,Sevilla,0 - 0
4,Lazio,Dortmund,3 - 1


In [7]:
df_5 = df_4.rename(columns={"Home Team": "Home", "Away Team": "Away"})
df_5 = df_5.reset_index(drop=True)
df_5.head()

Unnamed: 0,Home,Away,Result
0,Zenit,Club Brugge,1 - 2
1,Dynamo Kyiv,Juventus,0 - 2
2,Rennes,Krasnodar,1 - 1
3,Chelsea,Sevilla,0 - 0
4,Lazio,Dortmund,3 - 1


### 2.3.2 Team - index categorization

In [8]:
selection_1 = ['Home', 'Away']
df_6 = df_5[selection_1]
df_6

Unnamed: 0,Home,Away
0,Zenit,Club Brugge
1,Dynamo Kyiv,Juventus
2,Rennes,Krasnodar
3,Chelsea,Sevilla
4,Lazio,Dortmund
...,...,...
341,Milan,Salzburg
342,Man City,Sevilla
343,Copenhagen,Dortmund
344,Juventus,Paris


In [33]:
df_7 = df_6['Home'].str.replace('\W', '', regex=True)
df_8 = df_6['Away'].str.replace('\W', '', regex=True)
df_9 = df_7.to_frame()
df_10 = df_8.to_frame()

df_10 = pd.concat([df_9, df_10], axis=1).reindex(df_8.index)
df_10.head()

Unnamed: 0,Home,Away
0,Zenit,ClubBrugge
1,DynamoKyiv,Juventus
2,Rennes,Krasnodar
3,Chelsea,Sevilla
4,Lazio,Dortmund


In [34]:
df_11 = pd.DataFrame(df_10, columns = ['Home'])
df_11 = df_11.squeeze()
df_11.head()

df_12 = pd.DataFrame(df_10, columns = ['Away'])
df_12 = df_12.squeeze()
df_12.head()

0    ClubBrugge
1      Juventus
2     Krasnodar
3       Sevilla
4      Dortmund
Name: Away, dtype: object

In [35]:
df_13 = df_10
df_13 = df_13.index
df_13

RangeIndex(start=0, stop=346, step=1)

In [36]:
df_14 = df_10["Home"].unique()
df_14

array(['Zenit', 'DynamoKyiv', 'Rennes', 'Chelsea', 'Lazio', 'Barcelona',
       'Paris', 'Leipzig', 'Salzburg', 'RealMadrid', 'Bayern',
       'Internazionale', 'Olympiacos', 'ManCity', 'Ajax', 'Midtjylland',
       'LokomotivMoskva', 'ShakhtarDonetsk', 'Atlético',
       'Mönchengladbach', 'Porto', 'Marseille', 'Atalanta', 'Liverpool',
       'Krasnodar', 'IstanbulBasaksehir', 'Sevilla', 'ClubBrugge',
       'Dortmund', 'Ferencváros', 'Juventus', 'ManUnited', 'YoungBoys',
       'Villarreal', 'LOSC', 'Malmö', 'Besiktas', 'Sheriff', 'SportingCP',
       'Inter', 'Milan', 'Benfica', 'Wolfsburg', 'DinamoZagreb', 'Celtic',
       'Frankfurt', 'Napoli', 'Tottenham', 'Plzen', 'Leverkusen',
       'Rangers', 'Copenhagen', 'MHaifa'], dtype=object)

In [37]:
df_15 = df_10["Away"].unique()
df_15

array(['ClubBrugge', 'Juventus', 'Krasnodar', 'Sevilla', 'Dortmund',
       'Ferencváros', 'ManUnited', 'IstanbulBasaksehir',
       'LokomotivMoskva', 'ShakhtarDonetsk', 'Atlético',
       'Mönchengladbach', 'Marseille', 'Porto', 'Liverpool', 'Atalanta',
       'Bayern', 'Internazionale', 'Salzburg', 'RealMadrid', 'Olympiacos',
       'ManCity', 'Ajax', 'Midtjylland', 'Chelsea', 'Paris', 'Rennes',
       'Lazio', 'Zenit', 'DynamoKyiv', 'Barcelona', 'Leipzig', 'Benfica',
       'Wolfsburg', 'Milan', 'Besiktas', 'Inter', 'SportingCP', 'Sheriff',
       'YoungBoys', 'Malmö', 'Villarreal', 'LOSC', 'Copenhagen', 'MHaifa',
       'Rangers', 'Leverkusen', 'Plzen', 'Tottenham', 'Frankfurt',
       'DinamoZagreb', 'Celtic', 'Napoli'], dtype=object)

In [None]:
 df_16 = numpy.ma.concatenate([df_14, df_15])
 df_16

In [None]:
df_17 = pd.DataFrame(numpy.ma.filled(df_16))
df_18 = df_17.drop_duplicates()
df_18['Index'] = df_18.index
df_18.columns.values[0] = "Team"
df_18.to_csv("Team-index.csv", index= False)
df_18.head()

In [40]:
df_19 = pd.Series(df_18.Team)
print(df_19.head())

0         Zenit
1    DynamoKyiv
2        Rennes
3       Chelsea
4         Lazio
Name: Team, dtype: object


In [41]:
df_20 = pd.DataFrame(df_18).to_numpy()
df_21 = pd.DataFrame(df_18['Team']).to_numpy()
df_22 = pd.DataFrame(df_18['Index']).to_numpy()

### 2.3.3 Categorization

In [None]:
for n in df_13:
  for i in df_22:
      if df_20[i, 0] == df_11[n]:
        df_11[n] = df_20[i, 1]
        print(df_20[i, 1])

for n in df_13:
  for i in df_22:
      if df_20[i, 0] == df_12[n]:
        df_12[n] = df_20[i, 1]
        print(df_20[i, 1])

In [44]:
df_11.head()

0    [0]
1    [1]
2    [2]
3    [3]
4    [4]
Name: Home, dtype: object

In [45]:
df_12.head()

0    [27]
1    [30]
2    [24]
3    [26]
4    [28]
Name: Away, dtype: object

In [46]:
df_11 = pd.DataFrame(df_11, columns = ['Home'])
df_12 = pd.DataFrame(df_12, columns = ['Away'])

In [47]:
df_11.to_csv('Home.csv')
df_12.to_csv('Away.csv')

In [53]:
df_22 = df_11.set_index(df_11.index).join(df_12.set_index(df_12.index))

df_22 = df_22.astype(str)
df_22 = df_22.applymap(lambda x: float(pd.Series(x).str.extract(r'\[(\d+)\]')[0]))

df_22

Unnamed: 0,Home,Away
0,0.00,27.00
1,1.00,30.00
2,2.00,24.00
3,3.00,26.00
4,4.00,28.00
...,...,...
341,40.00,8.00
342,13.00,26.00
343,51.00,28.00
344,30.00,6.00


## 2.4 Score preparation

In [54]:
df_23 = df_5
df_23 = df_23["Result"].str.replace('\W', ' ', regex=True)
df_23 = pd.DataFrame(df_23)
df_23 = df_23['Result'].str.split(' ', expand=True)
df_23

Unnamed: 0,0,1,2,3
0,1,,,2
1,0,,,2
2,1,,,1
3,0,,,0
4,3,,,1
...,...,...,...,...
341,4,,,0
342,3,,,1
343,1,,,1
344,1,,,2


In [56]:
df_23

Unnamed: 0,0,1,2,3
0,1,,,2
1,0,,,2
2,1,,,1
3,0,,,0
4,3,,,1
...,...,...,...,...
341,4,,,0
342,3,,,1
343,1,,,1
344,1,,,2


In [57]:
df_24 = df_23.drop(df_23.columns[[1, 2]],axis = 1)
mapping = {df_24.columns[0]: 'Home Goals', df_24.columns[1]: 'Away Goles'}
df_24 = df_24.rename(columns=mapping)
df_24 = pd.DataFrame(df_24)
df_24.to_csv("Goles.csv", index= False)
print(df_24) 

    Home Goals Away Goles
0            1          2
1            0          2
2            1          1
3            0          0
4            3          1
..         ...        ...
341          4          0
342          3          1
343          1          1
344          1          2
345          1          6

[346 rows x 2 columns]


In [61]:
df_25 = pd.read_csv("Goles.csv")

In [63]:
# create a list of our conditions

conditions = [
    (df_25['Home Goals'] > df_25['Away Goles']),
    (df_25['Home Goals'] == df_25['Away Goles']),
    (df_25['Home Goals'] < df_25['Away Goles'])
]
     
# create a list of the values we want to assign for each condition
values = ['1', '2', '3']

# create a new column and use np.select to assign values to it using our lists as arguments
df_25['Outcome'] = np.select(conditions, values)

# display updated DataFrame
df_25 = pd.DataFrame(df_25)
df_25.to_csv("Goles-Outcome.csv", index= False)
df_25.head()

Unnamed: 0,Home Goals,Away Goles,Outcome
0,1,2,3
1,0,2,3
2,1,1,2
3,0,0,2
4,3,1,1


## Training matriz creation

In [64]:
df_26 = pd.read_csv("Goles-Outcome.csv")
df_26.head()

Unnamed: 0,Home Goals,Away Goles,Outcome
0,1,2,3
1,0,2,3
2,1,1,2
3,0,0,2
4,3,1,1


In [66]:
df_27 = pd.concat([df_22, df_26["Outcome"]], axis=1)
df_27.to_csv("All.csv")

In [67]:
XY = pd.read_csv('/content/All.csv')
XY = XY.astype('float')

In [68]:
X = XY[{'Home','Away'}]
y = XY['Outcome']
y = pd.DataFrame(y).to_numpy()
X = pd.DataFrame(X).to_numpy()

#3.0 Training

##  3.1 Train-test splitting

In [72]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=.5,random_state =123)
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)

In [73]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((173, 2), (173, 2), (173, 1), (173, 1))

## 3.1 Model Evaluation

In [74]:
models,predictions = clf.fit(X_train, X_test, y_train, y_test)
models

100%|██████████| 29/29 [00:01<00:00, 26.12it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CalibratedClassifierCV,0.51,0.41,,0.44,0.08
LinearDiscriminantAnalysis,0.49,0.4,,0.43,0.02
RidgeClassifierCV,0.49,0.4,,0.43,0.03
LogisticRegression,0.49,0.4,,0.43,0.03
AdaBoostClassifier,0.47,0.39,,0.44,0.12
QuadraticDiscriminantAnalysis,0.49,0.39,,0.42,0.01
RidgeClassifier,0.49,0.39,,0.42,0.02
LinearSVC,0.49,0.39,,0.42,0.02
GaussianNB,0.49,0.39,,0.42,0.01
SGDClassifier,0.38,0.39,,0.32,0.02


In [75]:
import seaborn as sns
sns.set(style='ticks')
import matplotlib.pyplot as plt

In [76]:
from sklearn.metrics import precision_score
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.svm import SVR
from sklearn.utils import shuffle
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier(objective='multiclass', random_state=5)

lgbm.fit(X_train, y_train)

y_pred = lgbm.predict(X_test)
r2 = lgbm.score(X_test, y_test)
precision = precision_score(y_test, y_pred, average="weighted")
print("Precision score:", precision)

Precision score: 0.38198096397062015


In [79]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
calibrated_clf = CalibratedClassifierCV(clf, cv=5)

calibrated_clf.fit(X_train, y_train)
probas = calibrated_clf.predict_proba(X_test)
probas

array([[0.38657682, 0.19704968, 0.41637349],
       [0.50353788, 0.19048463, 0.30597748],
       [0.45682966, 0.20524275, 0.33792759],
       [0.51503323, 0.196979  , 0.28798777],
       [0.4217771 , 0.19464791, 0.38357499],
       [0.35359711, 0.20367866, 0.44272423],
       [0.38241197, 0.19838258, 0.41920545],
       [0.51941944, 0.19888204, 0.28169852],
       [0.50837206, 0.19436437, 0.29726357],
       [0.35466538, 0.20470555, 0.44062907],
       [0.46578543, 0.19682284, 0.33739173],
       [0.38484421, 0.19728574, 0.41787006],
       [0.45002989, 0.19348202, 0.35648809],
       [0.46549073, 0.1976554 , 0.33685386],
       [0.3701579 , 0.19994615, 0.42989595],
       [0.33609618, 0.20881409, 0.45508974],
       [0.52336084, 0.19871071, 0.27792846],
       [0.51647692, 0.20790432, 0.27561876],
       [0.51240464, 0.20461969, 0.28297567],
       [0.41703734, 0.1936483 , 0.38931436],
       [0.46477372, 0.1994056 , 0.33582068],
       [0.5143185 , 0.20456528, 0.28111622],
       [0.

In [80]:
X = np.array([9., 23.])
y_pred = calibrated_clf.predict_proba(X.reshape(1, -1))  # pass the input as a 2D array
print(y_pred)


[[0.47980581 0.19904277 0.32115142]]


In [81]:
svr_rbf = SVR(kernel="rbf", C=100, gamma=0.1, epsilon=0.1)
svr_lin = SVR(kernel="linear", C=100, gamma="auto")
svr_poly = SVR(kernel="poly", C=100, gamma="auto", degree=3, epsilon=0.1, coef0=1)
lw = 2

svrs = [svr_rbf, svr_lin, svr_poly]
kernel_label = ["RBF", "Linear", "Polynomial"]
model_color = ["m", "c", "g"]
svr_rbf.fit(X_train, y_train).predict(X_test)
r2 = svr_rbf.score(X_test, y_test)
Y_pred = svr_rbf.predict(X_test)
r2

-1.3617318827020246