In [291]:
# IMPORT DATA
# Read the dataset from a CSV file; the path is relative to the notebook's
# working directory.

import pandas as pd

data = pd.read_csv("data/dataset.csv")

In [292]:
# SELECT COLUMNS
# Keep only the two team columns, the FTHG/FTAG columns and the FTR result
# label, then preview the first five rows.

columns_needed = ["HomeTeam", "AwayTeam", "FTHG", "FTAG", "FTR"]
data = data.loc[:, columns_needed]
data.head(5)

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR
0,Volendam,Vitesse,1,2,A
1,PSV Eindhoven,Utrecht,2,0,H
2,Heerenveen,Waalwijk,3,1,H
3,Ajax,Heracles,4,1,H
4,Zwolle,Sparta Rotterdam,1,2,A


In [293]:
# Relative frequency of each FTR value over the whole dataset (class balance).
data["FTR"].value_counts().div(len(data))

FTR
H    0.453578
A    0.317698
D    0.228723
Name: count, dtype: float64

In [294]:
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified 80/20 split on the FTR column so that both parts keep the same
# result proportions; the fixed random_state makes the split reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields *positional* row indices, so rows are selected with .iloc.
# A label-based .loc lookup only gives the same rows while the frame still has
# the default 0..n-1 index, and silently breaks after any filtering/reindexing.
# With n_splits=1 there is exactly one (train, test) pair, hence next().
train_index, test_index = next(split.split(data, data["FTR"]))
train_set = data.iloc[train_index]
test_set = data.iloc[test_index]

print(len(train_set), len(test_set))

1654 414


In [295]:
# Relative frequency of each FTR value in the test split, to compare against
# the proportions of the full dataset.
test_set["FTR"].value_counts().div(len(test_set))

FTR
H    0.454106
A    0.316425
D    0.229469
Name: count, dtype: float64

In [296]:
# From here on `data` is a copy of the training split.
data = train_set.copy()

# Targets are kept as single-column DataFrames (2-D): selecting with a list of
# column names returns a frame rather than a Series.
data_labels = train_set.loc[:, ["FTR"]]
# 1-D alternative (unused): data_labels = data["FTR"].to_numpy()

test_labels = test_set.loc[:, ["FTR"]]
# 1-D alternative (unused): test_labels = test_set["FTR"].to_numpy()

In [297]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# One-hot encode the FTR targets.
# The encoder is fitted on the training labels only; the test labels are then
# encoded with that same fitted mapping via transform(). Calling fit_transform()
# on the test labels would re-fit the encoder on test data, overwriting the
# training fit and risking a different column layout if the test split ever
# lacked one of the classes.
cat_encoder = OneHotEncoder()
data_labels = cat_encoder.fit_transform(data_labels)
test_labels = cat_encoder.transform(test_labels)

In [298]:
# Remove the FTHG/FTAG columns and the FTR label from both splits so that only
# the feature columns remain.
test_set = test_set.drop(columns=["FTHG", "FTAG", "FTR"])
train_set = train_set.drop(columns=["FTHG", "FTAG", "FTR"])

In [303]:
from sklearn.compose import ColumnTransformer

# Feature preparation: one-hot encode the two team columns. Columns that are
# not listed are dropped by ColumnTransformer's default `remainder` setting, so
# only the encoded team indicators end up in `data_prepared`.
cat_attribs = ["HomeTeam", "AwayTeam"]
full_pipeline = ColumnTransformer(
    transformers=[("cat", OneHotEncoder(), cat_attribs)],
)

data_prepared = full_pipeline.fit_transform(data)

In [304]:
# Convert the encoded features and targets to dense NumPy arrays.
# The hasattr guard keeps this cell safe to re-run: once converted, the value
# is an ndarray, which has no .toarray(), so an unguarded call would raise
# AttributeError on a second execution.
if hasattr(data_prepared, "toarray"):
    data_prepared = data_prepared.toarray()
if hasattr(data_labels, "toarray"):
    data_labels = data_labels.toarray()

In [305]:
from sklearn.linear_model import LinearRegression

# Baseline model: linear regression from the one-hot team features onto the
# one-hot encoded result columns.
lin_reg = LinearRegression()
lin_reg.fit(X=data_prepared, y=data_labels)


In [306]:
# Spot check: take the first five training rows (raw, labels, and prepared
# features) and show what the linear model predicts for them.
some_data = data.iloc[:5]
some_labels = data_labels[:5]
some_data_prepared = data_prepared[:5]

# Label spelling corrected ("Preditions" -> "Predictions").
print("Predictions:", lin_reg.predict(some_data_prepared))

Preditions: [[0.36328125 0.18359375 0.48828125]
 [0.203125   0.3125     0.4765625 ]
 [0.5625     0.31835938 0.125     ]
 [0.55859375 0.11328125 0.34765625]
 [0.3046875  0.21289062 0.484375  ]]


In [308]:
# Show the encoded targets of the same five rows, one array per row.
print("Labels:", [row for row in some_labels])

Labels: [array([0., 0., 1.]), array([1., 0., 0.]), array([1., 0., 0.]), array([1., 0., 0.]), array([1., 0., 0.])]


In [309]:
from sklearn.metrics import mean_squared_error

# RMSE of the linear model on the training data itself (the same rows it was
# fitted on, so this is not a generalisation estimate).
data_predictions = lin_reg.predict(X=data_prepared)
lin_mse = mean_squared_error(y_true=data_labels, y_pred=data_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

np.float64(0.42568402736452604)

In [310]:
from sklearn.tree import DecisionTreeRegressor

# Second model: a decision tree regressor on the same features/targets.
# random_state is fixed (same seed as the train/test split) because the tree
# permutes features at every split; without a seed, ties between equally good
# splits can be broken differently from run to run, making results
# non-reproducible.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(data_prepared, data_labels)

In [311]:
# RMSE of the decision tree on the training data itself (the same rows it was
# fitted on).
data_predictions_tree = tree_reg.predict(X=data_prepared)
tree_mse = mean_squared_error(y_true=data_labels, y_pred=data_predictions_tree)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

np.float64(0.35002268871120074)

In [312]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the decision tree. The scorer reports *negated*
# MSE (so that higher is better), hence the sign flip before the square root.
scores = cross_val_score(
    estimator=tree_reg,
    X=data_prepared,
    y=data_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
tree_rmse_scores = np.sqrt(np.negative(scores))

In [313]:
def display_scores(scores):
    """Print a short summary of a collection of scores.

    Writes three lines to stdout: the scores themselves, their mean, and their
    standard deviation.

    Args:
        scores: An object exposing ``mean()`` and ``std()`` methods, e.g. a
            NumPy array of per-fold scores.
    """
    print("Scores:", scores)
    average = scores.mean()
    print("Mean:", average)
    spread = scores.std()
    print("Standard Deviation:", spread)

# Summarise the per-fold RMSE values obtained from cross-validating the tree.
display_scores(tree_rmse_scores)

Scores: [0.5522024  0.54085538 0.5285934  0.55607472 0.51746815 0.51198651
 0.51905119 0.55422264 0.50806735 0.55667463]
Mean: 0.5345196383894197
Standard Deviation: 0.01862553837020216
