In [109]:
import os
import pandas as pd
import numpy as np
import sqlite3
import matplotlib.pyplot as plot
from pathlib import Path
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier

Open 'laliga.sqlite' data file.

In [110]:
dir = Path.cwd()
laliga_path = dir.parent / "laliga.sqlite"
if laliga_path.exists() == False:
        raise FileNotFoundError(
            f"Not found 'laliga.sqlite' neither in {laliga_path} nor in {parent_path}"
        )
    
try:
     conn = sqlite3.connect(laliga_path)
except sqlite3.OperationalError as e:
    print(f"Could not connect to data base 'laliga.sqlite'.")

In [111]:
df = pd.read_sql("SELECT * FROM Matches;", conn)
df

Unnamed: 0,season,division,matchday,date,time,home_team,away_team,score
0,1928-1929,1,1,2/10/29,,Arenas Club,Athletic Madrid,2:3
1,1928-1929,1,1,2/10/29,,Espanyol,Real Unión,3:2
2,1928-1929,1,1,2/10/29,,Real Madrid,Catalunya,5:0
3,1928-1929,1,1,2/10/29,,Donostia,Athletic,1:1
4,1928-1929,1,1,2/12/29,,Racing,Barcelona,0:2
...,...,...,...,...,...,...,...,...
48775,2021-2022,2,42,5/29/22,,Real Oviedo,UD Ibiza,
48776,2021-2022,2,42,5/29/22,,Real Sociedad B,Real Zaragoza,
48777,2021-2022,2,42,5/29/22,,Sporting Gijón,UD Las Palmas,
48778,2021-2022,2,42,5/29/22,,CD Tenerife,FC Cartagena,


We clean the data det by removing NaN scores and define *home_goals* and *away_goals*.

In [112]:
matches = df.copy()
valid_matches = matches[matches['score'].notna() & matches['score'].str.contains(':')].copy()
valid_matches[['home_goals', 'away_goals']] = (valid_matches['score'].str.split(':', expand=True).astype(int))

In [113]:
def get_result(row):
    if row['home_goals'] > row['away_goals']:
        return '1'
    elif row['home_goals'] < row['away_goals']:
        return '2'
    else:
        return 'X' 

In [114]:
valid_matches['result'] = valid_matches.apply(get_result, axis=1)

In [115]:
def generate_date(row):
    date = row['date'].split("/")
    month = date[0]
    day = date[1]
    year = date[2]
    season = row['season'].split("-")[0]
    year = season[0:2] + year
    return f"{month}/{day}/{year}"

In [116]:
valid_matches["date"] = valid_matches.apply(generate_date, axis=1)

In [117]:
valid_matches[["month", "day", "year"]] = valid_matches["date"].str.split("/", expand=True).astype(int)

In [118]:
valid_matches = valid_matches.drop(columns=["season", "date", "score", "time", "home_goals", "away_goals"])

In [119]:
teams = pd.unique(valid_matches[["home_team", "away_team"]].values.ravel())

In [120]:
encoder = OneHotEncoder(handle_unknown="ignore", categories=[teams, teams])

In [121]:
encoded = encoder.fit_transform(valid_matches[["home_team", "away_team"]])

In [122]:
encoded_df = pd.DataFrame(encoded.toarray(), columns=encoder.get_feature_names_out(["home_team", "away_team"]))

In [123]:
valid_matches = valid_matches.drop(columns=["home_team", "away_team"])

In [124]:
df_train = pd.concat([encoded_df, valid_matches], axis=1)
df_train = df_train.loc[df_train["year"] < 2021]

In [125]:
y_train = df_train["result"]
X_train = df_train.drop("result", axis=1)

In [126]:
gradient_boosting = GradientBoostingClassifier()

In [127]:
gradient_boosting.fit(X_train, y_train)

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [128]:
df_test = pd.concat([encoded_df, valid_matches], axis=1)
df_test = df_test.loc[df_test["year"] >= 2021]

In [130]:
y_test = df_test["result"]
X_test = df_test.drop("result", axis=1)

In [131]:
y_pred = gradient_boosting.predict(X_test)

ValueError: Input X contains NaN.
GradientBoostingClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values