# Imports

In [75]:
import pandas as pd
import requests
import joblib
import pickle

# Model
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Functions

In [71]:
def get_football_data(api_key=None, timeout=30):
    """Fetch match data from the soccer-football-info RapidAPI endpoint.

    Args:
        api_key: RapidAPI key. When omitted it is read from the
            ``RAPIDAPI_KEY`` environment variable, so the secret never has
            to be stored in (or committed with) the notebook.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        RuntimeError: If no API key is passed and ``RAPIDAPI_KEY`` is unset.
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    import os  # local import keeps this cell runnable on its own

    if api_key is None:
        api_key = os.environ.get("RAPIDAPI_KEY")
    if not api_key:
        raise RuntimeError(
            "No RapidAPI key available: pass api_key=... or set the "
            "RAPIDAPI_KEY environment variable."
        )

    url = "https://soccer-football-info.p.rapidapi.com/matches/by/basic/"
    # NOTE(review): "s" is an opaque identifier expected by the API
    # (presumably a team or season id) - confirm against the API docs.
    querystring = {"s": "423f669ed2e3e1bf", "l": "en_US"}
    headers = {
        "X-RapidAPI-Key": api_key,
        "X-RapidAPI-Host": "soccer-football-info.p.rapidapi.com",
    }
    # A timeout prevents the cell from hanging forever on a stalled connection.
    response = requests.get(
        url, headers=headers, params=querystring, timeout=timeout
    )
    # Fail here on an HTTP error instead of handing an error payload to
    # downstream code that expects a "result" key.
    response.raise_for_status()
    return response.json()

def preprocess_data(data):
    """Turn the raw API payload into one modelling row per usable match.

    Each entry of ``data["result"]`` becomes a row described from team A's
    point of view. Matches without a team A possession value or without a
    referee are skipped.

    Args:
        data: Decoded API response; must contain a ``"result"`` list of match
            dicts with ``teamA``/``teamB`` (each holding ``score["f"]``,
            presumably the final score - TODO confirm) and ``referee``.

    Returns:
        pandas.DataFrame with columns ``possession``, ``opponent``,
        ``referee``, ``total_goals`` and ``outcome`` (``"win"``, ``"draw"``
        or ``"lose"``). The frame is empty, with the same columns, when no
        match is usable.

    Raises:
        KeyError: If ``"result"`` or a score/name field of a kept match is
            missing.
    """
    features = ["possession", "opponent", "referee", "total_goals", "outcome"]
    records = []

    for match in data["result"]:
        # `or {}` also covers keys that are present but explicitly null.
        stats_team_a = (match.get("teamA") or {}).get("stats") or {}
        possession = stats_team_a.get("possession")
        referee = match.get("referee")

        # Skips match with missing data
        if possession is None or not referee:
            continue

        # Convert the scores once and compare them numerically: comparing the
        # raw values would order string scores lexicographically ("10" < "9").
        goals_team_a = int(match["teamA"]["score"]["f"])
        goals_team_b = int(match["teamB"]["score"]["f"])

        if goals_team_a > goals_team_b:
            outcome = "win"
        elif goals_team_a == goals_team_b:
            outcome = "draw"
        else:
            outcome = "lose"

        records.append({
            "possession": int(possession),
            "opponent": match["teamB"]["name"],
            "referee": referee["name"],
            "total_goals": goals_team_a + goals_team_b,
            "outcome": outcome,
        })

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every iteration.
    return pd.DataFrame(records, columns=features)

def train_model(processed_data, random_state=42):
    """Fit a one-hot + random-forest pipeline that predicts the match outcome.

    Args:
        processed_data: DataFrame containing an ``outcome`` column (the
            target) plus the feature columns; ``opponent`` and ``referee``
            must be present because they are one-hot encoded.
        random_state: Seed forwarded to ``RandomForestClassifier`` so that
            repeated runs produce the same model. Pass ``None`` for the
            previous unseeded behaviour.

    Returns:
        The fitted ``sklearn.pipeline.Pipeline``.
    """
    X = processed_data.drop("outcome", axis=1)
    y = processed_data["outcome"]

    # Only the categorical columns need a transformer; every other column
    # (possession, total_goals, ...) is forwarded untouched by
    # remainder="passthrough".
    categorical_features = ["opponent", "referee"]

    # handle_unknown="ignore" lets predict() accept opponents/referees that
    # were not seen during fitting instead of raising.
    categorical_transformer = OneHotEncoder(handle_unknown="ignore")

    # Create a preprocessor that applies transformers to different feature sets
    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", categorical_transformer, categorical_features),
        ],
        remainder="passthrough"
    )

    # Create a pipeline that applies the preprocessor and then fits the model
    model = Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(random_state=random_state))
    ])

    # Fit the model
    model.fit(X, y)

    return model

### Data collection

In [35]:
# Download the raw match payload from the API (this performs a network request).
data = get_football_data()

### Data preprocessing

In [66]:
import warnings
# NOTE(review): this silences every FutureWarning for the rest of the kernel
# session, not only those raised during preprocessing - consider narrowing it
# or fixing the deprecated call that triggers the warning.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Build the modelling table from the raw API payload.
processed_data = preprocess_data(data)

In [98]:
# Display the preprocessed match table.
processed_data

Unnamed: 0,possession,opponent,referee,total_goals,outcome
0,59,Crystal Palace,Andre Marriner,2,win
1,64,Burnley,Jon Moss,3,win
2,63,Watford,Andre Marriner,3,lose
3,49,Hull,Mark Clattenburg,2,win
4,42,Bayern Munich,Anastasios Sidiropoulos,6,lose
5,66,Lincoln City,Anthony Taylor,5,win
6,46,Man City,Andre Marriner,4,draw
7,60,West Ham,Martin Atkinson,3,win
8,68,Leicester,Mike Jones,1,win
9,50,Man Utd,Andre Marriner,2,win


In [97]:
# Show the target labels. `y` only exists as a local inside train_model, so
# read the column straight from the frame to keep Restart & Run All working.
processed_data["outcome"]

0      win
1      win
2     lose
3      win
4     lose
5      win
6     draw
7      win
8      win
9      win
10     win
11     win
12     win
13     win
14     win
15     win
16     win
17     win
18     win
19     win
20    draw
21     win
22     win
Name: outcome, dtype: object

### Model training

In [76]:
# Fit the pipeline on every preprocessed match (no rows are held out here).
trained_model = train_model(processed_data)

In [102]:
# Save the trained model
# The same pipeline is written twice: once with joblib, once with plain pickle.
joblib.dump(trained_model, "trained_model.joblib", protocol=4)
# Use a context manager so the file handle is flushed and closed even if
# pickling raises; a bare open(...) left the handle to the garbage collector.
with open('trained_model.sav', 'wb') as model_file:
    pickle.dump(trained_model, model_file, protocol=4)

### Model test

In [90]:
# Recreate the hold-out split so X_test / y_test exist on a fresh kernel.
# NOTE(review): test_size and random_state are assumptions - the cell that
# originally built X_test is not visible here. Also, trained_model was fit on
# all rows of processed_data, so these rows were seen during training and the
# predictions below are not a true out-of-sample evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    processed_data.drop("outcome", axis=1),
    processed_data["outcome"],
    test_size=0.2,
    random_state=42,
)
trained_model.predict(X_test)

array(['win', 'win', 'win', 'win', 'win'], dtype=object)

In [93]:
# Outcome labels to compare with the predictions above - presumably the rows
# matching X_test; verify where y_test is created.
y_test

15    win
9     win
0     win
8     win
17    win
Name: outcome, dtype: object