# First Model for Predicting National Soccer Results

## 1. Importing relevant Packages

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

## 2. Loading the data from the csv-file

In [7]:
# Read the match results from the CSV file; the location is given relative to this notebook
path = "../../../data/results.csv"
data = pd.read_csv(filepath_or_buffer=path)

## 3. Checking the dataset

In [8]:
# Preview of the raw data: the first five matches
data.head(n=5)

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False


In [11]:
# Shape of the dataset as a (number of rows, number of columns) tuple
data.shape

(45360, 9)

In [14]:
# Showing some more info about the dataset: column names, non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45360 entries, 0 to 45359
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   date        45360 non-null  object
 1   home_team   45360 non-null  object
 2   away_team   45360 non-null  object
 3   home_score  45360 non-null  int64 
 4   away_score  45360 non-null  int64 
 5   tournament  45360 non-null  object
 6   city        45360 non-null  object
 7   country     45360 non-null  object
 8   neutral     45360 non-null  bool  
dtypes: bool(1), int64(2), object(6)
memory usage: 2.8+ MB


In [16]:
# Counting how many matches each team played as the home team (most frequent teams first)
data['home_team'].value_counts()

home_team
Brazil               600
Argentina            583
Mexico               568
Germany              534
England              530
                    ... 
Aymara                 1
Western Australia      1
Sark                   1
Ticino                 1
Hmong                  1
Name: count, Length: 312, dtype: int64

## 4. Cleaning the data

In [21]:
# Checking for duplicates: number of rows that are an exact copy of an earlier row
data.duplicated().sum()

0

In [22]:
# Share of missing values per column (0.0 = nothing missing), columns with the most gaps first
missing_per_column = data.isnull().sum().sort_values(ascending=False)
missing_per_column / len(data)

date          0.0
home_team     0.0
away_team     0.0
home_score    0.0
away_score    0.0
tournament    0.0
city          0.0
country       0.0
neutral       0.0
result        0.0
dtype: float64

## 5. Preprocessing the data

### 5.1 New column "result"

In [19]:
# Deriving the match outcome from the score, always seen from the home team's perspective:
# more home goals -> "Win", fewer home goals -> "Lose", equal goals -> "Draw"
home_goals = data['home_score']
away_goals = data['away_score']
data.loc[home_goals > away_goals, 'result'] = 'Win'
data.loc[home_goals < away_goals, 'result'] = 'Lose'
data.loc[home_goals == away_goals, 'result'] = 'Draw'

### 5.2 New column "friendly"

In [32]:
# Showing the tournaments and how many matches belong to each of them (most frequent first)
data['tournament'].value_counts()

tournament
Friendly                                17786
FIFA World Cup qualification             8013
UEFA Euro qualification                  2815
African Cup of Nations qualification     2116
FIFA World Cup                            964
                                        ...  
Évence Coppée Trophy                        1
FIFA 75th Anniversary Cup                   1
Real Madrid 75th Anniversary Cup            1
Copa Confraternidad                         1
The Other Final                             1
Name: count, Length: 152, dtype: int64

In [33]:
#Creating a new column "friendly" to show if it's a friendly game (1) or not (0).
#The comparison yields one boolean per match, which is cast directly to an integer.
#The width is spelled out as int64 (instead of the platform-dependent 'int') so that
#dtype-based column selection further down treats the column the same on every platform.
data['friendly'] = (data['tournament'] == 'Friendly').astype('int64')

### 5.3 New column "neutral_encoded"

In [34]:
#Transforming True into 1 and False into 0.
#"neutral" is a boolean column, so a direct cast is enough. Unlike get_dummies(..., drop_first=True)
#it always yields exactly one column, even if only one of the two values occurs in the data.
data['neutral_encoded'] = data['neutral'].astype('int64')

In [36]:
# First five rows again, now including the new columns result, friendly and neutral_encoded
data.head(n=5)

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,result,friendly,neutral_encoded
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False,Draw,1,0
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False,Win,1,0
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False,Win,1,0
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False,Draw,1,0
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False,Win,1,0


### 5.4 Instantiating the Features and the Target

In [38]:
# The target y is the match outcome ("Win", "Lose" or "Draw")
y = data.loc[:, 'result']

In [39]:
# The features X are what remains after removing the target, the scores (they determine the
# target directly) and the columns that are not used by this first model:
# home_team, away_team, friendly, neutral_encoded
unused_columns = ['result', 'neutral', 'city', 'country', 'tournament', 'date', 'home_score', 'away_score']
X = data.drop(unused_columns, axis='columns')

### 5.5 One-Hot-Encoder for the categorical Features

In [41]:
# Picking the object-typed (text) columns of X; these are the ones that need to be encoded
X_categorical = X.select_dtypes(include='object')

In [42]:
#Saving the numerical Features in another variable so that X itself stays untouched.
#'number' matches every integer and float width, so a feature is not silently lost when it
#was stored as int32 instead of int64; object (text) columns are still excluded.
X_numerical = X.select_dtypes(include = 'number')

In [43]:
# Instantiating the OHE without a min_frequency (no grouping of rare categories is requested).
# sparse_output = False asks for a dense array instead of a sparse matrix
ohe = OneHotEncoder(sparse_output = False)

In [44]:
# Fitting it to the categorical features.
# NOTE: the encoder is fitted on all rows, i.e. before the train/test split in section 6
ohe.fit(X_categorical)

In [45]:
# Printing the different categories (team names) detected by the OHE; the result is a list of arrays
display(ohe.categories_)

[array(['Abkhazia', 'Afghanistan', 'Albania', 'Alderney', 'Algeria',
        'American Samoa', 'Andalusia', 'Andorra', 'Angola', 'Anguilla',
        'Antigua and Barbuda', 'Arameans Suryoye', 'Argentina', 'Armenia',
        'Artsakh', 'Aruba', 'Australia', 'Austria', 'Aymara', 'Azerbaijan',
        'Bahamas', 'Bahrain', 'Bangladesh', 'Barawa', 'Barbados',
        'Basque Country', 'Belarus', 'Belgium', 'Belize', 'Benin',
        'Bermuda', 'Bhutan', 'Biafra', 'Bolivia', 'Bonaire',
        'Bosnia and Herzegovina', 'Botswana', 'Brazil',
        'British Virgin Islands', 'Brittany', 'Brunei', 'Bulgaria',
        'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada',
        'Canary Islands', 'Cape Verde', 'Cascadia', 'Catalonia',
        'Cayman Islands', 'Central African Republic', 'Central Spain',
        'Chad', 'Chagos Islands', 'Chameria', 'Chile', 'China PR',
        'Colombia', 'Comoros', 'Congo', 'Cook Islands', 'Corsica',
        'Costa Rica', 'County of Nice', 'Croatia', 

In [46]:
# Retrieving the names of the generated columns; each name has the form "<feature>_<category>",
# e.g. "home_team_Brazil"
display(ohe.get_feature_names_out())

array(['home_team_Abkhazia', 'home_team_Afghanistan', 'home_team_Albania',
       'home_team_Alderney', 'home_team_Algeria',
       'home_team_American Samoa', 'home_team_Andalusia',
       'home_team_Andorra', 'home_team_Angola', 'home_team_Anguilla',
       'home_team_Antigua and Barbuda', 'home_team_Arameans Suryoye',
       'home_team_Argentina', 'home_team_Armenia', 'home_team_Artsakh',
       'home_team_Aruba', 'home_team_Australia', 'home_team_Austria',
       'home_team_Aymara', 'home_team_Azerbaijan', 'home_team_Bahamas',
       'home_team_Bahrain', 'home_team_Bangladesh', 'home_team_Barawa',
       'home_team_Barbados', 'home_team_Basque Country',
       'home_team_Belarus', 'home_team_Belgium', 'home_team_Belize',
       'home_team_Benin', 'home_team_Bermuda', 'home_team_Bhutan',
       'home_team_Biafra', 'home_team_Bolivia', 'home_team_Bonaire',
       'home_team_Bosnia and Herzegovina', 'home_team_Botswana',
       'home_team_Brazil', 'home_team_British Virgin Islands',
 

In [49]:
# Storing the encoded features in a DataFrame with one column per generated feature name.
# The index of X_categorical is carried over explicitly: pd.concat(axis=1) aligns rows by
# index label, so the encoded rows must keep the labels of the rows they were computed from
# (a fresh 0..n-1 index would only match as long as the data still has its default index).
encoded_features = pd.DataFrame(ohe.transform(X_categorical),
                                columns = ohe.get_feature_names_out(),
                                index = X_categorical.index)

In [51]:
# Checking the result of the OHE: one row per match, one 0.0/1.0 column per home/away team
encoded_features

Unnamed: 0,home_team_Abkhazia,home_team_Afghanistan,home_team_Albania,home_team_Alderney,home_team_Algeria,home_team_American Samoa,home_team_Andalusia,home_team_Andorra,home_team_Angola,home_team_Anguilla,...,away_team_Yemen,away_team_Yemen DPR,away_team_Ynys Môn,away_team_Yorkshire,away_team_Yugoslavia,away_team_Zambia,away_team_Zanzibar,away_team_Zimbabwe,away_team_Åland,away_team_Åland Islands
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45355,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45356,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45357,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
# Putting the numerical columns and the one-hot encoded columns side by side
# (rows are matched by index label), then showing the combined feature matrix
feature_parts = [X_numerical, encoded_features]
X_preprocessed = pd.concat(feature_parts, axis='columns')
X_preprocessed

Unnamed: 0,friendly,neutral_encoded,home_team_Abkhazia,home_team_Afghanistan,home_team_Albania,home_team_Alderney,home_team_Algeria,home_team_American Samoa,home_team_Andalusia,home_team_Andorra,...,away_team_Yemen,away_team_Yemen DPR,away_team_Ynys Môn,away_team_Yorkshire,away_team_Yugoslavia,away_team_Zambia,away_team_Zanzibar,away_team_Zimbabwe,away_team_Åland,away_team_Åland Islands
0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45355,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45356,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45357,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45358,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 5.6 Encoding the Target

In [56]:
# Instantiating the LabelEncoder that will map the outcome labels of the target to integers
target_encoder = LabelEncoder()

In [57]:
# Learning the outcome labels from the target, then translating the target into integer codes
# stored in the new variable y_encoded (in the table below: Draw -> 0, Win -> 2)
target_encoder.fit(y)
y_encoded = target_encoder.transform(y)

In [60]:
# Showing the original labels next to their integer codes to verify the encoding
target_overview = {"target":y, "encoded_target":y_encoded}
pd.DataFrame(target_overview)

Unnamed: 0,target,encoded_target
0,Draw,0
1,Win,2
2,Win,2
3,Draw,0
4,Win,2
...,...,...
45355,Win,2
45356,Draw,0
45357,Win,2
45358,Draw,0


In [61]:
# Showing the shape of the encoded target: a one-dimensional array with one entry per match
y_encoded.shape

(45360,)

## 6. Logistic Regression

In [63]:
# Instantiating the Logistic Regression Model.
# max_iter=1000 sets the iteration limit of the solver explicitly
# (presumably raised so that the solver converges on the many one-hot columns; TODO confirm)
logreg_model = LogisticRegression(max_iter=1000)

In [64]:
# Splitting the whole dataset into a train set (70 %) and a test set (30 %).
# random_state fixes the shuffling, so the split and every score computed from it
# are reproducible when the notebook is re-run from a fresh kernel.
# NOTE: the split uses the string labels in y, not the integer codes in y_encoded.
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.3, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((31752, 621), (13608, 621), (31752,), (13608,))

In [66]:
# Cross-validating the model on the training set only; the test set stays untouched.
# cross_val_score returns one score per fold
number_of_folds = 5
cv_scores = cross_val_score(
    estimator=logreg_model,
    X=X_train,
    y=y_train,
    cv=number_of_folds,
)

In [67]:
# Printing the accuracy of each individual fold
print(f"Here are the accuracies for each of the {number_of_folds} folds: {cv_scores}")


# Printing the mean over all folds as a single summary figure
print(f"The average accuracy among these folds is {cv_scores.mean()}")

Here are the accuracies for each of the 5 folds: [0.57298063 0.5720359  0.57496063 0.56251969 0.57606299]
The average accuracy among these folds is 0.571711967983218
