In [1]:
!pip install pandas
!pip install numpy



In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the semicolon-separated training file into a DataFrame
data = pd.read_csv('training_dataset.csv', sep=';')

# Show the leading rows as a quick check that parsing worked
print("Initial Data Snapshot:", data.head(), sep="\n")

Initial Data Snapshot:
     id  team_a_mot_x_pos  team_b_mot_x_pos  team_a_ppg_dif_l4  \
0  2215              0.15              0.12           0.888889   
1  2216              0.15              0.14           2.000000   
2  2217              0.12              0.11          12.000000   
3  2218              0.17              0.10           2.000000   
4  2219              0.13              0.11           0.333333   

   team_a_ppg_dif_l6  team_b_ppg_dif_l4  team_b_ppg_dif_l6  \
0                1.0                1.0                1.0   
1                1.0                0.6                1.0   
2                1.0                1.5                1.0   
3                1.0                1.6                1.0   
4                1.0                1.0                1.0   

   team_a_ratio_shotsOnTarget_overall  team_a_ratio_shotsOnTarget_l4  \
0                            0.545455                       0.534483   
1                            0.397059                       0.4

In [4]:
# combine HomeWin, Draw, and AwayWin into a single Outcome column
def get_outcome(row):
    """Map a match's one-hot result flags to a single class label.

    Parameters
    ----------
    row : mapping-like
        Must support ``row['HomeWin']``, ``row['Draw']`` and ``row['AwayWin']``
        (for example a DataFrame row passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    int
        0 for a home win, 1 for a draw, 2 for an away win. The flags are
        checked in that order, so the first one equal to 1 decides the label.

    Raises
    ------
    ValueError
        If none of the three flags equals 1 (all zero, or missing/NaN).
    """
    if row['HomeWin'] == 1:
        return 0  # Home Win
    if row['Draw'] == 1:
        return 1  # Draw
    if row['AwayWin'] == 1:
        return 2  # Away Win
    # A row with no flag set has no known result; labelling it as an away win
    # by default would silently corrupt the training target.
    raise ValueError(
        "Cannot determine outcome: none of HomeWin/Draw/AwayWin equals 1 "
        f"(got {row['HomeWin']!r}, {row['Draw']!r}, {row['AwayWin']!r})"
    )
    
# Label every match by running get_outcome over the frame row by row
data['Outcome'] = data.apply(get_outcome, axis=1)

# Print how many rows received each label
print("\nOutcome Column Distribution:", data['Outcome'].value_counts(), sep="\n")


Outcome Column Distribution:
Outcome
0    25087
2    17371
1    15361
Name: count, dtype: int64


In [5]:
# Remove the three result flags; the 'Outcome' column derived from them
# in the earlier cell is kept.
columns_to_drop = ['HomeWin', 'Draw', 'AwayWin']
data = data.drop(columns=columns_to_drop)

In [6]:
# Keep only complete observations: a row with at least one missing value is discarded
data = data.dropna(axis=0, how='any')
# Print the per-column count of missing values as a check
print("Missing values after dropping rows:", data.isna().sum(), sep="\n")

Missing values after dropping rows:
id                                    0
team_a_mot_x_pos                      0
team_b_mot_x_pos                      0
team_a_ppg_dif_l4                     0
team_a_ppg_dif_l6                     0
team_b_ppg_dif_l4                     0
team_b_ppg_dif_l6                     0
team_a_ratio_shotsOnTarget_overall    0
team_a_ratio_shotsOnTarget_l4         0
team_a_ratio_shotsOnTarget_l6         0
team_b_ratio_shotsOnTarget_overall    0
team_b_ratio_shotsOnTarget_l4         0
team_b_ratio_shotsOnTarget_l6         0
predict_xg_overall_team_a             0
predict_xg_overall_team_b             0
predict_xg_home_team_a                0
predict_xg_away_team_b                0
team_a_xg_last4_prematch              0
team_b_xg_last4_prematch              0
team_a_xga_last4_prematch             0
team_b_xga_last4_prematch             0
position_a_prematch                   0
position_b_prematch                   0
division                              0
team

In [7]:
!pip install scikit-learn



In [8]:
from sklearn.preprocessing import StandardScaler

In [9]:
# Pick the float64/int64 columns to standardise, leaving out the 'id'
# column and the 'Outcome' label.
features_to_scale = [
    name
    for name, dtype in data.dtypes.items()
    if dtype in ['float64', 'int64'] and name not in ['id', 'Outcome']
]
# NOTE(review): the scaler is fitted on every row of `data`; if a train/test
# split happens downstream, confirm this does not leak test-set statistics.
scaler = StandardScaler()
data[features_to_scale] = scaler.fit_transform(data[features_to_scale])

In [10]:
# Split the processed frame into row identifiers, target labels and model inputs
ids = data['id']
results = data['Outcome']
training = data.drop(columns=['id', 'Outcome'])

In [11]:
# Display the feature frame built above ('id' and 'Outcome' columns removed)
training

Unnamed: 0,team_a_mot_x_pos,team_b_mot_x_pos,team_a_ppg_dif_l4,team_a_ppg_dif_l6,team_b_ppg_dif_l4,team_b_ppg_dif_l6,team_a_ratio_shotsOnTarget_overall,team_a_ratio_shotsOnTarget_l4,team_a_ratio_shotsOnTarget_l6,team_b_ratio_shotsOnTarget_overall,...,team_a_shots_overall_l4_TSR,team_b_shots_overall_l4_TSR,team_a_shots_overall_l6_TSR,team_b_shots_overall_l6_TSR,odds_ft_1,odds_ft_x,odds_ft_2,profit_1,profit_x,profit_2
0,-1.275790,-1.403450,-0.358936,-0.236495,-0.255679,-0.226892,1.547374,0.911481,1.101517,0.768118,...,0.231003,-0.230831,0.149022,-0.148900,-0.091460,-0.023157,0.037143,-0.757734,1.732008,-0.552597
1,-1.275790,-1.318169,0.509232,-0.236495,-0.581872,-0.226892,-0.611188,-0.015057,-0.472370,0.060846,...,-1.603047,1.603212,-1.032534,1.032630,0.289390,0.498795,-0.225744,-0.757734,-0.583311,0.406433
2,-1.403623,-1.446091,8.322747,-0.236495,0.152062,-0.226892,0.040486,0.155312,0.002789,-1.050133,...,-2.182221,2.182383,-1.859624,1.859700,0.371935,0.606046,-0.231643,-0.757734,-0.583311,0.362841
3,-1.190568,-1.488732,0.509232,-0.236495,0.233610,-0.226892,0.047002,-0.810103,0.007541,0.077031,...,-0.734287,0.734455,-0.914379,0.914477,-0.025261,-0.337759,-0.111152,-0.757734,1.456673,-0.552597
4,-1.361012,-1.446091,-0.793020,-0.236495,-0.255679,-0.226892,1.221864,1.381518,0.864176,1.451360,...,1.003234,-1.003060,0.976112,-0.975971,-0.053865,-0.223358,-0.075764,-0.757734,1.556795,-0.552597
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57814,1.025207,0.558018,-0.453393,-0.030524,-0.610414,0.144105,-1.378457,0.569065,-0.336857,-0.909085,...,-0.541229,0.541397,-0.678067,0.678171,-0.036703,-0.452160,-0.119578,-0.757734,1.356551,-0.552597
57815,1.025207,-0.209513,0.220132,-0.166330,-0.507313,-0.465860,-0.924115,-1.361764,-1.653294,-1.207253,...,-0.830816,0.830983,-0.796223,0.796324,-0.065307,-0.523660,-0.026893,0.986083,-0.583311,-0.552597
57816,0.386041,-0.209513,-0.548941,-0.510595,0.763674,0.196326,-1.241551,0.453599,0.211494,-0.628672,...,-0.444700,0.444869,-0.441756,0.441865,-0.044875,-0.487910,-0.094301,-0.757734,1.325263,-0.552597
57817,-0.082680,0.856502,-0.334627,-0.125587,-0.149667,0.796087,-0.756094,-0.342876,-1.058743,-0.645197,...,0.906705,-0.906531,0.267178,-0.267053,-0.034251,-0.344909,-0.133902,1.294293,-0.583311,-0.552597


In [12]:
# Display the target labels taken from the 'Outcome' column
results

0        1
1        2
2        2
3        1
4        1
        ..
57814    1
57815    0
57816    1
57817    0
57818    0
Name: Outcome, Length: 57783, dtype: int64

In [13]:
# Display the row identifiers taken from the 'id' column
ids

0           2215
1           2216
2           2217
3           2218
4           2219
          ...   
57814    2982685
57815    2982686
57816    2982687
57817    2982688
57818    2982689
Name: id, Length: 57783, dtype: int64

In [14]:
# Write the whole processed frame ('id' and 'Outcome' included) without the
# index column. NOTE: no separator is passed, so to_csv's default ',' is
# used, unlike the ';'-separated input file.
data.to_csv(path_or_buf='dataset_processed.csv', index=False)