In [1]:
!pip install pandas
!pip install numpy



In [2]:
import pandas as pd
import numpy as np

In [3]:
# load the dataset
data = pd.read_csv('test_dataset.csv', delimiter=';')

# display the first few rows to verify the data is loaded correctly
print("Initial Data Snapshot:")
print(data.head())

Initial Data Snapshot:
        id  team_a_mot_x_pos  team_b_mot_x_pos  team_a_ppg_dif_l4  \
0  2982690              0.47              0.43           5.680000   
1  2982691              0.51              0.62           0.946667   
2  2982692              0.72              0.54           1.080000   
3  2982693              0.62              0.51           2.840000   
4  2982694              0.41              0.64           1.280000   

   team_a_ppg_dif_l6  team_b_ppg_dif_l4  team_b_ppg_dif_l6  \
0            1.08857            0.56500           0.867273   
1            0.81600            0.81500           0.795000   
2            0.90000            2.17333           3.540000   
3            0.78750            2.08000           3.420000   
4            0.67800            2.88000           1.040000   

   team_a_ratio_shotsOnTarget_overall  team_a_ratio_shotsOnTarget_l4  \
0                            0.373297                       0.275000   
1                            0.385714        

In [4]:
# combine HomeWin, Draw, and AwayWin into a single Outcome column
def get_outcome(row):
    if row['HomeWin'] == 1:
        return 0  # Home Win
    elif row['Draw'] == 1:
        return 1  # Draw
    else:
        return 2  # Away Win
    
data['Outcome'] = data.apply(get_outcome, axis=1)


# Verify the new 'Outcome' column
print("\nOutcome Column Distribution:")
print(data['Outcome'].value_counts())


Outcome Column Distribution:
Outcome
2    14456
Name: count, dtype: int64


In [5]:
# drop unneccessary columns
columns_to_drop = ['HomeWin', 'Draw', 'AwayWin']
data = data.drop(columns=columns_to_drop, axis=1)

In [6]:
# Drop rows with any missing values
data = data.dropna()
# Verify that there are no missing values
print("Missing values after dropping rows:")
print(data.isnull().sum())

Missing values after dropping rows:
id                                    0
team_a_mot_x_pos                      0
team_b_mot_x_pos                      0
team_a_ppg_dif_l4                     0
team_a_ppg_dif_l6                     0
team_b_ppg_dif_l4                     0
team_b_ppg_dif_l6                     0
team_a_ratio_shotsOnTarget_overall    0
team_a_ratio_shotsOnTarget_l4         0
team_a_ratio_shotsOnTarget_l6         0
team_b_ratio_shotsOnTarget_overall    0
team_b_ratio_shotsOnTarget_l4         0
team_b_ratio_shotsOnTarget_l6         0
predict_xg_overall_team_a             0
predict_xg_overall_team_b             0
predict_xg_home_team_a                0
predict_xg_away_team_b                0
team_a_xg_last4_prematch              0
team_b_xg_last4_prematch              0
team_a_xga_last4_prematch             0
team_b_xga_last4_prematch             0
position_a_prematch                   0
position_b_prematch                   0
division                              0
team

In [7]:
!pip install scikit-learn



In [8]:
from sklearn.preprocessing import StandardScaler

In [26]:
# exclude 'id' and 'Outcome' from features to be scaled
features_to_scale = [col for col in data.columns if data[col].dtype in ['float64', 'int64'] and col not in ['id', 'Outcome']]
scaler = StandardScaler()
data[features_to_scale] = scaler.fit_transform(data[features_to_scale])

In [9]:
# prepare final datasets
X = data.drop(['id', 'Outcome'], axis=1)
y = data['Outcome']
ids = data['id']

In [10]:
data

Unnamed: 0,id,team_a_mot_x_pos,team_b_mot_x_pos,team_a_ppg_dif_l4,team_a_ppg_dif_l6,team_b_ppg_dif_l4,team_b_ppg_dif_l6,team_a_ratio_shotsOnTarget_overall,team_a_ratio_shotsOnTarget_l4,team_a_ratio_shotsOnTarget_l6,...,team_a_shots_overall_TSR,team_b_shots_overall_TSR,team_a_shots_overall_l4_TSR,team_b_shots_overall_l4_TSR,team_a_shots_overall_l6_TSR,team_b_shots_overall_l6_TSR,odds_ft_1,odds_ft_x,odds_ft_2,Outcome
0,2982690,0.47,0.43,5.680000,1.088570,0.565000,0.867273,0.373297,0.275000,0.353846,...,0.50,0.50,0.43,0.57,0.42,0.58,2.50,3.20,2.80,2
1,2982691,0.51,0.62,0.946667,0.816000,0.815000,0.795000,0.385714,0.514286,0.517241,...,0.55,0.45,0.44,0.56,0.49,0.51,1.90,3.25,4.40,2
2,2982692,0.72,0.54,1.080000,0.900000,2.173330,3.540000,0.368831,0.523810,0.492958,...,0.48,0.52,0.50,0.50,0.50,0.50,2.40,3.00,3.10,2
3,2982693,0.62,0.51,2.840000,0.787500,2.080000,3.420000,0.391421,0.393939,0.476190,...,0.51,0.49,0.41,0.59,0.50,0.50,2.00,3.25,3.90,2
4,2982694,0.41,0.64,1.280000,0.678000,2.880000,1.040000,0.356948,0.442308,0.464789,...,0.49,0.51,0.61,0.39,0.55,0.45,2.45,3.00,3.00,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14451,7426907,0.58,0.80,0.891429,0.726000,0.666667,0.516000,0.509202,0.451613,0.536585,...,0.51,0.49,0.52,0.48,0.46,0.54,2.75,2.70,2.60,2
14452,7426908,0.66,0.51,0.904000,1.210000,1.293330,0.977143,0.515924,0.500000,0.527273,...,0.47,0.53,0.52,0.48,0.52,0.48,2.75,2.90,2.45,2
14453,7426909,0.51,0.73,1.373330,0.642000,1.000000,0.932727,0.529412,0.585366,0.521739,...,0.52,0.48,0.41,0.59,0.47,0.53,4.75,3.90,1.55,2
14454,7426910,0.80,0.58,1.030000,0.827143,4.000000,5.160000,0.511111,0.490909,0.541667,...,0.64,0.36,0.61,0.39,0.62,0.38,1.57,3.90,4.33,2


In [11]:
data.to_csv('test_dataset_preprocessed.csv', index=False)