In [18]:
!pip install pandas
!pip install numpy





In [19]:
import pandas as pd
import numpy as np

In [20]:
# load the dataset
data = pd.read_csv('test_dataset.csv', delimiter=';')

# display the first few rows to verify the data is loaded correctly
print("Initial Data Snapshot:")
print(data.head())

Initial Data Snapshot:
        id  team_a_mot_x_pos  team_b_mot_x_pos  team_a_ppg_dif_l4  \
0  2982690              0.47              0.43           5.680000   
1  2982691              0.51              0.62           0.946667   
2  2982692              0.72              0.54           1.080000   
3  2982693              0.62              0.51           2.840000   
4  2982694              0.41              0.64           1.280000   

   team_a_ppg_dif_l6  team_b_ppg_dif_l4  team_b_ppg_dif_l6  \
0            1.08857            0.56500           0.867273   
1            0.81600            0.81500           0.795000   
2            0.90000            2.17333           3.540000   
3            0.78750            2.08000           3.420000   
4            0.67800            2.88000           1.040000   

   team_a_ratio_shotsOnTarget_overall  team_a_ratio_shotsOnTarget_l4  \
0                            0.373297                       0.275000   
1                            0.385714        

In [21]:
# combine HomeWin, Draw, and AwayWin into a single Outcome column
def get_outcome(row):
    if row['HomeWin'] == 1:
        return 0  # Home Win
    elif row['Draw'] == 1:
        return 1  # Draw
    else:
        return 2  # Away Win
    
data['Outcome'] = data.apply(get_outcome, axis=1)


# Verify the new 'Outcome' column
print("\nOutcome Column Distribution:")
print(data['Outcome'].value_counts())


Outcome Column Distribution:
Outcome
2    14456
Name: count, dtype: int64


In [22]:
# drop unneccessary columns
columns_to_drop = ['HomeWin', 'Draw', 'AwayWin']
data = data.drop(columns=columns_to_drop, axis=1)

In [23]:
# Drop rows with any missing values
data = data.dropna()
# Verify that there are no missing values
print("Missing values after dropping rows:")
print(data.isnull().sum())

Missing values after dropping rows:
id                                    0
team_a_mot_x_pos                      0
team_b_mot_x_pos                      0
team_a_ppg_dif_l4                     0
team_a_ppg_dif_l6                     0
team_b_ppg_dif_l4                     0
team_b_ppg_dif_l6                     0
team_a_ratio_shotsOnTarget_overall    0
team_a_ratio_shotsOnTarget_l4         0
team_a_ratio_shotsOnTarget_l6         0
team_b_ratio_shotsOnTarget_overall    0
team_b_ratio_shotsOnTarget_l4         0
team_b_ratio_shotsOnTarget_l6         0
predict_xg_overall_team_a             0
predict_xg_overall_team_b             0
predict_xg_home_team_a                0
predict_xg_away_team_b                0
team_a_xg_last4_prematch              0
team_b_xg_last4_prematch              0
team_a_xga_last4_prematch             0
team_b_xga_last4_prematch             0
position_a_prematch                   0
position_b_prematch                   0
division                              0
team

In [24]:
!pip install scikit-learn



In [25]:
from sklearn.preprocessing import StandardScaler

In [26]:
# exclude 'id' and 'Outcome' from features to be scaled
features_to_scale = [col for col in data.columns if data[col].dtype in ['float64', 'int64'] and col not in ['id', 'Outcome']]
scaler = StandardScaler()
data[features_to_scale] = scaler.fit_transform(data[features_to_scale])

In [27]:
# prepare final datasets
X = data.drop(['id', 'Outcome'], axis=1)
y = data['Outcome']
ids = data['id']

In [28]:
data

Unnamed: 0,id,team_a_mot_x_pos,team_b_mot_x_pos,team_a_ppg_dif_l4,team_a_ppg_dif_l6,team_b_ppg_dif_l4,team_b_ppg_dif_l6,team_a_ratio_shotsOnTarget_overall,team_a_ratio_shotsOnTarget_l4,team_a_ratio_shotsOnTarget_l6,...,team_a_shots_overall_TSR,team_b_shots_overall_TSR,team_a_shots_overall_l4_TSR,team_b_shots_overall_l4_TSR,team_a_shots_overall_l6_TSR,team_b_shots_overall_l6_TSR,odds_ft_1,odds_ft_x,odds_ft_2,Outcome
0,2982690,0.162934,-0.004100,3.395043,-0.134039,-0.588403,-0.316314,-0.377833,-1.848289,-0.974883,...,0.005995,-0.005976,-0.564633,0.564810,-0.705347,0.705479,-0.033760,-0.448623,-0.370737,2
1,2982691,0.328167,0.778993,-0.281476,-0.359628,-0.380694,-0.376067,-0.334975,0.195736,0.216402,...,0.235991,-0.235972,-0.478888,0.479063,-0.056927,0.057056,-0.371064,-0.400949,0.214150,2
2,2982692,1.195642,0.449270,-0.177912,-0.290106,0.747852,1.893409,-0.393248,0.277092,0.039358,...,-0.086003,0.086023,0.035588,-0.035423,0.035704,-0.035576,-0.089978,-0.639320,-0.261071,2
3,2982693,0.782559,0.325623,1.189132,-0.383215,0.670310,1.794197,-0.315276,-0.832290,-0.082894,...,0.051994,-0.051975,-0.736125,0.736306,0.035704,-0.035576,-0.314846,-0.400949,0.031373,2
4,2982694,-0.084916,0.861424,-0.022566,-0.473841,1.334977,-0.173509,-0.434264,-0.419113,-0.166017,...,-0.040004,0.040023,0.978794,-0.978646,0.498861,-0.498735,-0.061869,-0.639320,-0.297626,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14451,7426907,0.617325,1.520871,-0.324381,-0.434115,-0.503935,-0.606735,0.091261,-0.339628,0.357435,...,0.051994,-0.051975,0.207080,-0.206918,-0.334822,0.334951,0.106783,-0.925364,-0.443848,2
14452,7426908,0.947792,0.325623,-0.314616,-0.033539,0.016718,-0.225477,0.114463,0.073703,0.289543,...,-0.132003,0.132022,0.207080,-0.206918,0.220967,-0.220839,0.106783,-0.734668,-0.498681,2
14453,7426909,0.328167,1.232363,0.049926,-0.503636,-0.226990,-0.262199,0.161018,0.802915,0.249196,...,0.097994,-0.097974,-0.736125,0.736306,-0.242190,0.242320,1.231127,0.218814,-0.827680,2
14454,7426910,1.526109,0.614131,-0.216748,-0.350405,2.265511,3.232772,0.097850,-0.003954,0.394487,...,0.649985,-0.649966,0.978794,-0.978646,1.147281,-1.147157,-0.556580,0.218814,0.188561,2


In [29]:
data.to_csv('test_dataset_preprocessed.csv', index=False)