# Preprocessing for ML2 semestral assignemnt


## Dictionaries


In [2]:
import pandas as pd
import os
import glob
import re


## 1. Loading data


In [3]:
DATA_DIR = "data/"
all_data = {}
csv_files = glob.glob(os.path.join(DATA_DIR, "**", "*.csv"), recursive=True)

def generate_renamed_filename(file_path, data_dir):
    rel_path = os.path.relpath(file_path, data_dir)
    path_parts = rel_path.split(os.sep)

    if len(path_parts) >= 3:
        country, league, season_file = path_parts[0], path_parts[1], path_parts[2]
        season = os.path.splitext(season_file)[0]
        return f"{country}_{league}_{season}.csv"
    elif len(path_parts) == 2:
        country, season_file = path_parts[0], path_parts[1]
        season = os.path.splitext(season_file)[0]
        return f"{country}_{season}.csv"
    else:
        return os.path.basename(file_path)

for file_path in csv_files:
    renamed_filename = generate_renamed_filename(file_path, DATA_DIR)
    try:
        df = pd.read_csv(file_path)
        all_data[renamed_filename] = df
    except Exception as e:
        print(f"Error loading {os.path.basename(file_path)}: {e}")

print(f"Files created: {len(all_data)}")
files = list(all_data.keys())
if files:
    print(f"First file: {files[0]}")
    print(f"Last file: {files[-1]}")


Files created: 126
First file: england_0_2122.csv
Last file: greece_1_1920.csv


## 2. Creating subsets - 21 leagues

In [4]:
global_df = pd.concat(all_data.values(), ignore_index=True)
print("Global dataframe created with shape:", global_df.shape)

Global dataframe created with shape: (42593, 137)


In [5]:
league_subsets = {}
for filename, df in all_data.items():
    # Extract league identifier from filename (e.g., 'turkey_1', 'belgium_1')
    league_match = re.match(r'(.+_\d+)_', filename)
    if league_match:
        league_name = league_match.group(1)
        if league_name not in league_subsets:
            league_subsets[league_name] = []
        league_subsets[league_name].append(df)

combined_league_data = {}
for league_name, dfs in league_subsets.items():
    combined_league_data[league_name] = pd.concat(dfs, ignore_index=True)

print("Created league subsets:")
for league_name in combined_league_data.keys():
    print(league_name)

Created league subsets:
england_0
england_1
england_3
england_2
france_1
france_2
belgium_1
turkey_1
netherlands_1
spain_1
spain_2
germany_1
germany_2
portugal_1
scotland_0
scotland_1
scotland_3
scotland_2
italy_1
italy_2
greece_1


## 3. Finding missing values

In [6]:
missing_values_sum = global_df.isnull().sum()
print("Total missing values per column across all files:")
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(missing_values_sum)

Total missing values per column across all files:
Div                 0
Date                0
Time                0
HomeTeam            0
AwayTeam            0
FTHG                0
FTAG                0
FTR                 0
HTHG               41
HTAG               41
HTR                41
Referee         26394
HS                 44
AS                 44
HST                44
AST                44
HF                 46
AF                 46
HC                 44
AC                 44
HY                 41
AY                 41
HR                 41
AR                 41
B365H             120
B365D             120
B365A             120
BWH              3020
BWD              3020
BWA              3020
IWH             10864
IWD             10864
IWA             10864
PSH               197
PSD               197
PSA               197
WHH              3151
WHD              3151
WHA              3151
VCH              7372
VCD              7372
VCA              7372
MaxH               46
MaxD

In [7]:
missing_values_by_file = {}
for filename, df in all_data.items():
    missing_values_by_file[filename] = df.isnull().sum()

missing_values_df = pd.DataFrame(missing_values_by_file).T

# Define the desired order of columns
desired_columns = ['Date', 'Div', 'FTAG', 'FTHG', 'FTR', 'HC', 'HF', 'HR', 'HS', 'HST', 'HTAG', 'HTHG', 'HTR', 'HY', 'HomeTeam']

# Get the remaining columns
remaining_columns = [col for col in missing_values_df.columns if col not in desired_columns]

# Create the new column order
new_column_order = desired_columns + remaining_columns

# Reindex the dataframe with the new column order
missing_values_df = missing_values_df[new_column_order]

print("Missing values per file and column:")
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(missing_values_df)

Missing values per file and column:


Unnamed: 0,Date,Div,FTAG,FTHG,FTR,HC,HF,HR,HS,HST,HTAG,HTHG,HTR,HY,HomeTeam,1XBA,1XBCA,1XBCD,1XBCH,1XBD,1XBH,AC,AF,AHCh,AHh,AR,AS,AST,AY,Avg<2.5,Avg>2.5,AvgA,AvgAHA,AvgAHH,AvgC<2.5,AvgC>2.5,AvgCA,AvgCAHA,AvgCAHH,AvgCD,AvgCH,AvgD,AvgH,AwayTeam,B365<2.5,B365>2.5,B365A,B365AHA,B365AHH,B365C<2.5,B365C>2.5,B365CA,B365CAHA,B365CAHH,B365CD,B365CH,B365D,B365H,BFA,BFCA,BFCD,BFCH,BFD,BFE<2.5,BFE>2.5,BFEA,BFEAHA,BFEAHH,BFEC<2.5,BFEC>2.5,BFECA,BFECAHA,BFECAHH,BFECD,BFECH,BFED,BFEH,BFH,BWA,BWCA,BWCD,BWCH,BWD,BWH,IWA,IWCA,IWCD,IWCH,IWD,IWH,Max<2.5,Max>2.5,MaxA,MaxAHA,MaxAHH,MaxC<2.5,MaxC>2.5,MaxCA,MaxCAHA,MaxCAHH,MaxCD,MaxCH,MaxD,MaxH,P<2.5,P>2.5,PAHA,PAHH,PC<2.5,PC>2.5,PCAHA,PCAHH,PSA,PSCA,PSCD,PSCH,PSD,PSH,Referee,Time,Unnamed: 105,Unnamed: 106,Unnamed: 119,Unnamed: 120,Unnamed: 121,VCA,VCCA,VCCD,VCCH,VCD,VCH,WHA,WHCA,WHCD,WHCH,WHD,WHH
england_0_2122.csv,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
england_0_2324.csv,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,2.0,12.0,12.0,12.0,2.0,2.0,182.0,182.0,182.0,182.0,182.0,182.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,8.0,8.0,0.0,0.0,7.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
england_0_2425.csv,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,9.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,141.0,141.0,141.0,141.0,141.0,141.0,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,91.0,91.0,91.0,91.0,91.0,91.0
england_0_2021.csv,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
england_0_2223.csv,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
england_0_1920.csv,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
england_1_2122.csv,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
england_1_2324.csv,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,2.0,14.0,14.0,14.0,2.0,2.0,240.0,240.0,240.0,240.0,240.0,240.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
england_1_2425.csv,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,5.0,0.0,27.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,184.0,184.0,184.0,184.0,184.0,184.0,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,97.0,96.0,96.0,96.0,97.0,97.0
england_1_2021.csv,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,1.0,1.0,0.0,0.0,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 4. Missing values

### 4.1 Dealing with missing values

1.   Položka seznamu
2.   Položka seznamu



## 5. Editing colums

### 5.1 Deleting colums

### 5.2 Agregating colums

### 5.3 Creating new columns

### 5.4 Recreating columns - Rolling statistics

## 6. dealing with outliers

## 7. Encoding

### 7.1 Cheking data types

### 7.2 Label encoding

### 7.3 Ordinal encoding

### 7.4. One-hot encoding

## 8. Creating current dataset and additional variables dataset

## 9. Train-tests split