In [107]:
# rerun if needed
# %pip install pandas matplotlib seaborn

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
data = pd.read_csv('teams.csv')
# all numeric columns
numeric_data = data.select_dtypes(include='number')
print(data.columns.tolist())

['team_id', 'team_name', 'season', 'league_id', 'team_form', 'games_played_home', 'games_played_away', 'wins_home', 'losses_home', 'draws_home', 'wins_away', 'losses_away', 'draws_away', 'stadium_capacity', 'team_country', 'goals_scored_0_15', 'goals_scored_16_30', 'goals_scored_31_45', 'goals_scored_46_60', 'goals_scored_61_75', 'goals_scored_76_90', 'goals_scored_91_105', 'goals_scored_106_120', 'goals_conceded_0_15', 'goals_conceded_16_30', 'goals_conceded_31_45', 'goals_conceded_46_60', 'goals_conceded_61_75', 'goals_conceded_76_90', 'goals_conceded_91_105', 'goals_conceded_106_120', 'clean_sheets', 'failed_to_score', 'penalty_success_rate', 'over_0_5', 'under_0_5', 'over_1_5', 'under_1_5', 'over_2_5', 'under_2_5', 'over_3_5', 'under_3_5', 'over_4_5', 'under_4_5', 'yellow_cards_0_15', 'yellow_cards_16_30', 'yellow_cards_31_45', 'yellow_cards_46_60', 'yellow_cards_61_75', 'yellow_cards_76_90', 'yellow_cards_91_105', 'yellow_cards_106_120', 'red_cards_0_15', 'red_cards_16_30', 'red_c

In [108]:
# Count the number of missing values in each column
missing_values = data.isnull().sum()
# Filter and print only the columns with missing values greater than zero
non_zero_missing = missing_values[missing_values > 0]
print(non_zero_missing)

team_form           206
team_country        666
lineups_per_game    666
dtype: int64


In [109]:
# after finding 206 null team_forms - we'll delte those rows - this teams weren't in the league at this season

# Assuming 'df' is your DataFrame
updated_data = data.loc[~((data['games_played_home'] == 0) & 
                       (data['games_played_away'] == 0) & 
                       (data['wins_home'] == 0) & 
                       (data['losses_home'] == 0) & 
                       (data['draws_home'] == 0) & 
                       (data['wins_away'] == 0) & 
                       (data['losses_away'] == 0) & 
                       (data['draws_away'] == 0) & 
                       data['team_form'].isnull())]

# remove the column lineups_per_game - it includes at each fixture
updated_data = updated_data.drop(columns=['lineups_per_game'])

#checking
# Count the number of missing values in each column
missing_values = updated_data.isnull().sum()
# Filter and print only the columns with missing values greater than zero
non_zero_missing = missing_values[missing_values > 0]
print(non_zero_missing)




team_country    460
dtype: int64


In [110]:
# create dict for each league and it's country to fill team_country

team_country_dict = {39: 'England', 140:'Spain'}
updated_data['team_country'] = updated_data['league_id'].map(team_country_dict)

#checking
# Count the number of missing values in each column
missing_values = updated_data.isnull().sum()
# Filter and print only the columns with missing values greater than zero
non_zero_missing = missing_values[missing_values > 0]
print(non_zero_missing)


Series([], dtype: int64)


In [111]:
# check how many teams in each season
# Group by 'season' and count unique 'team_id'
team_count_per_season = updated_data.groupby('season')['team_id'].nunique()

# Print the result
print(team_count_per_season)

season
2010    20
2011    20
2012    20
2013    20
2014    20
2015    20
2016    20
2017    40
2018    40
2019    40
2020    40
2021    40
2022    40
2023    40
2024    40
Name: team_id, dtype: int64


Encode all the non numeric columns

In [112]:
# print all the non numeric columns
# Select columns that are non-numeric
non_numeric_data = updated_data.select_dtypes(exclude='number')

# Print the non-numeric data
print(non_numeric_data.columns.tolist())

# drop the column of team_name - we have team_id it's enough
updated_data = updated_data.drop(columns=['team_name'])

# drop games_played_home  games_played_away - not relevant - all teams play the same amount of games + it will have good indicator on the fixtures table
updated_data = updated_data.drop(columns=['games_played_home','games_played_away'])


['team_name', 'team_form', 'team_country']


In [113]:
# encode the team_country by get_dummies
dummies = pd.get_dummies(updated_data['team_country'], prefix='team_country')
#  include these dummy variables back into original DataFrame and drop the original column
updated_data = pd.concat([updated_data, dummies], axis=1)
updated_data.drop('team_country', axis=1, inplace=True)
# convert in into int and not true/false
dummy_cols = [col for col in updated_data.columns if any(prefix in col for prefix in ['team_country_'])]
updated_data[dummy_cols] = updated_data[dummy_cols].astype(int)

# To see the encoded DataFrame
print(updated_data.columns.tolist())


['team_id', 'season', 'league_id', 'team_form', 'wins_home', 'losses_home', 'draws_home', 'wins_away', 'losses_away', 'draws_away', 'stadium_capacity', 'goals_scored_0_15', 'goals_scored_16_30', 'goals_scored_31_45', 'goals_scored_46_60', 'goals_scored_61_75', 'goals_scored_76_90', 'goals_scored_91_105', 'goals_scored_106_120', 'goals_conceded_0_15', 'goals_conceded_16_30', 'goals_conceded_31_45', 'goals_conceded_46_60', 'goals_conceded_61_75', 'goals_conceded_76_90', 'goals_conceded_91_105', 'goals_conceded_106_120', 'clean_sheets', 'failed_to_score', 'penalty_success_rate', 'over_0_5', 'under_0_5', 'over_1_5', 'under_1_5', 'over_2_5', 'under_2_5', 'over_3_5', 'under_3_5', 'over_4_5', 'under_4_5', 'yellow_cards_0_15', 'yellow_cards_16_30', 'yellow_cards_31_45', 'yellow_cards_46_60', 'yellow_cards_61_75', 'yellow_cards_76_90', 'yellow_cards_91_105', 'yellow_cards_106_120', 'red_cards_0_15', 'red_cards_16_30', 'red_cards_31_45', 'red_cards_46_60', 'red_cards_61_75', 'red_cards_76_90', '

to deal with team form - we will convert each form to ternary num and then to decimal - L=0,D=1,W=2

In [114]:
# dict of LDW values
form_dict = {'L':0, 'D':1, 'W':3}

def process_team_form(row):
    # Replace each character in the string according to the dictionary
    form_numeric_string = ''.join(str(form_dict[char]) for char in row['team_form'])
    return int(form_numeric_string)  

updated_data_copy = updated_data.copy()

# Apply the function to the DataFrame
updated_data_copy['team_form_ternary'] = updated_data_copy.apply(process_team_form, axis=1)

# update it on the origin df
# Apply the function to the team_form column and create a new team_form_decimal column
updated_data['team_form_ternary'] = updated_data_copy[['team_form_ternary']]

# Drop the original team_form column
updated_data.drop('team_form', axis=1, inplace=True)

print(updated_data['team_form_ternary'])





0      10133133000100333013003331103103013330
2      30101313000330003300003313311033131130
3         10013013130300330013110113000303000
4        331103330310113333031330301330030133
5       3031033331331333130131131133333130013
                        ...                  
658    33100033303133130333333003313333301303
662    31133333330333333331310313130133031333
663    31003103101330000331131310011301003330
664    31303010110031003301000030111010300031
665    33100130303313133333331333333033031103
Name: team_form_ternary, Length: 460, dtype: object


normalize the integer values

In [115]:
print(updated_data.head())

# normalize stadium capacity with min-max - between 0-1 - not standartization cause I can't assume it normally distribute
from sklearn.preprocessing import MinMaxScaler

# Create a MinMaxScaler object
min_max_scaler = MinMaxScaler()

# Fit and transform 'stadium_capacity'
updated_data['stadium_capacity_normalized'] = min_max_scaler.fit_transform(updated_data[['stadium_capacity']])

print(updated_data['stadium_capacity_normalized'])


   team_id  season  league_id  wins_home  losses_home  draws_home  wins_away  \
0       39    2018         39         10            5           4          6   
2       44    2019         39          8            7           4          7   
3       44    2020         39          4            9           6          6   
4       48    2020         39         10            5           4          9   
5       33    2020         39          9            6           4         12   

   losses_away  draws_away  stadium_capacity  ...  red_cards_16_30  \
0            8           5             34624  ...                1   
2            7           5             22546  ...                0   
3           10           3             22546  ...                0   
4            6           4             62500  ...                0   
5            0           7             76212  ...                1   

   red_cards_31_45  red_cards_46_60  red_cards_61_75  red_cards_76_90  \
0                0       