In [1]:
# rerun if needed
# %pip install pandas matplotlib seaborn

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
# Load teams.csv (path is relative to the notebook's working directory).
data = pd.read_csv("teams.csv")

# Numeric-only view of the raw table.
numeric_data = data.select_dtypes(include=["number"])

# Print every column name so the full schema is visible up front.
print(list(data.columns))

['team_id', 'team_name', 'season', 'league_id', 'team_form', 'games_played_home', 'games_played_away', 'wins_home', 'losses_home', 'draws_home', 'wins_away', 'losses_away', 'draws_away', 'stadium_capacity', 'team_country', 'goals_scored_0_15', 'goals_scored_16_30', 'goals_scored_31_45', 'goals_scored_46_60', 'goals_scored_61_75', 'goals_scored_76_90', 'goals_scored_91_105', 'goals_scored_106_120', 'goals_conceded_0_15', 'goals_conceded_16_30', 'goals_conceded_31_45', 'goals_conceded_46_60', 'goals_conceded_61_75', 'goals_conceded_76_90', 'goals_conceded_91_105', 'goals_conceded_106_120', 'clean_sheets', 'failed_to_score', 'penalty_success_rate', 'over_0_5', 'under_0_5', 'over_1_5', 'under_1_5', 'over_2_5', 'under_2_5', 'over_3_5', 'under_3_5', 'over_4_5', 'under_4_5', 'yellow_cards_0_15', 'yellow_cards_16_30', 'yellow_cards_31_45', 'yellow_cards_46_60', 'yellow_cards_61_75', 'yellow_cards_76_90', 'yellow_cards_91_105', 'yellow_cards_106_120', 'red_cards_0_15', 'red_cards_16_30', 'red_c

In [2]:
# Per-column tally of null entries in the raw table
missing_values = data.isna().sum()
# Keep only the columns that actually contain nulls, then show them
non_zero_missing = missing_values.loc[missing_values.gt(0)]
print(non_zero_missing)

team_form           206
team_country        666
lineups_per_game    666
dtype: int64


In [5]:
# team_form is null in 206 rows. Rows that have a null team_form AND show no
# matches at all (zero games, wins, losses and draws, both home and away) are
# removed; per the original note, those teams were not in the league that season.
match_count_columns = [
    'games_played_home', 'games_played_away',
    'wins_home', 'losses_home', 'draws_home',
    'wins_away', 'losses_away', 'draws_away',
]
no_matches_played = data[match_count_columns].eq(0).all(axis=1)
is_empty_season = no_matches_played & data['team_form'].isna()
updated_data = data.loc[~is_empty_season]

# Drop lineups_per_game as well.
# NOTE(review): the original note suggests this information lives at fixture
# level rather than per team-season -- confirm.
updated_data = updated_data.drop(columns=['lineups_per_game'])

# Sanity check: which columns of the cleaned frame still contain nulls?
missing_values = updated_data.isna().sum()
non_zero_missing = missing_values.loc[lambda counts: counts > 0]
print(non_zero_missing)




team_country    460
dtype: int64


In [7]:
# Country of each league, keyed by league_id; used to fill in team_country.
team_country_dict = {39: 'England', 140: 'Spain'}

# Fill only the entries that are missing. Assigning the mapped Series directly
# would overwrite any country already present in the column (a team's country
# can differ from its league's) and would turn rows whose league_id is not in
# the dict into NaN even if they previously had a value. fillna leaves existing
# values untouched and makes the cell safe to re-run.
league_country = updated_data['league_id'].map(team_country_dict)
updated_data['team_country'] = updated_data['team_country'].fillna(league_country)

# Verify the fill: list any column that still has null entries
missing_values = updated_data.isna().sum()
has_nulls = missing_values > 0
non_zero_missing = missing_values[has_nulls]
print(non_zero_missing)


Series([], dtype: int64)
     team_id        team_name  season  league_id  \
658      536          Sevilla    2020        140   
662      530  Atletico Madrid    2020        140   
663      548    Real Sociedad    2018        140   
664      546           Getafe    2020        140   
665      529        Barcelona    2020        140   

                                  team_form  games_played_home  \
658  WWDLLLWWWLWDWWDWLWWWWWWLLWWDWWWWWLDWLW                 19   
662  WDDWWWWWWWLWWWWWWWWDWDLWDWDWLDWWLWDWWW                 19   
663  WDLLWDLWDLDWWLLLLWWDDWDWDLLDDWLDLLWWWL                 19   
664  WDWLWLDLDDLLWDLLWWLDLLLLWLDDDLDLWLLLWD                 19   
665  WWDLLDWLWLWWDWDWWWWWWWDWWWWWWLWWLWDDLW                 19   

     games_played_away  wins_home  losses_home  draws_home  ...  \
658                 19         14            4           1  ...   
662                 19         15            1           3  ...   
663                 19          7            6           6  ... 

In [8]:
# Number of distinct team_id values recorded for each season
team_count_per_season = (
    updated_data
    .groupby('season')['team_id']
    .nunique()
)

print(team_count_per_season)

season
2010    20
2011    20
2012    20
2013    20
2014    20
2015    20
2016    20
2017    40
2018    40
2019    40
2020    40
2021    40
2022    40
2023    40
2024    40
Name: team_id, dtype: int64


Encode all the non-numeric columns

In [10]:
# Columns of the cleaned frame whose dtype is not numeric
non_numeric_data = updated_data.select_dtypes(exclude=['number'])

# List their names
print(list(non_numeric_data.columns))

# team_name is slated for removal: team_id already identifies the team.
# NOTE(review): nothing in this cell performs the drop -- confirm a later cell does.




['team_name', 'team_form', 'team_country']
