In [347]:
import pandas as pd

# Load CSV files
match_results_df = pd.read_csv('csv/match_results.csv')
goal_events_df = pd.read_csv('csv/goal_events_verwerkt.csv')
standings_df = pd.read_csv('csv/standings.csv')

team_name_mapping = {
    'Antwerp FC': 'R Antwerp FC',
    'Cercle': 'Cercle Brugge',
    'Club Brugge': 'Club Brugge',
    'Eupen': 'KAS Eupen',
    'FCB': 'Club Brugge',
    'Genk': 'KRC Genk',
    'Gent': 'KAA Gent',
    'KAA Gent': 'KAA Gent',
    'KAS Eupen': 'KAS Eupen',
    'KV Kortrijk': 'KV Kortrijk',
    'KV Mechelen': 'KV Mechelen',
    'KVC Westerlo': 'KVC Westerlo',
    'KVCWes': 'KVC Westerlo',
    'KVK': 'KV Kortrijk',
    'KVM': 'KV Mechelen',
    'OH Leuven': 'Oud-Heverlee Leuven',
    'OHL': 'Oud-Heverlee Leuven',
    'R Charleroi SC': 'R Charleroi SC',
    'RAFC': 'R Antwerp FC',
    'RCSC': 'R Charleroi SC',
    'RSC Anderlecht': 'RSC Anderlecht',
    'RSCA': 'RSC Anderlecht',
    'RUSG': 'Union Saint-Gilloise',
    'RWDM': 'RWD Molenbeek',
    'SL': 'Standard Luik',
    'STVV': 'Sint-Truidense VV',
    'St-Truidense VV': 'Sint-Truidense VV',
    'Standard Luik': 'Standard Luik',
    'Union SG': 'Union Saint-Gilloise'
}

goal_events_df = goal_events_df[goal_events_df['valid_goal'] == True]

In [348]:
# Calculate goals for home and away teams
goals_home_team = goal_events_df[goal_events_df['goal_team'] == goal_events_df['home_team']].groupby('match_id').size()
goals_away_team = goal_events_df[goal_events_df['goal_team'] == goal_events_df['away_team']].groupby('match_id').size()

# Convert Series to DataFrame and reset index
goals_home_team_df = goals_home_team.reset_index(name='calculated_goals_home_team')
goals_away_team_df = goals_away_team.reset_index(name='calculated_goals_away_team')

# Replace NaN values in goal columns with 0
merged_df = pd.merge(match_results_df, goals_home_team_df, on='match_id', how='outer')
merged_df = pd.merge(merged_df, goals_away_team_df, on='match_id', how='outer')
merged_df['calculated_goals_home_team'] = merged_df['calculated_goals_home_team'].fillna(0)
merged_df['calculated_goals_away_team'] = merged_df['calculated_goals_away_team'].fillna(0)


# Calculate discrepancies in goals vs result
discrepancies = merged_df[
    (merged_df['calculated_goals_home_team'] != merged_df['result_home_team']) |
    (merged_df['calculated_goals_away_team'] != merged_df['result_away_team'])
]

# Select only relevant columns for display
discrepancies = discrepancies[['date', 'match_id', 'home_team', 'away_team', 'calculated_goals_home_team', 'result_home_team', 'calculated_goals_away_team', 'result_away_team']]
discrepancies.head(50)

Unnamed: 0,date,match_id,home_team,away_team,calculated_goals_home_team,result_home_team,calculated_goals_away_team,result_away_team
132,2007/12/08,79993,FC Brussel,KRC Genk,0.0,0,0.0,5
154,2008/01/19,80019,RSC Anderlecht,KV Mechelen,0.0,1,1.0,0


In [349]:
# controle van aantal_wedstrijden
# Geen enkel record met meer wedstrijden dan speeldagen
controle_aantal_wedstrijden = standings_df[standings_df.Day < standings_df.Played]
controle_aantal_wedstrijden.head(30)

Unnamed: 0,Rank,Club,Club Number,Played,Wins,Draws,Losses,Goals,Goal_Difference,Points,Season,Day


In [350]:
# controle van aantal_wedstrijden
# wel records met minder wedstrijden dan speeldagen, maar bij nazicht van een aantal records klopt dit
controle_aantal_wedstrijden = standings_df[standings_df.Day > standings_df.Played]
controle_aantal_wedstrijden.Season.value_counts()

Series([], Name: Season, dtype: int64)

In [351]:
# geen records met standings_df.Played != (standings_df.Wins + standings_df.Losses + standings_df.Draws
controle_aantal_wedstrijden = standings_df[standings_df.Played != (standings_df.Wins + standings_df.Losses + standings_df.Draws)]
controle_aantal_wedstrijden.count()

Rank               0
Club               0
Club Number        0
Played             0
Wins               0
Draws              0
Losses             0
Goals              0
Goal_Difference    0
Points             0
Season             0
Day                0
dtype: int64

In [352]:
# Split the 'Goals' column into two separate columns
standings_df[['Goals_Scored', 'Goals_Conceded']] = standings_df['Goals'].str.split(':', expand=True)

standings_df['Goals_Scored'] = pd.to_numeric(standings_df['Goals_Scored'])
standings_df['Goals_Conceded'] = pd.to_numeric(standings_df['Goals_Conceded'])

# geen records met (uitslagen.doelpunten_voor - uitslagen.doelpunten_tegen) != (uitslagen.verschil)
standings_df['Calculated_Goal_Difference'] = standings_df['Goals_Scored'] - standings_df['Goals_Conceded']
controle_aantal_wedstrijden = standings_df[standings_df['Calculated_Goal_Difference'] != standings_df['Goal_Difference']]
controle_aantal_wedstrijden.count()

Rank                          0
Club                          0
Club Number                   0
Played                        0
Wins                          0
Draws                         0
Losses                        0
Goals                         0
Goal_Difference               0
Points                        0
Season                        0
Day                           0
Goals_Scored                  0
Goals_Conceded                0
Calculated_Goal_Difference    0
dtype: int64

In [353]:
# lijnen zoeken waarvoor punten geen : bevat (seizoen 1964 + vanaf 1995)
controle_klassement = standings_df[(standings_df.Goals.str.count(':') == 0)]
controle_klassement.head()

Unnamed: 0,Rank,Club,Club Number,Played,Wins,Draws,Losses,Goals,Goal_Difference,Points,Season,Day,Goals_Scored,Goals_Conceded,Calculated_Goal_Difference


In [354]:
# het totaal aantal gewonnen matchen per seizoen en per ploeg
points_df = standings_df.groupby(['Season','Day','Club Number', 'Points'])[['Wins','Draws','Losses']].sum()
points_df = points_df.reset_index()
points_df = points_df.sort_values(['Season','Day','Wins','Draws','Losses'], ascending = [True, True, False, False, False])
points_df.head(30)

Unnamed: 0,Season,Day,Club Number,Points,Wins,Draws,Losses
2,2007,1,16,3,1,0,0
5,2007,1,290,3,1,0,0
6,2007,1,3,3,1,0,0
7,2007,1,35,3,1,0,0
9,2007,1,7,3,1,0,0
10,2007,1,867,3,1,0,0
13,2007,1,Germ. Beerschot,3,1,0,0
14,2007,1,KSC Lokeren,3,1,0,0
1,2007,1,134,1,0,1,0
15,2007,1,R Charleroi SC,1,0,1,0


In [355]:
points_df['Points_Check'] = points_df['Wins'] * 3 + points_df['Draws'] + points_df['Losses'] * 0
discrepancies_df = points_df[points_df['Points'] != points_df['Points_Check']]
discrepancies_df.head(30)

Unnamed: 0,Season,Day,Club Number,Points,Wins,Draws,Losses,Points_Check


In [356]:
match_results_df['date'] = pd.to_datetime(match_results_df['date'])
goal_events_df['date'] = pd.to_datetime(goal_events_df['date'])

merged_df = pd.merge(goal_events_df, match_results_df, on='match_id', suffixes=('_goal', '_match'))

# Check for discrepancies in dates
date_discrepancies_df = merged_df[merged_df['date_goal'] != merged_df['date_match']]
date_discrepancies_df.head(30)

Unnamed: 0,date_goal,time_goal,home_team_goal,home_team_number_goal,away_team_goal,away_team_number_goal,goal_team,goal_time,result_home_team_goal,result_away_team_goal,...,date_match,time_match,home_team_match,home_team_number_match,result_home_team_match,result_away_team_match,away_team_match,away_team_number_match,season_match,day_match


In [357]:
# Groepeer de gegevens op 'Day' en controleer de datums
sorted_dates = match_results_df.sort_values(by=['day', 'date'])
date_discrepancies_df = sorted_dates.groupby('day')['date'].apply(lambda x: x.is_monotonic_increasing)

date_discrepancies_df = date_discrepancies_df[date_discrepancies_df == False]
date_discrepancies_df.head(30)


Series([], Name: date, dtype: bool)

In [358]:
# Sorting the data by season, day, and date
sorted_dates = match_results_df.sort_values(by=['season', 'day', 'date'])

# Group by 'season' and 'day', and check if dates are in chronological order within each group
grouped = sorted_dates.groupby(['season', 'day'])
date_order_check = grouped['date'].apply(lambda x: x.is_monotonic_increasing)

# Identifying seasons and days where dates are not in chronological order
date_discrepancies = date_order_check[date_order_check == False]
date_discrepancies.head(30)

Series([], Name: date, dtype: bool)

In [359]:
# Filter matches that ended 0-0
matches_0_0 = match_results_df[(match_results_df['result_home_team'] == 0) & (match_results_df['result_away_team'] == 0)]

# Get match_ids of 0-0 matches
match_ids_0_0 = matches_0_0['match_id']

# Check if these matches have any goals in goal_events.csv
goals_in_0_0_matches = goal_events_df[goal_events_df['match_id'].isin(match_ids_0_0)]
goals_in_0_0_matches.head(30)

Unnamed: 0,date,time,home_team,home_team_number,away_team,away_team_number,goal_team,goal_time,result_home_team,result_away_team,valid_goal,season,day,match_id,real_time_goal


In [360]:
# Splits de 'Goals' kolom in 'Goals_Scored' en 'Goals_Conceded'
standings_df[['Goals_Scored', 'Goals_Conceded']] = standings_df['Goals'].str.split(':', expand=True)
standings_df['Goals_Scored'] = pd.to_numeric(standings_df['Goals_Scored'])
standings_df['Goals_Conceded'] = pd.to_numeric(standings_df['Goals_Conceded'])

# Bereken de 'Goal_Difference'
standings_df['Calculated_Goal_Difference'] = standings_df['Goals_Scored'] - standings_df['Goals_Conceded']

# Sorteer elke groep en reset index
sorted_df = standings_df.sort_values(['Season', 'Day', 'Points', 'Wins', 'Calculated_Goal_Difference', 'Goals_Scored'], ascending=[True, True, False, False, False, False])

# Voeg een kolom toe voor de berekende rank
sorted_df['Calculated_Rank'] = sorted_df.groupby(['Season', 'Day']).cumcount() + 1

# Voeg de berekende rank toe aan de originele DataFrame
standings_df = standings_df.merge(sorted_df[['Season', 'Day', 'Club', 'Calculated_Rank']], on=['Season', 'Day', 'Club'])

# Controleer op discrepanties
discrepancies = standings_df[standings_df['Rank'] != standings_df['Calculated_Rank']]
discrepancies.head()


Unnamed: 0,Rank,Club,Club Number,Played,Wins,Draws,Losses,Goals,Goal_Difference,Points,Season,Day,Goals_Scored,Goals_Conceded,Calculated_Goal_Difference,Calculated_Rank
1,1,KAA Gent,7,1,1,0,0,4:1,3,3,2007,1,4,1,3,2
4,4,Club Brugge,3,1,1,0,0,2:1,1,3,2007,1,2,1,1,5
5,4,KSC Lokeren,KSC Lokeren,1,1,0,0,2:1,1,3,2007,1,2,1,1,6
7,7,RSC Anderlecht,35,1,1,0,0,1:0,1,3,2007,1,1,0,1,8
9,9,KSV Roeselare,134,1,0,1,0,1:1,0,1,2007,1,1,1,0,10
