In [29]:
import pandas as pd

# Read the three unprocessed CSV exports, in order: matches, goal events, standings.
match_results_df, goal_events_df, standings_df = map(pd.read_csv, (
    'csv/wedstrijden_onverwerkd.csv',
    'csv/doelpunten_onverwerkd.csv',
    'csv/klassement_onverwerkd.csv',
))

# From here on, work only with goal events whose 'valid_goal' flag equals True.
goal_events_df = goal_events_df.loc[lambda events: events['valid_goal'] == True]

In [30]:
# Count the goal events per match, separately for the home and the away side.
# NOTE(review): the recorded run of this cell stopped with KeyError: 'goal_team',
# so the goal-events data apparently has no 'goal_team' column -- confirm the
# actual column name before relying on this check.
goals_home_team = (
    goal_events_df
    .loc[lambda events: events['goal_team'] == events['home_team']]
    .groupby('match_id')
    .size()
)
goals_away_team = (
    goal_events_df
    .loc[lambda events: events['goal_team'] == events['away_team']]
    .groupby('match_id')
    .size()
)

# Convert both count Series into DataFrames with a named count column.
goals_home_team_df = goals_home_team.reset_index(name='calculated_goals_home_team')
goals_away_team_df = goals_away_team.reset_index(name='calculated_goals_away_team')

# Outer-join the counts onto the match results, then replace the NaN counts
# (rows without a counted goal for that side) with 0.
merged_df = (
    match_results_df
    .merge(goals_home_team_df, on='match_id', how='outer')
    .merge(goals_away_team_df, on='match_id', how='outer')
    .fillna({'calculated_goals_home_team': 0, 'calculated_goals_away_team': 0})
)

# Keep the rows where a calculated count differs from the recorded result.
discrepancies_goals = merged_df.loc[
    merged_df['calculated_goals_home_team'].ne(merged_df['result_home_team'])
    | merged_df['calculated_goals_away_team'].ne(merged_df['result_away_team'])
]

# Show the first 50 mismatching rows (all columns).
discrepancies_goals.head(50)

KeyError: 'goal_team'

In [None]:
# Check of the number of matches played ('Played') against the matchday ('Day').
# Expectation: no record has more matches played than there have been matchdays,
# so this selection should come back empty.
# The columns are addressed as 'Day' and 'Played', the names every other check
# on standings_df in this notebook uses (instead of 'dag' / 'aantal_wedstrijden').
number_of_games_check = standings_df[standings_df['Day'] < standings_df['Played']]
number_of_games_check.head(30)

In [None]:
# Check of the number of matches played, opposite direction.
# There are records with fewer matches played than matchdays, but a spot check
# of a number of those records showed they are correct.
controle_aantal_wedstrijden = standings_df.loc[
    standings_df['Day'].gt(standings_df['Played'])
]
# Number of such records per season.
controle_aantal_wedstrijden['Season'].value_counts()

In [None]:
# Expectation: no records where Played differs from Wins + Losses + Draws.
controle_aantal_wedstrijden = standings_df.loc[
    standings_df['Played'].ne(
        standings_df['Wins'] + standings_df['Losses'] + standings_df['Draws']
    )
]
# Non-null count per column of the mismatching records.
controle_aantal_wedstrijden.count()

In [None]:
# Break the 'Goals' text apart on ':' into goals scored and goals conceded,
# and convert both new columns to numbers.
standings_df[['Goals_Scored', 'Goals_Conceded']] = (
    standings_df['Goals'].str.split(':', expand=True)
)
for goal_col in ('Goals_Scored', 'Goals_Conceded'):
    standings_df[goal_col] = pd.to_numeric(standings_df[goal_col])

# Expectation: no records where Goals_Scored - Goals_Conceded differs from
# the stored Goal_Difference.
standings_df['Calculated_Goal_Difference'] = (
    standings_df['Goals_Scored'].sub(standings_df['Goals_Conceded'])
)
controle_aantal_wedstrijden = standings_df.loc[
    standings_df['Calculated_Goal_Difference'].ne(standings_df['Goal_Difference'])
]
# Non-null count per column of the mismatching records.
controle_aantal_wedstrijden.count()

In [None]:
# Find the rows whose 'Goals' value contains no ':'
# (author's note: season 1964 + from 1995 onwards).
controle_klassement = standings_df.loc[standings_df['Goals'].str.count(':').eq(0)]
controle_klassement.head()

In [None]:
# Sum Wins / Draws / Losses for every (Season, Day, Club Number, Points)
# combination, flatten the group keys back into columns, and order the result
# by season and day (ascending) and then by wins, draws and losses (descending).
points_df = (
    standings_df
    .groupby(['Season', 'Day', 'Club Number', 'Points'])[['Wins', 'Draws', 'Losses']]
    .sum()
    .reset_index()
    .sort_values(
        ['Season', 'Day', 'Wins', 'Draws', 'Losses'],
        ascending=[True, True, False, False, False],
    )
)
points_df.head(30)

In [None]:
# Recompute the points as 3 per win, 1 per draw and 0 per loss, and list the
# rows where the stored 'Points' differ from that recomputation.
# NOTE(review): the 3-points-per-win rule is applied to every season in the
# data -- confirm that it actually holds for all of them.
points_df['Points_Check'] = (
    points_df['Wins'].mul(3) + points_df['Draws'] + points_df['Losses'].mul(0)
)
discrepancies_df = points_df.loc[points_df['Points'].ne(points_df['Points_Check'])]
discrepancies_df.head(30)

In [None]:
# Parse the 'date' column of the match results and of the goal events as datetimes.
for frame in (match_results_df, goal_events_df):
    frame['date'] = pd.to_datetime(frame['date'])

# Join every goal event with its match on 'match_id'; overlapping column names
# (such as 'date') get the '_goal' / '_match' suffix.
merged_df = goal_events_df.merge(
    match_results_df,
    on='match_id',
    suffixes=('_goal', '_match'),
)

# Goal events whose date differs from the date of the match they belong to.
date_discrepancies_df = merged_df.loc[merged_df['date_goal'].ne(merged_df['date_match'])]
date_discrepancies_df.head(30)

In [None]:
# Group the matches by 'day' and check, per group, whether the dates are in
# non-decreasing order.
# The rows are deliberately NOT sorted by 'date' beforehand: sorting first makes
# every group monotonic by construction, so the check could never flag
# out-of-order dates. groupby keeps the rows of each group in the order in which
# they appear in match_results_df, so that order is what gets checked here.
date_order_by_day = (
    match_results_df
    .groupby('day')['date']
    .apply(lambda dates: dates.is_monotonic_increasing)
    .astype(bool)
)

# Keep only the days whose dates are not in non-decreasing order.
date_discrepancies_df = date_order_by_day[~date_order_by_day]
date_discrepancies_df.head(30)


In [None]:
# Group the matches by 'season' and 'day' and check, per group, whether the
# dates are in non-decreasing order.
# The rows are deliberately NOT sorted by 'date' beforehand: sorting first makes
# every group monotonic by construction, so the check could never flag
# out-of-order dates. groupby keeps the rows of each group in the order in which
# they appear in match_results_df, so that order is what gets checked here.
grouped = match_results_df.groupby(['season', 'day'])
date_order_check = (
    grouped['date']
    .apply(lambda dates: dates.is_monotonic_increasing)
    .astype(bool)
)

# Keep only the (season, day) groups whose dates are not in non-decreasing order.
date_discrepancies = date_order_check[~date_order_check]
date_discrepancies.head(30)

In [None]:
# Select the matches whose recorded result is 0-0.
matches_0_0 = match_results_df.loc[
    match_results_df['result_home_team'].eq(0)
    & match_results_df['result_away_team'].eq(0)
]

# The ids of those matches.
match_ids_0_0 = matches_0_0['match_id']

# Goal events in goal_events_df that belong to one of the 0-0 matches.
goals_in_0_0_matches = goal_events_df.loc[goal_events_df['match_id'].isin(match_ids_0_0)]
goals_in_0_0_matches.head(30)

In [None]:
# Split the 'Goals' column on ':' into 'Goals_Scored' and 'Goals_Conceded'
# and convert both to numbers.
standings_df[['Goals_Scored', 'Goals_Conceded']] = standings_df['Goals'].str.split(':', expand=True)
standings_df['Goals_Scored'] = pd.to_numeric(standings_df['Goals_Scored'])
standings_df['Goals_Conceded'] = pd.to_numeric(standings_df['Goals_Conceded'])

# Compute the goal difference from the two split columns.
standings_df['Calculated_Goal_Difference'] = standings_df['Goals_Scored'] - standings_df['Goals_Conceded']

# Order the rows per season and day (ascending) by points, wins, goal
# difference and goals scored (all descending).
sorted_df = standings_df.sort_values(
    ['Season', 'Day', 'Points', 'Wins', 'Calculated_Goal_Difference', 'Goals_Scored'],
    ascending=[True, True, False, False, False, False],
)

# The calculated rank is the 1-based position inside each (Season, Day) group.
sorted_df['Calculated_Rank'] = sorted_df.groupby(['Season', 'Day']).cumcount() + 1

# Copy the calculated rank back onto standings_df. sort_values keeps the
# original index labels, so a plain column assignment lines the ranks up with
# their rows. Unlike merging on (Season, Day, Club), this can be re-run without
# producing 'Calculated_Rank_x' / 'Calculated_Rank_y' columns and cannot
# duplicate rows when that key combination is not unique.
# Assumes standings_df has unique index labels (the default after read_csv).
standings_df['Calculated_Rank'] = sorted_df['Calculated_Rank']

# Rows whose stored 'Rank' differs from the calculated rank.
discrepancies = standings_df[standings_df['Rank'] != standings_df['Calculated_Rank']]
discrepancies.head()
