In [36]:
# rerun if needed
# %pip install pandas matplotlib seaborn

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
data = pd.read_csv('fixtures.csv')
# all numeric columns
numeric_data = data.select_dtypes(include='number')
print(data.columns.tolist())

['fixture_id', 'league_id', 'league_name', 'home_team_id', 'home_team_name', 'away_team_id', 'away_team_name', 'goals_half_time_home', 'goals_half_time_away', 'goals_full_time_home', 'goals_full_time_away', 'goals_extra_time_home', 'goals_extra_time_away', 'goals_penalty_home', 'goals_penalty_away', 'league_round', 'expected_goals_home', 'shots_on_goal_home', 'shots_off_goal_home', 'shots_insidebox_home', 'shots_outsidebox_home', 'total_shots_home', 'blocked_shots_home', 'fouls_home', 'corner_kicks_home', 'offsides_home', 'ball_possession_home', 'yellow_cards_home', 'red_cards_home', 'goalkeeper_saves_home', 'total_passes_home', 'passes_accurate_home', 'passes_percentage_home', 'expected_goals_away', 'shots_on_goal_away', 'shots_off_goal_away', 'shots_insidebox_away', 'shots_outsidebox_away', 'total_shots_away', 'blocked_shots_away', 'fouls_away', 'corner_kicks_away', 'offsides_away', 'ball_possession_away', 'yellow_cards_away', 'red_cards_away', 'goalkeeper_saves_away', 'total_passes_

In [37]:
# Count the number of missing values in each column
missing_values = data.isnull().sum()
# Filter and print only the columns with missing values greater than zero
non_zero_missing = missing_values[missing_values > 0]
print(non_zero_missing)
# check how much rows in the db
# num_rows = data.count()
# print(num_rows)

goals_half_time_home       1086
goals_half_time_away       1086
goals_full_time_home       1085
goals_full_time_away       1085
goals_extra_time_home     19712
goals_extra_time_away     19712
goals_penalty_home        19713
goals_penalty_away        19713
expected_goals_home       16513
shots_on_goal_home         2976
shots_off_goal_home        2975
shots_insidebox_home       3073
shots_outsidebox_home      3075
total_shots_home           3073
blocked_shots_home         3110
fouls_home                 2975
corner_kicks_home          2975
offsides_home              3597
ball_possession_home       2975
yellow_cards_home          3460
red_cards_home            12460
goalkeeper_saves_home      3049
total_passes_home          3073
passes_accurate_home       3073
passes_percentage_home     3073
expected_goals_away       16513
shots_on_goal_away         2978
shots_off_goal_away        2977
shots_insidebox_away       3073
shots_outsidebox_away      3075
total_shots_away           3073
blocked_

In [38]:
# we don't need the coaches columns for now, so drop it
updated_data = data.drop(columns=['coach_home_team','coach_away_team'])

# check the goalkeeper saves missing values
filtered_fixtures = updated_data[updated_data['goalkeeper_saves_home'].isna() | updated_data['goalkeeper_saves_away'].isna()]

# Print the fixture IDs where both columns are missing
print("Fixture IDs where both goalkeeper_saves_home and goalkeeper_saves_away are missing:")
print(filtered_fixtures['fixture_id'].tolist())

# seems like we need to fill with 0
# Fill missing values in the specified columns with 0
updated_data['goalkeeper_saves_home'] = updated_data['goalkeeper_saves_home'].fillna(0)
updated_data['goalkeeper_saves_away'] = updated_data['goalkeeper_saves_away'].fillna(0)

missing_values = updated_data.isnull().sum()
non_zero_missing = missing_values[missing_values > 0]
print(non_zero_missing)

missing_fixtures_with_red_cards_data = []
missing_fixtures_with_yellow_cards_data = []
missing_fixtures_with_offsides_data = []

missing_values_lsts= [missing_fixtures_with_red_cards_data,missing_fixtures_with_yellow_cards_data,missing_fixtures_with_offsides_data]

# columns_to_check = ['offsides', 'yellow_cards', 'red_cards']
# for column in columns_to_check:
#     if updated_data[f'{column}_home'].isna().any() and updated_data[f'{column}_away'].isna().any():
#         lst_name = f"missing_fixtures_with_{column}_data"
#         missing_values_lsts[columns_to_check.index(column)].append(column)
# Create boolean masks for the missing values for each of the specified column combinations
yellow_cards_home_missing = updated_data['yellow_cards_home'].isna()
yellow_cards_away_missing = updated_data['yellow_cards_away'].isna()

red_cards_home_missing = updated_data['red_cards_home'].isna()
red_cards_away_missing = updated_data['red_cards_away'].isna()

offsides_home_missing = updated_data['offsides_home'].isna()
offsides_away_missing = updated_data['offsides_away'].isna()

# Combination 1: Check if both 'yellow_cards_home' and 'yellow_cards_away' are missing
yellow_cards_combination_missing = updated_data[
    yellow_cards_home_missing & yellow_cards_away_missing
]

# Combination 2: Check if both 'red_cards_home' and 'red_cards_away' are missing
red_cards_combination_missing = updated_data[
    red_cards_home_missing & red_cards_away_missing
]

# Combination 3: Check if both 'offsides_home' and 'offsides_away' are missing
offsides_combination_missing = updated_data[
    offsides_home_missing & offsides_away_missing
]

# Count the fixtures where both columns in each combination have missing values
yellow_cards_combination_fixtures = yellow_cards_combination_missing['fixture_id'].unique()
red_cards_combination_fixtures = red_cards_combination_missing['fixture_id'].unique()
offsides_combination_fixtures = offsides_combination_missing['fixture_id'].unique()

# Print the fixture IDs for each combination
print(f"Fixtures with missing yellow cards (home and away): {yellow_cards_combination_fixtures.tolist()}")
print(f"Fixtures with missing red cards (home and away): {red_cards_combination_fixtures.tolist()}")
print(f"Fixtures with missing offsides (home and away): {offsides_combination_fixtures.tolist()}")

# Count the number of fixtures with missing data in each combination
print(f"Number of fixtures with missing yellow cards (home and away): {len(yellow_cards_combination_fixtures)}")
print(f"Number of fixtures with missing red cards (home and away): {len(red_cards_combination_fixtures)}")
print(f"Number of fixtures with missing offsides (home and away): {len(offsides_combination_fixtures)}")
# Count the number of missing values in each column
missing_values = updated_data.isnull().sum()
# Filter and print only the columns with missing values greater than zero
non_zero_missing = missing_values[missing_values > 0]
print(non_zero_missing)


Fixture IDs where both goalkeeper_saves_home and goalkeeper_saves_away are missing:
[193232, 193071, 1208170, 194214, 194212, 1208139, 1208133, 1208137, 1208134, 1208136, 1208135, 1208140, 1208142, 1208138, 1208141, 1208144, 1208152, 1208146, 1208143, 1208149, 1208151, 1208148, 1208150, 1208145, 1208147, 1208158, 1208159, 1208161, 1208162, 1208156, 1208160, 1208154, 1208155, 1208157, 1208153, 1208166, 1208165, 1208164, 1208163, 1208167, 1208169, 1208168, 1208171, 1208172, 1208179, 1208182, 1208177, 1208174, 1208180, 1208175, 1208178, 1208181, 1208176, 1208173, 1208183, 1208192, 1208184, 1208188, 1208185, 1208190, 1208187, 1208186, 1208189, 1208191, 1208198, 1208199, 1208193, 1208201, 1208196, 1208200, 1208202, 1208197, 1208195, 1208194, 1208209, 1208207, 1208206, 1208211, 1208205, 1208212, 1208208, 1208203, 1208210, 1208204, 1208221, 1208213, 1208220, 1208219, 1208216, 1208214, 1208215, 1208217, 1208218, 1208222, 1208229, 1208230, 1208224, 1208228, 1208231, 1208225, 1208227, 1208223, 1

In [39]:
# List of columns you have already checked (you don't want them in the missing values summary)
checked_columns = [
    'yellow_cards_home', 'yellow_cards_away',
    'red_cards_home', 'red_cards_away',
    'offsides_home', 'offsides_away','goals_extra_time_home','goals_extra_time_away','goals_penalty_home','goals_penalty_away'
]

# Exclude the checked columns from the missing values summary
missing_values = updated_data.drop(columns=checked_columns).isnull().sum()

# Filter and print only the columns with missing values greater than zero
non_zero_missing = missing_values[missing_values > 0]

# Print the fixture IDs for the missing 'goalkeeper_saves_home' and 'goalkeeper_saves_away' (if you haven't already checked)
goalkeeper_saves_combination_missing = updated_data[
    updated_data['goalkeeper_saves_home'].isna() & updated_data['goalkeeper_saves_away'].isna()
]
goalkeeper_saves_combination_fixtures = goalkeeper_saves_combination_missing['fixture_id'].unique()

# Output the fixture IDs
print("Fixture IDs where both goalkeeper_saves_home and goalkeeper_saves_away are missing:")
print(goalkeeper_saves_combination_fixtures.tolist())

# Output the missing value summary, excluding the already checked columns
print("\nMissing values summary (after excluding checked columns):")
print(non_zero_missing)


Fixture IDs where both goalkeeper_saves_home and goalkeeper_saves_away are missing:
[]

Missing values summary (after excluding checked columns):
goals_half_time_home       1086
goals_half_time_away       1086
goals_full_time_home       1085
goals_full_time_away       1085
expected_goals_home       16513
shots_on_goal_home         2976
shots_off_goal_home        2975
shots_insidebox_home       3073
shots_outsidebox_home      3075
total_shots_home           3073
blocked_shots_home         3110
fouls_home                 2975
corner_kicks_home          2975
ball_possession_home       2975
total_passes_home          3073
passes_accurate_home       3073
passes_percentage_home     3073
expected_goals_away       16513
shots_on_goal_away         2978
shots_off_goal_away        2977
shots_insidebox_away       3073
shots_outsidebox_away      3075
total_shots_away           3073
blocked_shots_away         3112
fouls_away                 2977
corner_kicks_away          2977
ball_possession_away  

In [40]:
# Check data types of each column
print("Column Data Types:")
print(updated_data.dtypes)

# Identify columns that are not numeric
non_numeric_columns = updated_data.select_dtypes(exclude=['number']).columns
print("\nNon-Numeric Columns:")
print(non_numeric_columns)

# Inspect non-numeric columns
for col in non_numeric_columns:
    print(f"\nValues in Column '{col}':")
    print(updated_data[col].unique())


Column Data Types:
fixture_id                  int64
league_id                   int64
league_name                object
home_team_id                int64
home_team_name             object
away_team_id                int64
away_team_name             object
goals_half_time_home      float64
goals_half_time_away      float64
goals_full_time_home      float64
goals_full_time_away      float64
goals_extra_time_home     float64
goals_extra_time_away     float64
goals_penalty_home        float64
goals_penalty_away        float64
league_round               object
expected_goals_home       float64
shots_on_goal_home        float64
shots_off_goal_home       float64
shots_insidebox_home      float64
shots_outsidebox_home     float64
total_shots_home          float64
blocked_shots_home        float64
fouls_home                float64
corner_kicks_home         float64
offsides_home             float64
ball_possession_home      float64
yellow_cards_home         float64
red_cards_home            flo

In [41]:
# Drop the specified columns from updated_data
columns_to_drop = [
    'league_name', 'home_team_name', 'away_team_name',
    'league_round', 'formation_home', 'formation_away', 'goals_extra_time_home','goals_extra_time_away', 'goals_penalty_home', 'goals_penalty_away'
]

# Ensure the columns exist in updated_data before dropping
updated_data = updated_data.drop(columns=[col for col in columns_to_drop if col in updated_data.columns])

# Print the list of remaining column names
print("Remaining columns:")
print(len(updated_data.columns.tolist()))
print(len(updated_data))
# Count the number of missing values in each column
missing_values = updated_data.isnull().sum()
# Filter and print only the columns with missing values greater than zero
non_zero_missing = missing_values[missing_values > 0]
print(non_zero_missing)


Remaining columns:
46
19715
goals_half_time_home       1086
goals_half_time_away       1086
goals_full_time_home       1085
goals_full_time_away       1085
expected_goals_home       16513
shots_on_goal_home         2976
shots_off_goal_home        2975
shots_insidebox_home       3073
shots_outsidebox_home      3075
total_shots_home           3073
blocked_shots_home         3110
fouls_home                 2975
corner_kicks_home          2975
offsides_home              3597
ball_possession_home       2975
yellow_cards_home          3460
red_cards_home            12460
total_passes_home          3073
passes_accurate_home       3073
passes_percentage_home     3073
expected_goals_away       16513
shots_on_goal_away         2978
shots_off_goal_away        2977
shots_insidebox_away       3073
shots_outsidebox_away      3075
total_shots_away           3073
blocked_shots_away         3112
fouls_away                 2977
corner_kicks_away          2977
offsides_away              3599
ball_possess

In [42]:
# Set the correlation threshold
correlation_threshold = 0.75

# Columns to include in the correlation matrix
columns_to_include = [
    'fixture_id', 'league_id', 'home_team_id', 'away_team_id',
    'goals_half_time_home', 'goals_half_time_away', 'goals_full_time_home', 'goals_full_time_away',
    'expected_goals_home', 'shots_on_goal_home', 'shots_off_goal_home', 'shots_insidebox_home',
    'shots_outsidebox_home', 'total_shots_home', 'blocked_shots_home', 'fouls_home',
    'corner_kicks_home', 'offsides_home', 'ball_possession_home', 'yellow_cards_home',
    'red_cards_home', 'goalkeeper_saves_home', 'total_passes_home', 'passes_accurate_home',
    'passes_percentage_home', 'expected_goals_away', 'shots_on_goal_away', 'shots_off_goal_away',
    'shots_insidebox_away', 'shots_outsidebox_away', 'total_shots_away', 'blocked_shots_away',
    'fouls_away', 'corner_kicks_away', 'offsides_away', 'ball_possession_away', 'yellow_cards_away',
    'red_cards_away', 'goalkeeper_saves_away', 'total_passes_away', 'passes_accurate_away',
    'passes_percentage_away'
]

# Ensure the selected columns exist in updated_data
filtered_data = updated_data[columns_to_include]

# Calculate the correlation matrix
correlation_matrix = filtered_data.corr()

# Stack the correlation matrix to get pairs of columns with correlation values
high_correlation_pairs = (
    correlation_matrix
    .stack()  # Convert the correlation matrix to a long format
    .reset_index()  # Convert to a DataFrame
    .rename(columns={0: 'correlation', 'level_0': 'column_1', 'level_1': 'column_2'})  # Rename columns
)

# Filter for correlations above the threshold and exclude self-correlations
high_correlation_pairs = high_correlation_pairs[
    (high_correlation_pairs['correlation'].abs() >= correlation_threshold) &
    (high_correlation_pairs['column_1'] != high_correlation_pairs['column_2'])
]

# Remove duplicate pairs (e.g., (A, B) and (B, A))
high_correlation_pairs = high_correlation_pairs[
    high_correlation_pairs['column_1'] < high_correlation_pairs['column_2']
]

# Exclude correlation rate 1.0 or -1.0 and 'fixture_id'
filtered_high_correlation_pairs = high_correlation_pairs[
    (high_correlation_pairs['correlation'] != 1.0) &
    (high_correlation_pairs['correlation'] != -1.0) &
    (~high_correlation_pairs['column_1'].str.contains('fixture_id')) &
    (~high_correlation_pairs['column_2'].str.contains('fixture_id'))
]

# Convert to a list of tuples with the correlation rate included
correlation_list_with_rate = list(
    filtered_high_correlation_pairs[['column_1', 'column_2', 'correlation']].itertuples(index=False, name=None)
)

# Display the list
print("High correlation pairs (>|0.75|) without rate 1.0 or -1.0 and excluding 'fixture_id':")
for pair in correlation_list_with_rate:
    print(f"{pair[0]} and {pair[1]} - Correlation: {pair[2]:.2f}")

High correlation pairs (>|0.75|) without rate 1.0 or -1.0 and excluding 'fixture_id':
home_team_id and league_id - Correlation: 0.82
away_team_id and league_id - Correlation: 0.82
shots_insidebox_home and total_shots_home - Correlation: 0.86
ball_possession_home and total_passes_home - Correlation: 0.88
ball_possession_home and passes_accurate_home - Correlation: 0.86
ball_possession_home and total_passes_away - Correlation: -0.88
ball_possession_home and passes_accurate_away - Correlation: -0.86
goalkeeper_saves_home and shots_on_goal_away - Correlation: 0.85
passes_accurate_home and total_passes_home - Correlation: 0.99
passes_accurate_home and passes_percentage_home - Correlation: 0.86
passes_percentage_home and total_passes_home - Correlation: 0.79
shots_insidebox_away and total_shots_away - Correlation: 0.86
ball_possession_away and ball_possession_home - Correlation: -1.00
ball_possession_away and total_passes_home - Correlation: -0.88
ball_possession_away and passes_accurate_hom