In [100]:
# rerun if needed
# %pip install pandas matplotlib seaborn

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
data = pd.read_csv('merged_events.csv')
# all numeric columns
numeric_data = data.select_dtypes(include='number')
print(data.columns.tolist())

# check why the column of the event_time is mixed types
# for col in data.columns:
#     print(f"The type of column {col} is {data[col].dtype}")
#     
#     if col != "event_time":
#         continue
#     
#     for idx, row in enumerate(data[col]):
#         if not isinstance(row, int):
#             print(f"The type of row {row} (at index {idx}) is {type(row)}")

        

['fixture_id', 'event_time', 'team_id', 'event_type', 'detailed_type', 'main_player_id', 'secondary_player_id']


  data = pd.read_csv('merged_events.csv')


In [101]:
# make all the data in column event_time to be string
# Convert all values in the "event_time" column to strings
data['event_time'] = data['event_time'].astype(str)

In [102]:
# Count the number of missing values in each column
missing_values = data.isnull().sum()
# Filter and print only the columns with missing values greater than zero
non_zero_missing = missing_values[missing_values > 0]
print(non_zero_missing)
# check how much rows in the db
# num_rows = data.count()
# print(num_rows)

detailed_type               7
main_player_id          74065
secondary_player_id    162800
dtype: int64


In [103]:
# check that all the main player id missing is from types which not include players
# Create a set to store `detailed_type` values where `main_player_id` is empty
detailed_type_set = set()

# Iterate through the rows where `main_player_id` is NaN or empty
for idx, row in data.iterrows():
    if pd.isna(row['main_player_id']) or row['main_player_id'] == '':
        detailed_type_set.add(row['event_type'])

# Display the resulting set
print(detailed_type_set)

# Filter rows where `main_player_id` is missing
missing_main_player = data[data['main_player_id'].isna() | (data['main_player_id'] == '')]

# Count the occurrences of each `detailed_type` for missing `main_player_id`
missing_counts = missing_main_player['event_type'].value_counts()

# Print the results
for event_type, count in missing_counts.items():
    print(f"Missing values for event_type '{event_type}': {count}")


# after check randomly - the data is missing from the api 
# FormationSet,Possession,FormationChange is ok that we have missing values - it's not related to specific player
# card missing values we will drop
# subst and goal - it is importat to know that the event happe, but not who did it (at least for now) we will fill the main player as generic player with id 0



{'FormationSet', 'Possession', 'FormationChange', 'Goal', 'Card', 'subst'}
Missing values for event_type 'Possession': 63990
Missing values for event_type 'FormationChange': 6973
Missing values for event_type 'FormationSet': 2998
Missing values for event_type 'Card': 92
Missing values for event_type 'subst': 11
Missing values for event_type 'Goal': 1


In [104]:
# Create a copy of the original data to work on
updated_data = data.copy()

# Filter rows where `main_player_id` is missing
missing_main_player = updated_data[updated_data['main_player_id'].isna() | (updated_data['main_player_id'] == '')]

# Count the missing occurrences for each `event_type`
missing_counts = missing_main_player['event_type'].value_counts()
print("Missing value counts by `event_type`:")
print(missing_counts)

# Process the missing values based on your logic
for idx, row in updated_data.iterrows():
    if pd.isna(row['main_player_id']) or row['main_player_id'] == '':
        event_type = row['event_type']

        if event_type in {"FormationSet", "Possession", "FormationChange"}:
            # It's okay to have missing values, so do nothing
            continue

        elif event_type == "Card":
            # Drop rows with `event_type` as "card"
            updated_data.drop(index=idx, inplace=True)

        elif event_type in {"subst", "Goal"}:
            # Replace missing `main_player_id` with a generic player ID (0)
            updated_data.at[idx, 'main_player_id'] = 0

# Reset the index of the updated DataFrame after dropping rows
updated_data.reset_index(drop=True, inplace=True)

# Verify the results
print("Processing complete. Updated DataFrame:")
print(updated_data.head())

# Filter rows where `main_player_id` is missing
missing_main_player = updated_data[updated_data['main_player_id'].isna() | (updated_data['main_player_id'] == '')]

# Count the missing occurrences for each `event_type`
missing_counts = missing_main_player['event_type'].value_counts()
print("Missing main player value counts by `event_type`:")
print(missing_counts)


Missing value counts by `event_type`:
event_type
Possession         63990
FormationChange     6973
FormationSet        2998
Card                  92
subst                 11
Goal                   1
Name: count, dtype: int64
Processing complete. Updated DataFrame:
   fixture_id event_time  team_id event_type   detailed_type  main_player_id  \
0     1208127         20       40       Goal     Normal Goal         51617.0   
1     1208127         25       40      subst  Substitution 1        180317.0   
2     1208127         34       66       Card     Yellow Card         19170.0   
3     1208127         45       66      subst  Substitution 1         19191.0   
4     1208127         45       66       Card     Yellow Card          2926.0   

   secondary_player_id  
0                306.0  
1                283.0  
2                  NaN  
3              19192.0  
4                  NaN  
Missing main player value counts by `event_type`:
event_type
Possession         63990
FormationChange   

In [105]:
# Function to process the `event_time` column
def process_event_time(value):
    try:
        if value.isdigit():
            # If the value is a number as a string, convert it to an integer
            return int(value)
        # Check if the value is a negative number as a string
        elif value.lstrip('-').isdigit():
            return int(value)
        elif '+' in value:
            base, extra = value.split('+')
            if base == "90":
                # For "90+something", return the integer sum
                return int(base) + int(extra)
            elif base == "45":
                # For "45+something", return a float "45.something"
                return float(f"{base}.{extra}")
        else:
            raise ValueError(f"Unexpected format: {value}")
    except Exception as e:
        print(f"Error processing value '{value}': {e}")
        return None  # Return None for invalid cases

# Apply the function to the `event_time` column in the updated_data DataFrame
updated_data['event_time'] = updated_data['event_time'].apply(process_event_time)

# Verify the results
print("Processed DataFrame (updated_data):")
print(updated_data.head())


# Count the number of non-integer values in the event_time column
non_float_count = updated_data['event_time'].apply(lambda x: not isinstance(x, float)).sum()

print(f"Number of non-float values in the 'event_time' column: {non_float_count}")



Processed DataFrame (updated_data):
   fixture_id  event_time  team_id event_type   detailed_type  main_player_id  \
0     1208127        20.0       40       Goal     Normal Goal         51617.0   
1     1208127        25.0       40      subst  Substitution 1        180317.0   
2     1208127        34.0       66       Card     Yellow Card         19170.0   
3     1208127        45.0       66      subst  Substitution 1         19191.0   
4     1208127        45.0       66       Card     Yellow Card          2926.0   

   secondary_player_id  
0                306.0  
1                283.0  
2                  NaN  
3              19192.0  
4                  NaN  
Number of non-float values in the 'event_time' column: 0


In [106]:
# Create a dictionary to store counts of event types with `event_time < 0`
negative_event_time_dict = {}

# Iterate over rows where `event_time` is less than 0
for idx, row in updated_data.iterrows():
    if row['event_time'] < 0:
        event_type = row['event_type']
        # Increment the count for the event type in the dictionary
        if event_type in negative_event_time_dict:
            negative_event_time_dict[event_type] += 1
        else:
            negative_event_time_dict[event_type] = 1

# Print the dictionary
print(negative_event_time_dict)
print("Counts of rows with `event_time < 0` by `event_type`:")
for event_type, count in negative_event_time_dict.items():
    print(f"{event_type}: {count}")


# we will drop the rows if the type is card else we will check based on the number of problematic values
# Count the number of rows that match the condition
rows_to_delete = updated_data[(updated_data['event_time'] < 0) & (updated_data['event_type'] == "Card")].shape[0]

# Drop rows where `event_time` < 0 and `event_type` is "Card"
updated_data = updated_data[~((updated_data['event_time'] < 0) & (updated_data['event_type'] == "Card"))]

# Reset the index of the DataFrame after dropping rows
updated_data.reset_index(drop=True, inplace=True)

# Print the number of rows deleted
print(f"Number of rows deleted: {rows_to_delete}")

# check if it indeed delete
# Create a dictionary to store counts of event types with `event_time < 0`
negative_event_time_dict = {}

# Iterate over rows where `event_time` is less than 0
for idx, row in updated_data.iterrows():
    if row['event_time'] < 0:
        event_type = row['event_type']
        # Increment the count for the event type in the dictionary
        if event_type in negative_event_time_dict:
            negative_event_time_dict[event_type] += 1
        else:
            negative_event_time_dict[event_type] = 1

# Print the dictionary
print(negative_event_time_dict)
print("Counts of rows with `event_time < 0` by `event_type`:")
for event_type, count in negative_event_time_dict.items():
    print(f"{event_type}: {count}")

{'Card': 319}
Counts of rows with `event_time < 0` by `event_type`:
Card: 319
Number of rows deleted: 319
{}
Counts of rows with `event_time < 0` by `event_type`:


In [107]:
# every missing secondary player we will fill with the "default" player - 0
# Fill NaN values in the 'secondary_player' column with 0
updated_data['secondary_player_id'] = updated_data['secondary_player_id'].fillna(0)

# Verify the changes
print("Updated DataFrame with 'secondary_player_id' NaN values filled:")
print(updated_data.head())

Updated DataFrame with 'secondary_player_id' NaN values filled:
   fixture_id  event_time  team_id event_type   detailed_type  main_player_id  \
0     1208127        20.0       40       Goal     Normal Goal         51617.0   
1     1208127        25.0       40      subst  Substitution 1        180317.0   
2     1208127        34.0       66       Card     Yellow Card         19170.0   
3     1208127        45.0       66      subst  Substitution 1         19191.0   
4     1208127        45.0       66       Card     Yellow Card          2926.0   

   secondary_player_id  
0                306.0  
1                283.0  
2                  0.0  
3              19192.0  
4                  0.0  


In [108]:
# Step 1: Create a new column for possession_detailed_type
updated_data['possession_detailed_type'] = updated_data.apply(
    lambda row: row['detailed_type'] if row['event_type'] == 'Possession' else 0, axis=1
)

# Step 2: Update the `detailed_type` column to 0 where `event_type` is `Possession`
updated_data['detailed_type'] = updated_data.apply(
    lambda row: 0 if row['event_type'] == 'Possession' else row['detailed_type'], axis=1
)

# Display the updated DataFrame
print(updated_data.head())

   fixture_id  event_time  team_id event_type   detailed_type  main_player_id  \
0     1208127        20.0       40       Goal     Normal Goal         51617.0   
1     1208127        25.0       40      subst  Substitution 1        180317.0   
2     1208127        34.0       66       Card     Yellow Card         19170.0   
3     1208127        45.0       66      subst  Substitution 1         19191.0   
4     1208127        45.0       66       Card     Yellow Card          2926.0   

   secondary_player_id possession_detailed_type  
0                306.0                        0  
1                283.0                        0  
2                  0.0                        0  
3              19192.0                        0  
4                  0.0                        0  


In [109]:
# do one hot encoding to the event type columns
# Step 1: Perform One-Hot Encoding on the `event_type` column
event_type_encoded = pd.get_dummies(updated_data['event_type'], prefix='event_type')
# Step 2: Merge the one-hot encoded columns back to the original DataFrame
updated_data = pd.concat([updated_data, event_type_encoded], axis=1)
updated_data.drop(columns=['event_type'], inplace=True)
# convert in into int and not true/false
dummy_cols = [col for col in updated_data.columns if any(prefix in col for prefix in ['event_type_'])]
updated_data[dummy_cols] = updated_data[dummy_cols].astype(int)

# Display the updated DataFrame
print(updated_data.head())

   fixture_id  event_time  team_id   detailed_type  main_player_id  \
0     1208127        20.0       40     Normal Goal         51617.0   
1     1208127        25.0       40  Substitution 1        180317.0   
2     1208127        34.0       66     Yellow Card         19170.0   
3     1208127        45.0       66  Substitution 1         19191.0   
4     1208127        45.0       66     Yellow Card          2926.0   

   secondary_player_id possession_detailed_type  event_type_Card  \
0                306.0                        0                0   
1                283.0                        0                0   
2                  0.0                        0                1   
3              19192.0                        0                0   
4                  0.0                        0                1   

   event_type_FormationChange  event_type_FormationSet  event_type_Goal  \
0                           0                        0                1   
1                   

In [110]:
# do one hot encoding to the event type columns
# Step 1: Perform One-Hot Encoding on the `event_type` column
detailed_type_encoded = pd.get_dummies(updated_data['detailed_type'], prefix='detailed_type')
# Step 2: Merge the one-hot encoded columns back to the original DataFrame
updated_data = pd.concat([updated_data, detailed_type_encoded], axis=1)
updated_data.drop(columns=['detailed_type'], inplace=True)
# convert in into int and not true/false
dummy_cols = [col for col in updated_data.columns if any(prefix in col for prefix in ['detailed_type_'])]
updated_data[dummy_cols] = updated_data[dummy_cols].astype(int)

# Display the updated DataFrame
print(updated_data.head())

   fixture_id  event_time  team_id  main_player_id  secondary_player_id  \
0     1208127        20.0       40         51617.0                306.0   
1     1208127        25.0       40        180317.0                283.0   
2     1208127        34.0       66         19170.0                  0.0   
3     1208127        45.0       66         19191.0              19192.0   
4     1208127        45.0       66          2926.0                  0.0   

  possession_detailed_type  event_type_Card  event_type_FormationChange  \
0                        0                0                           0   
1                        0                0                           0   
2                        0                1                           0   
3                        0                0                           0   
4                        0                1                           0   

   event_type_FormationSet  event_type_Goal  ...  \
0                        0                1  .

In [112]:
# Identify columns with non-numeric values
non_numeric_columns = []

for col in updated_data.columns:
    if not pd.api.types.is_numeric_dtype(updated_data[col]):
        non_numeric_columns.append(col)

# Print the columns with non-numeric values
print("Columns with non-numeric values:")
for col in non_numeric_columns:
    print(col)


Columns with non-numeric values:
possession_detailed_type
event_type_FormationChange
event_type_FormationSet


In [113]:
# Identify and print non-numeric values in the `possession_detailed_type` column
non_numeric_values = updated_data['possession_detailed_type'][~updated_data['possession_detailed_type'].apply(lambda x: isinstance(x, (int, float)))]

print("Non-numeric values in `possession_detailed_type` column:")
print(non_numeric_values)


Non-numeric values in `possession_detailed_type` column:
228883    58.4
228884    41.6
228885    58.3
228886    41.7
228887    56.7
          ... 
302836    22.0
302837    81.0
302838    19.0
302839    83.8
302840    16.3
Name: possession_detailed_type, Length: 63990, dtype: object


In [114]:
# Function to convert values to float
def to_float(value):
    try:
        # Attempt to convert the value to float
        return float(value)
    except (ValueError, TypeError):
        # Handle non-numeric values gracefully
        return None  # Or you can set a default value like 0.0

# Apply the function to the `possession_detailed_type` column
updated_data['possession_detailed_type'] = updated_data['possession_detailed_type'].apply(to_float)

# Print the updated column to verify
print("Updated `possession_detailed_type` column:")
print(updated_data['possession_detailed_type'])


Updated `possession_detailed_type` column:
0          0.0
1          0.0
2          0.0
3          0.0
4          0.0
          ... 
302836    22.0
302837    81.0
302838    19.0
302839    83.8
302840    16.3
Name: possession_detailed_type, Length: 302841, dtype: float64


In [115]:
print(updated_data.columns.to_list())

['fixture_id', 'event_time', 'team_id', 'main_player_id', 'secondary_player_id', 'possession_detailed_type', 'event_type_Card', 'event_type_FormationChange', 'event_type_FormationSet', 'event_type_Goal', 'event_type_Possession', 'event_type_Var', 'event_type_subst', 'detailed_type_0', 'detailed_type_18', 'detailed_type_19', 'detailed_type_20', 'detailed_type_21', 'detailed_type_22', 'detailed_type_24', 'detailed_type_25', 'detailed_type_3', 'detailed_type_3-4-2-1', 'detailed_type_3-4-3', 'detailed_type_3-5-1-1', 'detailed_type_3-5-2', 'detailed_type_4-1-4-1', 'detailed_type_4-2-2-2', 'detailed_type_4-2-3-1', 'detailed_type_4-3-1-2', 'detailed_type_4-3-3', 'detailed_type_4-4-1-1', 'detailed_type_4-4-2', 'detailed_type_5', 'detailed_type_5-3-2', 'detailed_type_5-4-1', 'detailed_type_9', 'detailed_type_Card reviewed', 'detailed_type_Card upgrade', 'detailed_type_Goal Disallowed - Foul', 'detailed_type_Goal Disallowed - handball', 'detailed_type_Goal Disallowed - offside', 'detailed_type_G