## This Notebook Cleans our Data

### This is quite a lot of data so it may take some time to complete depending on hardware resources

### I would recommend keeping only this notebook open and closing all other applications while it runs, unless your desktop has good specs

#### Feel free to change the values to whatever you want if you are experimenting with removing other data, like _std, _5_last_games, etc.


In [None]:
import numpy as np
import pandas as pd
import os
import gc

In [None]:
# Function to load one CSV file and apply the shared cleaning steps
def load_and_clean_data(file_path, max_null_fraction=0.20):
    """Load a CSV dataset and apply the standard cleaning steps.

    Steps, in order:
      1. Drop every column whose name ends in ``_std``, ``_season_average``
         or ``5_last_match_sum``.
      2. Drop every row in which more than ``max_null_fraction`` of the
         remaining columns are missing.
      3. Fill the missing values that are left with 0.

    Args:
        file_path: Path of the CSV file to read.
        max_null_fraction: Largest share of missing values (0.0 to 1.0) a row
            may contain and still be kept. Defaults to 0.20, i.e. a row needs
            at least 80% of its columns populated.

    Returns:
        The cleaned DataFrame, or None when ``file_path`` does not exist
        (an error message is printed in that case).
    """
    # Only the read itself can raise FileNotFoundError, so keep the try small
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError as e:
        print(f"Error loading {file_path}: {e}")
        return None

    dropped_columns = df.filter(regex=r'_std$|_season_average$|5_last_match_sum$').columns
    df = df.drop(columns=dropped_columns)

    # dropna(thresh=...) expects the minimum number of NON-null values a row
    # must have to survive, so convert the allowed share of nulls into that
    # count. Passing 20% of the column count directly would keep rows that
    # are up to 80% empty.
    n_columns = len(df.columns)
    max_nulls = int(max_null_fraction * n_columns)
    df = df.dropna(thresh=n_columns - max_nulls)

    # Fill remaining missing values with zero
    return df.fillna(0)

In [None]:
# Function to reshape player data (convert multiple rows per ID into a single row)
def reshape_player_data(player_df):
    """Convert player stats from multiple rows per ID to a single row per ID.

    Rows sharing an ID are numbered 1, 2, 3, ... in their existing order, and
    every stat column is spread into one column per number, named
    ``<column>_P<number>``. IDs with fewer rows than the largest group get
    NaN in the columns they cannot fill.

    Args:
        player_df: DataFrame with an ``ID`` column and one row per player,
            or None.

    Returns:
        A new DataFrame with one row per ID (``ID`` is a regular column), or
        None when ``player_df`` is None. The input frame is left unmodified.
    """
    if player_df is None:
        return None

    # Number each player's row within its ID. assign() works on a copy, so the
    # caller's DataFrame does not silently gain a "player_number" column.
    numbered = player_df.assign(
        player_number=player_df.groupby("ID").cumcount() + 1
    )

    # Reshape using pivot (each player gets a numbered column); without an
    # explicit `values` argument every remaining column is spread out.
    wide = numbered.pivot(index="ID", columns="player_number")

    # Flatten the (column, player_number) MultiIndex into "<column>_P<number>"
    wide.columns = [f"{stat}_P{number}" for stat, number in wide.columns]

    # Reset index so ID is a column again
    return wide.reset_index()

In [None]:
# Function to drop unwanted columns from an already merged dataset
def clean_data_V2(df):
    """Drop the columns that should not be part of the final dataset.

    Removes every column whose name ends in ``P15`` through ``P27`` and every
    column whose name starts with ``POSITION``, ``TEAM_NAME``, ``LEAGUE`` or
    ``PLAYER_NAME``.

    Args:
        df: DataFrame to clean, or None.

    Returns:
        A new DataFrame without the matching columns, or None when ``df`` is
        None. The input frame is not modified.
    """
    # No file is opened here, so there is nothing to catch; just guard
    # against a missing frame.
    if df is None:
        return None

    unwanted = df.filter(
        regex=r'P2[0-7]$|^POSITION|P1[5-9]$|^TEAM_NAME|^LEAGUE|^PLAYER_NAME'
    ).columns
    return df.drop(columns=unwanted)

In [None]:
# Function to build one merged team + player file from a pair of input CSVs
def merge_team_and_players(team_path, player_path, output_path):
    """Combine a team dataset with its player dataset and write the result.

    Both CSVs go through load_and_clean_data, the player rows are widened
    with reshape_player_data, and the result is left-joined onto the team
    rows by ``ID`` (so every team row is kept). clean_data_V2 is applied
    before the frame is written to ``output_path`` without its index.

    Nothing is written when either input file could not be loaded.
    Returns None in every case.
    """
    print(f"Processing and saving: {output_path}")

    # Load both inputs up front; each call reports its own missing file
    teams = load_and_clean_data(team_path)
    players = load_and_clean_data(player_path)

    if any(frame is None for frame in (teams, players)):
        print(f"Skipping {output_path} due to missing data.")
        return

    # One row per ID for the players, then attach them to the team rows
    players_wide = reshape_player_data(players)
    combined = pd.merge(teams, players_wide, how='left', on='ID')

    final_df = clean_data_V2(combined)

    # Write the result and report its size
    final_df.to_csv(output_path, index=False)
    n_rows, n_cols = final_df.shape
    print(f"Saved {output_path} ({n_rows} rows, {n_cols} columns)")

    # Release the large frames before the next dataset is processed
    del teams, players, players_wide, combined, final_df
    gc.collect()

In [None]:
# Define dataset paths
# Keys follow "<split>_<side>_<kind>", e.g. "train_home_team"; every file sits
# in "<Split>_Data" and is named "<split>_<side>_<kind>_statistics_df.csv".
data_paths = {
    f"{split}_{side}_{kind}": (
        f"C:/Path/To/Data/{split.capitalize()}_Data/"
        f"{split}_{side}_{kind}_statistics_df.csv"
    )
    for split in ("train", "test")
    for side in ("home", "away")
    for kind in ("team", "player")
}

In [None]:
# Clean, merge and save every dataset
# Make sure the destination folder exists before anything is written
output_dir = "C:/Path/To/Data/cleanedData/TestingCleanedDataGroupTest"
os.makedirs(output_dir, exist_ok=True)

# Home datasets first, then away; within each side train comes before test.
# Each combination is written to its own "<split>_merged_<side>.csv" file.
for side in ("home", "away"):
    for split in ("train", "test"):
        merge_team_and_players(
            data_paths[f"{split}_{side}_team"],
            data_paths[f"{split}_{side}_player"],
            os.path.join(output_dir, f"{split}_merged_{side}.csv"),
        )

print("All datasets merged and saved successfully!")