In [7]:
import pandas as pd
import glob
import os # Import the os module to handle paths correctly

def list_all_csv_columns():
    """
    Reads all CSV files in the current directory and prints their column names.
    """
    # Get a list of all CSV files provided by the user
    # We'll hardcode the list based on the uploaded files
    # to ensure we only read the ones relevant to this task.
    
    # Base directory where the files are located
    base_path = "../../data/raw/"
    
    original_filenames = [
        "Player_Performance_raw.csv",
        "raw_player_context.csv",
        "raw_player_salaries.csv",
        "raw_salary_caps.csv",
        "nba_player_popularity.csv",
        "nba_stadiums.csv",
        "Owner Net Worth in Billions .csv"
    ]
    
    # Create the full file paths by joining the base path and the filename
    filenames = [os.path.join(base_path, f) for f in original_filenames]

    print("--- Listing Columns for Each Dataset ---")
    print(f"--- Looking in directory: {os.path.abspath(base_path)} ---") # Added to show the full path it's trying

    for filename in filenames:
        try:
            # Use skipinitialspace=True to handle spaces after delimiters,
            # which can mess up column names.
            df = pd.read_csv(filename, skipinitialspace=True)
            
            # Clean up column names by stripping leading/trailing whitespace
            df.columns = df.columns.str.strip()
            
            print(f"\nüìÑ File: {filename}") # This will now print the full path
            print("Columns:")
            for col in df.columns:
                print(f"  - {col}")
                
        except FileNotFoundError:
            print(f"\n‚ùå ERROR: File not found: {filename}")
        except Exception as e:
            print(f"\n‚ùå ERROR: Could not read {filename}. Reason: {e}")

if __name__ == "__main__":
    list_all_csv_columns()

--- Listing Columns for Each Dataset ---
--- Looking in directory: c:\Users\tyler\School\Learn Statistics\STA 160\Project\data\raw ---

üìÑ File: ../../data/raw/Player_Performance_raw.csv
Columns:
  - PLAYER_ID
  - PLAYER_NAME
  - TEAM_ID
  - E_OFF_RATING
  - OFF_RATING
  - sp_work_OFF_RATING
  - E_DEF_RATING
  - DEF_RATING
  - sp_work_DEF_RATING
  - E_NET_RATING
  - NET_RATING
  - sp_work_NET_RATING
  - AST_PCT
  - AST_TO
  - AST_RATIO
  - OREB_PCT
  - DREB_PCT
  - REB_PCT
  - TM_TOV_PCT
  - E_TOV_PCT
  - EFG_PCT
  - TS_PCT
  - USG_PCT
  - E_USG_PCT
  - E_PACE
  - PACE
  - PACE_PER40
  - sp_work_PACE
  - PIE
  - POSS
  - FGM_PG
  - FGA_PG
  - E_OFF_RATING_RANK
  - OFF_RATING_RANK
  - sp_work_OFF_RATING_RANK
  - E_DEF_RATING_RANK
  - DEF_RATING_RANK
  - sp_work_DEF_RATING_RANK
  - E_NET_RATING_RANK
  - NET_RATING_RANK
  - sp_work_NET_RATING_RANK
  - AST_PCT_RANK
  - AST_TO_RANK
  - AST_RATIO_RANK
  - OREB_PCT_RANK
  - DREB_PCT_RANK
  - REB_PCT_RANK
  - TM_TOV_PCT_RANK
  - E_TOV_PCT_RA

In [8]:
import pandas as pd
from pathlib import Path
import sys
import os
import re
import unicodedata

# --- File Configuration ---
BASE_PATH = '../../data/raw/' # Set the base path for all files
Path(BASE_PATH).mkdir(parents=True, exist_ok=True) # Ensure data/raw directory exists just in case

# Source files (Originals)
STATS_FILE = os.path.join(BASE_PATH, 'Player_Performance_raw.csv')
CONTEXT_FILE = os.path.join(BASE_PATH, 'raw_player_context.csv')
SALARY_FILE = os.path.join(BASE_PATH, 'raw_player_salaries.csv')
POPULARITY_FILE = os.path.join(BASE_PATH, 'nba_player_popularity.csv')

# --- NOTE: Team files will be handled in a separate step ---
# CAPS_FILE = os.path.join(BASE_PATH, 'raw_salary_caps.csv')
# OWNERS_FILE = os.path.join(BASE_PATH, 'Owner Net Worth in Billions .csv')
# STADIUMS_FILE = os.path.join(BASE_PATH, 'nba_stadiums.csv')

# Final output file
OUTPUT_FILE = 'merged_player_data.csv' # Output for this script

# --- End Configuration ---


def standardize_player_name(name: str) -> str:
    """
    Cleans and standardizes player names so merges work across data sources.
    (Copied from your cleaning_helpers.py)
    """
    if not isinstance(name, str):
        return ""
    
    # Normalize unicode (e.g., accents)
    try:
        # Handle potential empty strings or NaNs that become float
        name = str(name)
        name = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode()
    except Exception as e:
        print(f"Warning: Could not normalize name '{name}'. Error: {e}")
        pass # Continue with the original name if normalization fails

    # Replace punctuation and collapse spaces
    name = re.sub(r'[^a-zA-Z\s]', '', name)
    name = re.sub(r'\s+', ' ', name)
    
    return name.strip().lower()


def load_data(filename: str, required_cols: list = None, dtype: dict = None) -> pd.DataFrame:
    """
    Loads a CSV file with robust debugging and error checking.
    """
    print(f"Attempting to load file: '{filename}'...")
    
    # 1. Check if file exists
    if not os.path.exists(filename):
        print(f"--- üî¥ ERROR: File not found! ---")
        print(f"Script stopped. Could not find file: {filename}")
        sys.exit(1) # Stop the script
        
    # 2. Load the data
    try:
        # Use skipinitialspace=True to handle spaces in column names from output
        df = pd.read_csv(filename, skipinitialspace=True, dtype=dtype)
        # Clean column names
        df.columns = df.columns.str.strip()
        print(f"‚úÖ Successfully loaded '{filename}'. Found {len(df)} rows.")
    except Exception as e:
        print(f"--- üî¥ ERROR: Could not read file! ---")
        print(f"Could not load {filename}. Error: {e}")
        sys.exit(1)

    # 3. Check for required columns
    if required_cols:
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            print(f"--- üî¥ ERROR: Missing required columns in '{filename}'! ---")
            print(f"Expected columns: {required_cols}")
            print(f"Missing columns: {missing_cols}")
            print(f"All columns found: {list(df.columns)}")
            sys.exit(1)
            
    return df

def clean_salary_player_name(name: str) -> str:
    """
    Extracts the proper 'First Last' name from the salary file's
    'Last First Last' format (e.g., "Young Trae Young").
    """
    if not isinstance(name, str):
        return ""
    
    try:
        # Split ONLY on the first space
        # "Young Trae Young" -> ["Young", "Trae Young"]
        # "Nance Jr. Larry Nance Jr." -> ["Nance", "Jr. Larry Nance Jr."]
        # "Dick Gradey Dick" -> ["Dick", "Gradey Dick"]
        parts = name.split(' ', 1)
        
        if len(parts) == 2:
            # Return the second part, which is the "Firstname Lastname"
            return parts[1]
        else:
            # If there's no space, just return the name as-is
            return name
    except Exception as e:
        print(f"Warning: Could not clean salary name '{name}'. Error: {e}")
        return name

def clean_team_id(df: pd.DataFrame, col_name='TEAM_ID') -> pd.DataFrame:
    """Converts TEAM_ID column to a standardized integer format for merging."""
    if col_name not in df.columns:
        print(f"--- üî¥ ERROR: Tried to clean '{col_name}' but column does not exist.")
        sys.exit(1)
    
    try:
        df[col_name] = pd.to_numeric(df[col_name], errors='coerce')
        # Drop rows where TEAM_ID could not be converted (became <NA>)
        rows_before = len(df)
        df = df.dropna(subset=[col_name])
        rows_after = len(df)
        if rows_before > rows_after:
            print(f"‚ö†Ô∏è Dropped {rows_before - rows_after} rows with invalid/missing TEAM_ID.")
        
        df[col_name] = df[col_name].astype('Int64')
        return df
    
    except Exception as e:
        print(f"--- üî¥ ERROR: Could not convert '{col_name}' to a number for merging. ---")
        print(f"Error: {e}")
        sys.exit(1)
        

def main():
    """
    Main function to merge all PLAYER-related data.
    """
    print("--- Starting Player Merge Process ---")

    # --- Part 1: Load Player Data ---
    print("\n--- [Step 1] Loading Player Data ---")
    # Note: Using the column names from your output
    df_stats = load_data(STATS_FILE, ['PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID'])
    df_context = load_data(CONTEXT_FILE, ['PLAYER_ID', 'PLAYER_NAME', 'BIRTHDATE'])
    df_salary = load_data(SALARY_FILE, ['Player_Name', 'Salary'])
    df_popularity = load_data(POPULARITY_FILE, ['Player', 'Followers'])

    # --- Part 2: Merge Stats + Context ---
    print("\n--- [Step 2] Merging Stats and Context (on PLAYER_ID) ---")
    rows_stats = len(df_stats)
    rows_context = len(df_context)
    
    # Using 'inner' merge as requested
    df_player_base = pd.merge(
        df_stats, 
        df_context, 
        on="PLAYER_ID", 
        how="inner",
        suffixes=('_stats', '_context')
    )
    
    print(f"Stats rows: {rows_stats} | Context rows: {rows_context}")
    print(f"Merge complete. Result rows: {len(df_player_base)}")
    
    if len(df_player_base) == 0:
        print("--- üî¥ ERROR: Merge 1 (Stats + Context) resulted in 0 rows. ---")
        print("Please check PLAYER_ID columns in both files.")
        sys.exit(1)
    
    # --- Part 3: Merge Salaries ---
    print("\n--- [Step 3] Merging Salaries (on Standardized Name) ---")
    
    # Standardize names for merging
    # Note: Use the PLAYER_NAME from the context file if it exists and is cleaner
    df_player_base['merge_key'] = df_player_base['PLAYER_NAME_context'].apply(standardize_player_name)
    
    # --- THIS IS THE FIX ---
    # 1. Clean the salary name structure (e.g., "Young Trae Young" -> "Trae Young")
    # 2. Standardize the *cleaned* name (e.g., "Trae Young" -> "trae young")
    df_salary['merge_key'] = df_salary['Player_Name'].apply(clean_salary_player_name).apply(standardize_player_name)
    # --- END FIX ---
    
    # --- DEBUGGING PRINT STATEMENTS ---
    print("\n--- DEBUG: Checking standardized names for merge ---")
    print("\n--- Names from Player Base (Stats/Context): ---")
    print(df_player_base[['PLAYER_NAME_context', 'merge_key']].head(15).to_string())
    
    print("\n--- Names from Salary File: ---")
    print(df_salary[['Player_Name', 'merge_key']].head(15).to_string())
    print("--- End Debug ---")
    # --- END DEBUGGING ---
    
    rows_before_salary = len(df_player_base)
    rows_salary = len(df_salary)
    
    # Using 'inner' merge as requested in your script
    # We can change to 'left' if you want to keep players without salary info
    df_player_master = pd.merge(
        df_player_base, 
        df_salary, 
        on="merge_key", 
        how="inner" # Change to "left" to keep all players
    )
    
    print(f"Player Base rows: {rows_before_salary} | Salary rows: {rows_salary}")
    print(f"Merge complete. Result rows: {len(df_player_master)}")
    print(f"‚ö†Ô∏è Players dropped (no salary match): {rows_before_salary - len(df_player_master)}")
    
    if len(df_player_master) == 0:
        print("--- üî¥ ERROR: Merge 2 (Salaries) resulted in 0 rows. ---")
        print("Check name standardization or if salary file is correct.")
        sys.exit(1)
        
    # --- Part 4: Merge Player Popularity (NEW) ---
    print("\n--- [Step 4] Merging Player Popularity (on Standardized Name) ---")
    df_popularity['merge_key'] = df_popularity['Player'].apply(standardize_player_name)
    
    rows_before_pop = len(df_player_master)
    rows_pop = len(df_popularity)
    
    # Use LEFT merge to keep all players, even if not on popularity list
    df_player_master = pd.merge(
        df_player_master,
        df_popularity.drop(columns=['Player'], errors='ignore'), # Drop original name col
        on="merge_key",
        how="left"
    )
    
    print(f"Player Master rows: {rows_before_pop} | Popularity rows: {rows_pop}")
    print(f"Left merge complete. Result rows: {len(df_player_master)}")
    print(f"Players kept (from left merge): {len(df_player_master)}")

    # --- Part 5: Final Cleanup and Save ---
    print("\n--- [Step 5] Cleaning and Saving Player Dataset ---")
    
    # Clean up salary/cap/numeric columns
    numeric_cols_to_clean = [
        'Salary', 'Followers'
    ]
    
    for col in numeric_cols_to_clean:
        if col in df_player_master.columns:
            df_player_master[col] = (
                df_player_master[col]
                .astype(str)
                .str.replace(r"[^\d.]", "", regex=True) # Keep digits and decimals
                .replace("", None)
                .astype(float)
            )
    
    # Save to output file
    output_path = Path(OUTPUT_FILE)
    df_player_master.to_csv(output_path, index=False)
    
    print(f"\n‚úÖ --- PLAYER MERGE COMPLETE --- ‚úÖ")
    print(f"Final player dataset with {len(df_player_master)} rows saved to:")
    print(f"{output_path.resolve()}")
    
    print("\n--- NEXT STEP ---")
    print("We now need to merge the team data (Caps, Owners, Stadiums).")
    print("This will require creating a mapping between 'TEAM_ID', 'team' (abbreviation), and 'Team Name'.")


if __name__ == "__main__":
    main()

--- Starting Player Merge Process ---

--- [Step 1] Loading Player Data ---
Attempting to load file: '../../data/raw/Player_Performance_raw.csv'...
‚úÖ Successfully loaded '../../data/raw/Player_Performance_raw.csv'. Found 569 rows.
Attempting to load file: '../../data/raw/raw_player_context.csv'...
‚úÖ Successfully loaded '../../data/raw/raw_player_context.csv'. Found 569 rows.
Attempting to load file: '../../data/raw/raw_player_salaries.csv'...
‚úÖ Successfully loaded '../../data/raw/raw_player_salaries.csv'. Found 450 rows.
Attempting to load file: '../../data/raw/nba_player_popularity.csv'...
‚úÖ Successfully loaded '../../data/raw/nba_player_popularity.csv'. Found 511 rows.

--- [Step 2] Merging Stats and Context (on PLAYER_ID) ---
Stats rows: 569 | Context rows: 569
Merge complete. Result rows: 569

--- [Step 3] Merging Salaries (on Standardized Name) ---

--- DEBUG: Checking standardized names for merge ---

--- Names from Player Base (Stats/Context): ---
   PLAYER_NAME_context 

# now with team ID

In [10]:
import pandas as pd
from pathlib import Path
import sys
import os
import re
import unicodedata

# --- File Configuration ---
BASE_PATH = '../../data/raw/' # Set the base path for all files

# Player files
STATS_FILE = os.path.join(BASE_PATH, 'Player_Performance_raw.csv')
CONTEXT_FILE = os.path.join(BASE_PATH, 'raw_player_context.csv')
SALARY_FILE = os.path.join(BASE_PATH, 'raw_player_salaries.csv')
POPULARITY_FILE = os.path.join(BASE_PATH, 'nba_player_popularity.csv')

# Team files
CAPS_FILE = os.path.join(BASE_PATH, 'raw_salary_caps.csv')
STADIUMS_FILE = os.path.join(BASE_PATH, 'nba_stadiums.csv')
OWNERS_FILE = os.path.join(BASE_PATH, 'Owner Net Worth in Billions .csv')

# Final output file
OUTPUT_FILE = 'master_dataset_v3.csv'

# --- End Configuration ---


def standardize_player_name(name: str) -> str:
    """
    Cleans and standardizes player names so merges work across data sources.
    """
    if not isinstance(name, str):
        return ""
    
    # Normalize unicode (e.g., accents)
    try:
        name = str(name)
        name = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode()
    except Exception as e:
        print(f"Warning: Could not normalize name '{name}'. Error: {e}")
        pass

    # Replace punctuation and collapse spaces
    name = re.sub(r'[^a-zA-Z\s]', '', name)
    name = re.sub(r'\s+', ' ', name)
    
    return name.strip().lower()


def load_data(filename: str, required_cols: list = None, dtype: dict = None) -> pd.DataFrame:
    """
    Loads a CSV file with robust debugging and error checking.
    """
    print(f"Attempting to load file: '{filename}'...")
    
    if not os.path.exists(filename):
        print(f"--- üî¥ ERROR: File not found! ---")
        print(f"Script stopped. Could not find file: {filename}")
        sys.exit(1)
        
    try:
        # Use skipinitialspace=True to handle spaces in column names
        df = pd.read_csv(filename, skipinitialspace=True, dtype=dtype)
        # Clean column names
        df.columns = df.columns.str.strip()
        print(f"‚úÖ Successfully loaded '{filename}'. Found {len(df)} rows.")
    except Exception as e:
        print(f"--- üî¥ ERROR: Could not read file! ---")
        print(f"Could not load {filename}. Error: {e}")
        sys.exit(1)

    if required_cols:
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            print(f"--- üî¥ ERROR: Missing required columns in '{filename}'! ---")
            print(f"Expected columns: {required_cols}")
            print(f"Missing columns: {missing_cols}")
            print(f"All columns found: {list(df.columns)}")
            sys.exit(1)
            
    return df

def clean_salary_player_name(name: str) -> str:
    """
    Extracts the proper 'First Last' name from the salary file's
    'Last First Last' format (e.g., "Young Trae Young").
    """
    if not isinstance(name, str):
        return ""
    
    try:
        # Split ONLY on the first space
        parts = name.split(' ', 1)
        if len(parts) == 2:
            return parts[1] # Return the "Firstname Lastname" part
        else:
            return name
    except Exception as e:
        print(f"Warning: Could not clean salary name '{name}'. Error: {e}")
        return name

def clean_team_id(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
    """Converts a specified TEAM_ID column to a standardized integer format."""
    if col_name not in df.columns:
        print(f"--- üî¥ ERROR: Tried to clean '{col_name}' but column does not exist.")
        print(f"All columns found: {list(df.columns)}")
        sys.exit(1)
    
    try:
        df[col_name] = pd.to_numeric(df[col_name], errors='coerce')
        rows_before = len(df)
        df = df.dropna(subset=[col_name])
        rows_after = len(df)
        if rows_before > rows_after:
            print(f"‚ö†Ô∏è Dropped {rows_before - rows_after} rows with invalid/missing TEAM_ID in column '{col_name}'.")
        
        df[col_name] = df[col_name].astype('Int64')
        return df
    
    except Exception as e:
        print(f"--- üî¥ ERROR: Could not convert '{col_name}' to a number for merging. ---")
        print(f"Error: {e}")
        sys.exit(1)

def clean_numeric_cols(df: pd.DataFrame, cols_to_clean: list) -> pd.DataFrame:
    """
    Cleans specified columns by removing non-numeric characters
    and converting them to floats.
    """
    for col in cols_to_clean:
        if col in df.columns:
            df[col] = (
                df[col]
                .astype(str)
                .str.replace(r"[^\d.]", "", regex=True) # Keep digits and decimals
                .replace("", None)
                .astype(float)
            )
    return df
        

def main():
    """
    Main function to run the entire merge workflow.
    """
    print("--- Starting Full Merge Process ---")

    # --- Part 1: Load Player Data ---
    print("\n--- [Step 1] Loading Player Data ---")
    df_stats = load_data(STATS_FILE, ['PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID'])
    df_context = load_data(CONTEXT_FILE, ['PLAYER_ID', 'PLAYER_NAME'])
    df_salary = load_data(SALARY_FILE, ['Player_Name', 'Salary'])
    df_popularity = load_data(POPULARITY_FILE, ['Player', 'Followers'])

    # --- Part 2: Build Player Master DataFrame ---
    print("\n--- [Step 2] Building Player Master List ---")
    
    # Merge Stats + Context (on PLAYER_ID)
    print("Merging Stats + Context...")
    df_player_master = pd.merge(
        df_stats, 
        df_context, 
        on="PLAYER_ID", 
        how="inner",
        suffixes=('_stats', '_context')
    )
    
    # Merge Salaries (on Standardized Name)
    print("Merging Salaries...")
    df_player_master['merge_key'] = df_player_master['PLAYER_NAME_context'].apply(standardize_player_name)
    df_salary['merge_key'] = df_salary['Player_Name'].apply(clean_salary_player_name).apply(standardize_player_name)
    
    rows_before_salary = len(df_player_master)
    df_player_master = pd.merge(
        df_player_master, 
        df_salary, 
        on="merge_key", 
        how="inner" # Keep only players with salary info
    )
    print(f"Merge complete. Result rows: {len(df_player_master)}")
    print(f"‚ö†Ô∏è Players dropped (no salary match): {rows_before_salary - len(df_player_master)}")

    # Merge Player Popularity (on Standardized Name)
    print("Merging Player Popularity...")
    df_popularity['merge_key'] = df_popularity['Player'].apply(standardize_player_name)
    
    df_player_master = pd.merge(
        df_player_master,
        df_popularity.drop(columns=['Player'], errors='ignore'),
        on="merge_key",
        how="left" # Keep all players, even if no popularity info
    )
    print(f"Player Master list complete. Total rows: {len(df_player_master)}")

    # --- Part 3: Load Team Data ---
    print("\n--- [Step 3] Loading Team Data ---")
    df_caps = load_data(CAPS_FILE, ['team_id', 'team'])
    df_owners = load_data(OWNERS_FILE, ['Team ID', 'Team Name'])
    df_stadiums = load_data(STADIUMS_FILE, ['TEAM_ABBREVIATION', 'Stadium_Name'])

    # --- Part 4: Build Team Master DataFrame ---
    print("\n--- [Step 4] Building Team Master List ---")
    
    # Standardize TEAM_ID column names and types BEFORE merging
    df_caps = df_caps.rename(columns={'team_id': 'TEAM_ID'})
    # FIX: Drop duplicates to ensure only one row per team
    df_caps = clean_team_id(df_caps, 'TEAM_ID').drop_duplicates(subset=['TEAM_ID'])
    
    df_owners = df_owners.rename(columns={'Team ID': 'TEAM_ID'})
    # FIX: Drop duplicates to ensure only one row per team
    df_owners = clean_team_id(df_owners, 'TEAM_ID').drop_duplicates(subset=['TEAM_ID'])
    
    # Drop junk column
    if 'Unnamed: 3' in df_owners.columns:
        df_owners = df_owners.drop(columns=['Unnamed: 3'])

    # Merge Caps + Owners (on TEAM_ID)
    print("Merging Caps + Owners...")
    df_team_master = pd.merge(
        df_caps,
        df_owners,
        on="TEAM_ID",
        how="inner" # Use 'inner' to keep only teams present in both
    )
    print(f"Team Base (Caps+Owners) complete. Result rows: {len(df_team_master)}")

    # Merge Stadiums (on Abbreviation)
    print("Merging Stadium Data...")
    df_team_master = pd.merge(
        df_team_master,
        df_stadiums,
        left_on="team", # Abbreviation from raw_salary_caps.csv
        right_on="TEAM_ABBREVIATION",
        how="left" # Keep all teams, even if no stadium info
    )
    print(f"Team Master list complete. Total rows: {len(df_team_master)}")

    # --- Part 5: Final Merge: Players + Teams ---
    print("\n--- [Step 5] Final Merge: Players + Teams (on TEAM_ID) ---")
    
    # FIX: Use .copy() to prevent SettingWithCopyWarning
    df_player_master_clean = clean_team_id(df_player_master.copy(), 'TEAM_ID')
    
    rows_players = len(df_player_master_clean)
    rows_teams = len(df_team_master)
    
    df_final = pd.merge(
        df_player_master_clean,
        df_team_master,
        on="TEAM_ID",
        how="left" # Keep ALL players, attach team info
    )
    
    print(f"Player Master rows: {rows_players} | Team Master rows: {rows_teams}")
    print(f"Final merge complete. Final rows: {len(df_final)}")
    
    if len(df_final) != rows_players:
        print(f"‚ö†Ô∏è Warning: Row count changed. Investigate merge. Expected {rows_players}.")

    # --- Part 6: Final Cleanup and Save ---
    print("\n--- [Step 6] Cleaning and Saving Final Dataset ---")
    
    # Rename messy 'Owner Net Worth' column
    if 'Owner Net Worth in Billions' in df_final.columns:
        df_final = df_final.rename(columns={
            'Owner Net Worth in Billions': 'Owner_Net_Worth_Billions'
        })
    
    # List of ALL numeric columns that need cleaning
    all_numeric_cols = [
        'Salary', 'Followers', 'total_cap_used', 'remaining_cap_space', 
        'active_cap', 'active_top_3', 'dead_cap', 'Capacity', 
        'Construction_Cost', 'Owner_Net_Worth_Billions'
    ]
    
    df_final = clean_numeric_cols(df_final, all_numeric_cols)
    
    # Save to output file
    output_path = Path(OUTPUT_FILE)
    df_final.to_csv(output_path, index=False)
    
    print(f"\n‚úÖ --- PROCESS COMPLETE --- ‚úÖ")
    print(f"Final dataset with {len(df_final)} rows and {len(df_final.columns)} columns saved to:")
    print(f"{output_path.resolve()}")


if __name__ == "__main__":
    main()

--- Starting Full Merge Process ---

--- [Step 1] Loading Player Data ---
Attempting to load file: '../../data/raw/Player_Performance_raw.csv'...
‚úÖ Successfully loaded '../../data/raw/Player_Performance_raw.csv'. Found 569 rows.
Attempting to load file: '../../data/raw/raw_player_context.csv'...
‚úÖ Successfully loaded '../../data/raw/raw_player_context.csv'. Found 569 rows.
Attempting to load file: '../../data/raw/raw_player_salaries.csv'...
‚úÖ Successfully loaded '../../data/raw/raw_player_salaries.csv'. Found 450 rows.
Attempting to load file: '../../data/raw/nba_player_popularity.csv'...
‚úÖ Successfully loaded '../../data/raw/nba_player_popularity.csv'. Found 511 rows.

--- [Step 2] Building Player Master List ---
Merging Stats + Context...
Merging Salaries...
Merge complete. Result rows: 409
‚ö†Ô∏è Players dropped (no salary match): 160
Merging Player Popularity...
Player Master list complete. Total rows: 409

--- [Step 3] Loading Team Data ---
Attempting to load file: '../../

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col_name] = df[col_name].astype('Int64')
