In [1]:
import pandas as pd
import numpy as np
import os
import country_converter as coco
from typing import Optional, Tuple
import logging
import pycountry
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderUnavailable
import warnings
from babel.languages import get_territory_language_info

# Silence FutureWarning only; every other warning category is still shown
warnings.simplefilter(action='ignore', category=FutureWarning)

# Limit log output to WARNING and above.
# NOTE(review): getLogger() is called without a name; if coco.logging is the
# stdlib logging module this is the root logger, so the level applies to the
# whole process rather than to country_converter alone — confirm that is intended.
coco_logger = coco.logging.getLogger()
coco_logger.setLevel(logging.WARNING)

# Shared geopy Nominatim client used to turn "city, country" strings into coordinates
geolocator = Nominatim(user_agent="MyPersonalGeocoder/1.0 (myemail@example.com)", timeout=10)

# Maps each league to the list of input CSV files that are merged for that league
files = {
  "NBA": ["NBA_foreign.csv"],
  "NFL": ["NFL_all.csv", "NFL_measure.csv", "NFL_births.csv"],
  "MLB": ["MLB_B_foreign.csv", "MLB_P_foreign.csv"],
  "NHL": ["NHL_foreign.csv"],
  "MLS": ["MLS_foreign.csv"],
}
# Filled in by the per-league cells further down:
# league -> {output column name (or tuple of names): extractor function}
league_files = {}

# Global list of final column names (also the column order of the formatted CSVs)
final_columns = ['Player', 'Player-additional', 'League', 'Home Continent', 'Home Country', 'Home City', 'Overall Value', 'Offensive Performance', 'Defensive Performance', 'Measurables', 'Migration Difficulty']


In [2]:
def get_row_value(row, column_name: str, type_=str):
    """
    Fetch ``row[column_name]`` and return it only when it has the expected type.

    Parameters
    ----------
    row : mapping-like object indexable by column name (e.g. a pandas Series or a dict).
    column_name : key to look up in ``row``.
    type_ : a type, or tuple of types, accepted by ``isinstance``.

    Returns
    -------
    The stored value when it is an instance of ``type_``. NumPy numeric/boolean
    scalars (``np.int64``, ``np.float32``, ...) are unwrapped to the matching
    Python scalar first, because e.g. ``np.int64`` is not a subclass of ``int``
    and would otherwise be rejected. ``np.nan`` is returned when the column is
    missing or the value has a different type.
    """
    try:
        value = row[column_name]
    except KeyError:
        return np.nan
    except AttributeError as e:
        print(f"AttributeError accessing row[{column_name}]: {str(e)}, with row {row}")
        return np.nan

    if isinstance(value, type_):
        return value

    # A Series with a single numeric dtype hands back NumPy scalars; compare
    # their native Python equivalent so int/float requests still match.
    if isinstance(value, (np.number, np.bool_)):
        native_value = value.item()
        if isinstance(native_value, type_):
            return native_value

    return np.nan

In [3]:
# On-disk locations of the three lookup caches (relative to the working directory)
country_df_path = 'countries.csv'
location_df_path = 'locations.csv'
distance_df_path = 'distances.csv'

# Country cache: reuse the CSV when it exists, otherwise start from an empty frame
country_df = (
    pd.read_csv(country_df_path)
    if os.path.exists(country_df_path)
    else pd.DataFrame(columns=['code', 'country_name', 'continent', 'language', 'language_tier'])
)

# Geocoding cache: one row per (city, country) pair with its coordinates
location_df = (
    pd.read_csv(location_df_path)
    if os.path.exists(location_df_path)
    else pd.DataFrame(columns=['city', 'country', 'latitude', 'longitude'])
)

# Distance cache: one row per coordinate string with its SF / NY distances
distance_df = (
    pd.read_csv(distance_df_path)
    if os.path.exists(distance_df_path)
    else pd.DataFrame(columns=['city_coords', 'sf_distance', 'ny_distance'])
)


In [4]:
def get_country_name(code: str):
    """
    Return the short country name for ``code``, using ``country_df`` as a cache.

    A cached, non-missing ``country_name`` is returned directly. Otherwise the
    name is resolved with ``coco.convert(..., to='name_short')`` and written
    back to ``country_df`` (new row, or the existing row's ``country_name``).
    """
    global country_df

    code_mask = country_df['code'] == code
    cached_rows = country_df[code_mask]
    row_known = not cached_rows.empty

    if row_known:
        cached_name = cached_rows['country_name'].iloc[0]
        if not pd.isna(cached_name):
            return cached_name

    # Nothing usable in the cache: ask country_converter and remember the answer
    resolved_name = coco.convert(names=code, to='name_short')
    if row_known:
        country_df.loc[code_mask, 'country_name'] = resolved_name
    else:
        addition = pd.DataFrame({'code': [code], 'country_name': [resolved_name], 'continent': [pd.NA]})
        country_df = pd.concat([country_df, addition], ignore_index=True)
    return resolved_name

def get_continent(code: str):
    """
    Return the continent for ``code``, using ``country_df`` as a cache.

    A cached, non-missing ``continent`` is returned directly. Otherwise the
    value is resolved with ``coco.convert(..., to='Continent_7')`` and written
    back to ``country_df`` (new row, or the existing row's ``continent``).
    """
    global country_df

    code_mask = country_df['code'] == code
    cached_rows = country_df[code_mask]
    row_known = not cached_rows.empty

    if row_known and 'continent' in cached_rows.columns:
        cached_continent = cached_rows['continent'].iloc[0]
        if not pd.isna(cached_continent):
            return cached_continent

    # Nothing usable in the cache: ask country_converter and remember the answer
    resolved_continent = coco.convert(names=code, to='Continent_7')
    if row_known:
        country_df.loc[code_mask, 'continent'] = resolved_continent
    else:
        addition = pd.DataFrame({'code': [code], 'country_name': [pd.NA], 'continent': [resolved_continent]})
        country_df = pd.concat([country_df, addition], ignore_index=True)
    return resolved_continent

def lookup_location(city: str, country: str) -> Optional[Tuple[float, float]]:
    """
    Return ``(latitude, longitude)`` for a city, using ``location_df`` as a cache.

    Parameters
    ----------
    city, country : combined into the geocoder query ``"{city}, {country}"``.

    Returns
    -------
    A ``(latitude, longitude)`` tuple — the same shape whether the answer comes
    from the cache or from the geocoder — or ``None`` when the geocoder finds
    nothing or raises. Successful lookups are appended to ``location_df``;
    failed lookups are not cached.
    """
    global location_df

    match = location_df[(location_df['city'] == city) & (location_df['country'] == country)]
    if not match.empty:
        # Return a tuple so cached and freshly geocoded results have the same type
        return tuple(match[['latitude', 'longitude']].iloc[0].tolist())

    try:
        location = geolocator.geocode(f"{city}, {country}")
        if location:
            new_row = pd.DataFrame({'city': [city], 'country': [country], 'latitude': [location.latitude], 'longitude': [location.longitude]})
            location_df = pd.concat([location_df, new_row], ignore_index=True)
            return (location.latitude, location.longitude)
    except (GeocoderTimedOut, GeocoderUnavailable) as e:
        print(f"Geocoding error due to timeout or unavailability: {e}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

    # The geocoder answered but found no match
    return None


def _get_cached_distance(city_coords: Tuple[float, float], target_coords: Tuple[float, float], column: str):
    """
    Shared implementation of the SF / NY distance lookups.

    Looks up ``column`` ('sf_distance' or 'ny_distance') in ``distance_df`` for
    the given coordinates; when it is missing, computes the geodesic distance
    in kilometres to ``target_coords`` and stores it (new row, or the existing
    row's ``column``).
    """
    global distance_df

    # Coordinates are stored as a "lat,lon" string so rows can be matched by equality
    city_coords_str = f"{city_coords[0]},{city_coords[1]}"

    row_mask = distance_df['city_coords'] == city_coords_str
    match = distance_df[row_mask]
    if not match.empty and not pd.isna(match[column].iloc[0]):
        return match[column].iloc[0]

    distance = geodesic(city_coords, target_coords).kilometers
    if match.empty:
        # No row yet for these coordinates: create one with only `column` filled in
        row_values = {'city_coords': [city_coords_str], 'sf_distance': [pd.NA], 'ny_distance': [pd.NA]}
        row_values[column] = [distance]
        distance_df = pd.concat([distance_df, pd.DataFrame(row_values)], ignore_index=True)
    else:
        # Row exists (the other city's distance was cached first): fill the gap
        distance_df.loc[row_mask, column] = distance
    return distance


def get_sf_distance(city_coords: Tuple[float, float], san_francisco_coords: Tuple[float, float] = (37.7749, -122.4194)):
    """
    Return the geodesic distance in kilometres from ``city_coords`` to San Francisco.

    The result is cached in the ``sf_distance`` column of ``distance_df``.
    """
    return _get_cached_distance(city_coords, san_francisco_coords, 'sf_distance')


def get_ny_distance(city_coords: Tuple[float, float], new_york_coords: Tuple[float, float] = (40.7128, -74.0060)):
    """
    Return the geodesic distance in kilometres from ``city_coords`` to New York.

    The result is cached in the ``ny_distance`` column of ``distance_df``.
    """
    return _get_cached_distance(city_coords, new_york_coords, 'ny_distance')

def get_distance(city, country):
    """
    Return the distance from a city to the nearer of San Francisco / New York,
    scaled by half the Earth's circumference so the result lies in [0, 1].

    Returns 0 when the city cannot be geocoded.
    """
    coords = lookup_location(city, country)
    if not coords:
        return 0

    # Half the Earth's circumference in km
    half_circumference_km = 20037.5

    nearest_km = min(get_sf_distance(coords), get_ny_distance(coords))
    return nearest_km / half_circumference_km

In [5]:
# language_tiers = {
#     "English": 0,
#     "Afrikaans": 1, "Danish": 1, "Dutch": 1, "French": 1, "Italian": 1, 
#     "Norwegian": 1, "Portuguese": 1, "Romanian": 1, "Spanish": 1, "Swedish": 1,
#     "German": 2,
#     "Indonesian": 3, "Malaysian": 3, "Swahili": 3,
#     "Albanian": 4, "Amharic": 4, "Armenian": 4, "Azerbaijani": 4, "Bengali": 4, 
#     "Bosnian": 4, "Bulgarian": 4, "Burmese": 4, "Croatian": 4, "Czech": 4,
#     "Estonian": 4, "Finnish": 4, "Georgian": 4, "Greek": 4, "Hebrew": 4, 
#     "Hindi": 4, "Hungarian": 4, "Icelandic": 4, "Khmer": 4, "Lao": 4, 
#     "Latvian": 4, "Lithuanian": 4, "Macedonian": 4, "Mongolian": 4, 
#     "Nepali": 4, "Pashto": 4, "Persian": 4, "Polish": 4, "Russian": 4, 
#     "Serbian": 4, "Sinhala": 4, "Slovak": 4, "Slovenian": 4, "Tagalog": 4, 
#     "Thai": 4, "Turkish": 4, "Ukrainian": 4, "Urdu": 4, "Uzbek": 4, 
#     "Vietnamese": 4, "Xhosa": 4, "Zulu": 4,
#     "Arabic": 5, "Cantonese": 5, "Mandarin": 5, "Japanese": 5, "Korean": 5
# }

# Language code -> difficulty tier (0 = English, 5 = hardest), keyed by the
# language codes that babel's territory language info reports.
language_tiers = {
    "en": 0,  # English
    "af": 1,  # Afrikaans
    "da": 1,  # Danish
    "nl": 1,  # Dutch
    "fr": 1,  # French
    "it": 1,  # Italian
    "no": 1,  # Norwegian
    "nb": 1,  # Norwegian Bokmål (CLDR lists Norway under nb/nn rather than "no")
    "nn": 1,  # Norwegian Nynorsk
    "pt": 1,  # Portuguese
    "ro": 1,  # Romanian
    "es": 1,  # Spanish
    "sv": 1,  # Swedish
    "de": 2,  # German
    "id": 3,  # Indonesian
    "ms": 3,  # Malay
    "sw": 3,  # Swahili
    "sq": 4,  # Albanian
    "am": 4,  # Amharic
    "hy": 4,  # Armenian
    "az": 4,  # Azerbaijani
    "bn": 4,  # Bengali
    "bs": 4,  # Bosnian
    "bg": 4,  # Bulgarian
    "my": 4,  # Burmese
    "hr": 4,  # Croatian
    "cs": 4,  # Czech
    "et": 4,  # Estonian
    "fi": 4,  # Finnish
    "ka": 4,  # Georgian
    "el": 4,  # Greek
    "he": 4,  # Hebrew
    "hi": 4,  # Hindi
    "hu": 4,  # Hungarian
    "is": 4,  # Icelandic
    "km": 4,  # Khmer
    "lo": 4,  # Lao
    "lv": 4,  # Latvian
    "lt": 4,  # Lithuanian
    "mk": 4,  # Macedonian
    "mn": 4,  # Mongolian
    "ne": 4,  # Nepali
    "ps": 4,  # Pashto
    "fa": 4,  # Persian
    "pl": 4,  # Polish
    "ru": 4,  # Russian
    "sr": 4,  # Serbian
    "si": 4,  # Sinhala
    "sk": 4,  # Slovak
    "sl": 4,  # Slovenian
    "tl": 4,  # Tagalog
    "fil": 4, # Filipino (CLDR code for the standardized form of Tagalog)
    "th": 4,  # Thai
    "tr": 4,  # Turkish
    "uk": 4,  # Ukrainian
    "ur": 4,  # Urdu
    "uz": 4,  # Uzbek
    "vi": 4,  # Vietnamese
    "xh": 4,  # Xhosa
    "zu": 4,  # Zulu
    "ar": 5,  # Arabic
    "yue": 5, # Cantonese
    "zh": 5,  # Mandarin
    "ja": 5,  # Japanese
    "ko": 5   # Korean
}

def get_lowest_tier_language(code: str):
    """
    Return ``(language_code, tier)`` for the easiest official language of a country.

    ``country_df`` is used as a cache: when the row for ``code`` already holds
    both ``language`` and ``language_tier`` they are returned as-is. Otherwise
    the official languages spoken by at least 30% of the population are taken
    from babel's territory language info, each is ranked with
    ``language_tiers`` (unknown languages get tier 5), and the lowest-tier one
    is stored in ``country_df`` and returned.

    Returns ``(None, None)`` when no qualifying official language is found;
    that outcome is not cached.
    """
    global country_df

    # First, check if the data already exists in the DataFrame
    code_mask = country_df['code'] == code
    match = country_df[code_mask]
    if not match.empty and not pd.isna(match['language'].iloc[0]) and not pd.isna(match['language_tier'].iloc[0]):
        return match['language'].iloc[0], match['language_tier'].iloc[0]

    # Only convert the code on a cache miss; the ISO alpha-2 form is needed
    # solely for the babel lookup below.
    alpha_2_code = coco.convert(names=code, to='ISO2')

    # Fetch all languages spoken in the territory along with their details
    languages_info = get_territory_language_info(alpha_2_code)

    # Keep only official languages with a significant population percentage
    official_languages = {
        lang: info for lang, info in languages_info.items()
        if info.get('official_status') and info.get('population_percent', 0) >= 30
    }

    if not official_languages:
        return None, None

    default_tier = 5
    lowest_tier_language = None
    lowest_tier = default_tier

    # Determine the language with the lowest tier. `<=` is deliberate: it makes
    # sure a language is selected even when every candidate sits at the default tier.
    for language in official_languages:
        if language in language_tiers:
            language_tier = language_tiers[language]
        else:
            # Script/region-qualified codes (e.g. "zh_Hant") fall back to the
            # tier of their base language before using the default.
            language_tier = language_tiers.get(language.split('_', 1)[0], default_tier)
        if language_tier <= lowest_tier:
            lowest_tier = language_tier
            lowest_tier_language = language

    # Store the result: new row when the code is unknown, otherwise fill the existing row
    if match.empty:
        new_row = pd.DataFrame({
            'code': [code],
            'language': [lowest_tier_language],
            'language_tier': [lowest_tier]
        })
        country_df = pd.concat([country_df, new_row], ignore_index=True)
    else:
        country_df.loc[code_mask, 'language'] = lowest_tier_language
        country_df.loc[code_mask, 'language_tier'] = lowest_tier

    return lowest_tier_language, lowest_tier



In [6]:
def save_geo_dataframes():
    """Write the country, location and distance caches to their CSV files (index omitted)."""
    cache_targets = (
        (country_df, country_df_path),
        (location_df, location_df_path),
        (distance_df, distance_df_path),
    )
    for frame, csv_path in cache_targets:
        frame.to_csv(csv_path, index=False)


In [7]:
# Generic
def extract_player_generic(row):
    """Return the row's "Player" entry, or NaN when it is missing or not a string."""
    player_name = get_row_value(row, "Player", type_=str)
    return player_name
    
def extract_player_additional_generic(row):
    """Return the row's "Player-additional" entry, or NaN when it is missing or not a string."""
    player_key = get_row_value(row, "Player-additional", type_=str)
    return player_key


# Maps the trailing code of a 'Birth Location' string to an ISO alpha-3 country
# code when that trailing code is not itself a country code: U.S. states (plus
# DC), Canadian provinces/territories, and a few legacy or non-ISO codes.
state_to_country = {
    # U.S. state codes (all 50 states) and the District of Columbia
    "AL": "USA", "AK": "USA", "AZ": "USA", "AR": "USA", "CA": "USA",
    "CO": "USA", "CT": "USA", "DE": "USA", "FL": "USA", "GA": "USA",
    "HI": "USA", "ID": "USA", "IL": "USA", "IN": "USA", "IA": "USA",
    "KS": "USA", "KY": "USA", "LA": "USA", "ME": "USA", "MD": "USA",
    "MA": "USA", "MI": "USA", "MN": "USA", "MS": "USA", "MO": "USA",
    "MT": "USA", "NE": "USA", "NV": "USA", "NH": "USA", "NJ": "USA",
    "NM": "USA", "NY": "USA", "NC": "USA", "ND": "USA", "OH": "USA",
    "OK": "USA", "OR": "USA", "PA": "USA", "RI": "USA", "SC": "USA",
    "SD": "USA", "TN": "USA", "TX": "USA", "UT": "USA", "VT": "USA",
    "VA": "USA", "WA": "USA", "WV": "USA", "WI": "USA", "WY": "USA",
    "DC": "USA",
    # Canadian province and territory codes
    "AB": "CAN", "BC": "CAN", "MB": "CAN", "NB": "CAN", "NL": "CAN",
    "NS": "CAN", "ON": "CAN", "PE": "CAN", "QC": "CAN", "SK": "CAN",
    "NT": "CAN", "NU": "CAN", "YT": "CAN",
    # Other codes (former states and non-ISO abbreviations)
    "SUN": "RUS", "CRK": "CZE", "ENG": "GBR", "CSK": "SVK", "DDR": "DEU", "YUG": "SRB"
}

def extract_location_generic(row) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[int]]:
    """
    Extracts values for columns 'Home Continent', 'Home Country', 'Home City', "Migration Difficulty"

    The row's 'Birth Location' is expected to look like "<city> <code>", where
    the last whitespace-separated token is a state/province/country code. A
    value consisting of a single token is treated as a code with no city.

    Migration difficulty is ``int(language_tier / 5 * 75 + distance * 25)``,
    where ``distance`` is the normalized value returned by ``get_distance``.

    Returns a tuple of four NaNs when 'Birth Location' is missing, not a
    string, or blank.
    """
    birth_location = get_row_value(row, 'Birth Location', type_=str)
    if pd.isna(birth_location) or not birth_location.strip():
        return (np.nan, np.nan, np.nan, np.nan)

    # Split from the back and only split once; tolerate surrounding or repeated whitespace
    parts = birth_location.strip().rsplit(None, 1)
    if len(parts) == 2:
        city, code = parts
    else:
        # Only a code was given (no city part)
        city, code = np.nan, parts[0]

    # First check against state codes, then use country_converter if needed
    code = state_to_country.get(code, code)

    # Find country names
    country = get_country_name(code)
    if country == "not_found":
        print(f"Country code {code} not found in row {row}")
        country = np.nan

    # Convert code to continent
    continent = get_continent(code)
    if continent == "not_found":
        print(f"Continent code {code} not found in row {row}")
        continent = np.nan

    # An unknown language contributes nothing to the difficulty score
    language, language_tier = get_lowest_tier_language(code)
    if not language or not language_tier:
        language_tier = 0

    language_tier = language_tier / 5 # normalize to [0, 1]

    # Without a city there is nothing to geocode; use the same 0 that
    # get_distance returns for locations it cannot resolve.
    distance = get_distance(city, country) if isinstance(city, str) else 0

    migration = int(language_tier * 75 + distance * 25)

    return (continent, country, city, migration)

def extract_migration_difficulty_generic(row):
    """Placeholder extractor: ignores ``row`` and always returns NaN."""
    return np.nan

In [8]:
# Overall Value

# (lower bound, upper bound) pairs used by the extract_overall_value_* functions
# for min-max normalization: (value - lower) / (upper - lower).
# The trailing comments name the players/seasons the bounds were taken from.
SCORE_WS = (-3, 26) # Win Shares (Woody Sauldsberry -2.8 1960-61) - (Kareem Abdul-Jabbar 25.4 1971-72)
SCORE_AV = (-6, 26) # Approximate Value (Dan Pastorini -6 1981) - (LaDainian Tomlinson 26 2006)
SCORE_WAR = (-6, 21) # Wins Above Replacement (Jersey Bakley -5.3 1884) - (Pud Galvin 20.5 1884)
PLUS_MINUS_MLS = (-38, 48) # Plus-Minus (Mathieu Deplagne -38 2019) - (Carlos Vela 48 2019)
SCORE_PS = (-2.4, 23) # Point Shares (Ken Baumgartner -2.4 1997-98) - (Bobby Orr 22.8 1970-71)

def extract_overall_value_nba(row):
    """
    Compute the NBA overall value from Win Shares ('WS') and All-Star count ('AS').

    Win Shares are min-max normalized with ``SCORE_WS`` and scaled to 50;
    each All-Star appearance adds 50. Missing or non-numeric values count as 0.
    Returns the truncated integer total.
    """
    # Accept int as well as float: a column without missing values is read as
    # integers, and a float-only check would silently turn those into 0.
    win_shares = get_row_value(row, 'WS', (int, float))
    all_star_appearances = get_row_value(row, 'AS', (int, float))

    # Convert NaN to 0
    win_shares = 0 if np.isnan(win_shares) else win_shares
    all_star_appearances = 0 if np.isnan(all_star_appearances) else all_star_appearances

    # Normalize with new range
    normalized_win_shares = (win_shares - SCORE_WS[0]) / (SCORE_WS[1] - SCORE_WS[0]) * 50
    score_all_star = all_star_appearances * 50

    total_score = int(normalized_win_shares + score_all_star)
    return total_score

def extract_overall_value_nfl(row):
    """
    Compute the NFL overall value from Approximate Value ('AV').

    The value is min-max normalized with ``SCORE_AV`` and scaled to 100.
    Missing or non-numeric values count as 0. Returns the truncated integer.
    """
    # AV is integer-valued; accept int as well as float so a column that pandas
    # read as integers is not silently replaced by 0.
    av = get_row_value(row, 'AV', (int, float))

    # Convert NaN to 0
    av = 0 if np.isnan(av) else av

    # Normalize with new range
    normalized_av = (av - SCORE_AV[0]) / (SCORE_AV[1] - SCORE_AV[0]) * 100
    return int(normalized_av)

def extract_overall_value_mlb(row):
    """
    Compute the MLB overall value from Wins Above Replacement ('WAR').

    The value is min-max normalized with ``SCORE_WAR`` and scaled to 100.
    A missing or non-float value counts as 0. Returns the truncated integer.
    """
    lower, upper = SCORE_WAR

    war = get_row_value(row, 'WAR', float)
    if np.isnan(war):
        war = 0

    return int((war - lower) / (upper - lower) * 100)

def extract_overall_value_mls(row):
    """
    Compute the MLS overall value from plus-minus ('+/-').

    The value is min-max normalized with ``PLUS_MINUS_MLS`` and scaled to 100.
    Missing or non-numeric values count as 0. Returns the truncated integer.
    """
    # pandas stores an integer column as float64 as soon as one value is
    # missing, so an int-only check would reject every row of such a column.
    plus_minus = get_row_value(row, '+/-', (int, float))

    # Convert NaN to 0
    plus_minus = 0 if np.isnan(plus_minus) else plus_minus

    # Normalize with new range
    normalized_plus_minus = (plus_minus - PLUS_MINUS_MLS[0]) / (PLUS_MINUS_MLS[1] - PLUS_MINUS_MLS[0]) * 100
    return int(normalized_plus_minus)

def extract_overall_value_nhl(row):
    """
    Compute the NHL overall value from Point Shares ('PS').

    The value is min-max normalized with ``SCORE_PS`` and scaled to 100.
    A missing or non-float value counts as 0. Returns the truncated integer.
    """
    lower, upper = SCORE_PS

    point_shares = get_row_value(row, 'PS', float)
    if np.isnan(point_shares):
        point_shares = 0

    return int((point_shares - lower) / (upper - lower) * 100)


In [9]:
def extract_offensive_performance_nba(row):
    """Placeholder for the NBA 'Offensive Performance' column: always returns NaN."""
    return np.nan

def extract_offensive_performance_nfl(row):
    """Placeholder for the NFL 'Offensive Performance' column: always returns NaN."""
    return np.nan

def extract_offensive_performance_mlb(row):
    """Placeholder for the MLB 'Offensive Performance' column: always returns NaN."""
    return np.nan

def extract_offensive_performance_mls(row):
    """Placeholder for the MLS 'Offensive Performance' column: always returns NaN."""
    return np.nan

def extract_offensive_performance_nhl(row):
    """Placeholder for the NHL 'Offensive Performance' column: always returns NaN."""
    return np.nan


In [10]:
def extract_defensive_performance_nba(row):
    """Placeholder for the NBA 'Defensive Performance' column: always returns NaN."""
    return np.nan

def extract_defensive_performance_nfl(row):
    """Placeholder for the NFL 'Defensive Performance' column: always returns NaN."""
    return np.nan

def extract_defensive_performance_mlb(row):
    """Placeholder for the MLB 'Defensive Performance' column: always returns NaN."""
    return np.nan

def extract_defensive_performance_mls(row):
    """Placeholder for the MLS 'Defensive Performance' column: always returns NaN."""
    return np.nan

def extract_defensive_performance_nhl(row):
    """Placeholder for the NHL 'Defensive Performance' column: always returns NaN."""
    return np.nan


In [11]:
def extract_measurables_nba(row):
    """Placeholder for the NBA 'Measurables' column: always returns NaN."""
    return np.nan

def extract_measurables_nfl(row):
    """Placeholder for the NFL 'Measurables' column: always returns NaN."""
    return np.nan

def extract_measurables_mlb(row):
    """Placeholder for the MLB 'Measurables' column: always returns NaN."""
    return np.nan

def extract_measurables_mls(row):
    """Placeholder for the MLS 'Measurables' column: always returns NaN."""
    return np.nan

def extract_measurables_nhl(row):
    """Placeholder for the NHL 'Measurables' column: always returns NaN."""
    return np.nan


In [12]:
# NBA
league = "NBA"

# Output column name (or tuple of names) -> function applied to each merged row
league_files[league] = {
    "Player": extract_player_generic,
    "Player-additional": extract_player_additional_generic,
    # Bind the league name as a default argument: a bare `lambda row: league`
    # would read whatever the global `league` holds when the lambda is called.
    'League': lambda row, league=league: league,
    # 'Migration Difficulty' is produced here; no separate placeholder entry
    # follows, since it would overwrite the computed value with NaN.
    ('Home Continent', 'Home Country', 'Home City', "Migration Difficulty"): extract_location_generic,
    "Overall Value": extract_overall_value_nba,
    "Offensive Performance": extract_offensive_performance_nba,
    "Defensive Performance": extract_defensive_performance_nba,
    "Measurables": extract_measurables_nba,
}


In [13]:
# NFL
league = "NFL"

# Output column name (or tuple of names) -> function applied to each merged row
league_files[league] = {
    "Player": extract_player_generic,
    "Player-additional": extract_player_additional_generic,
    # Bind the league name as a default argument: a bare `lambda row: league`
    # would read whatever the global `league` holds when the lambda is called.
    'League': lambda row, league=league: league,
    # 'Migration Difficulty' is produced here; no separate placeholder entry
    # follows, since it would overwrite the computed value with NaN.
    ('Home Continent', 'Home Country', 'Home City', "Migration Difficulty"): extract_location_generic,
    "Overall Value": extract_overall_value_nfl,
    "Offensive Performance": extract_offensive_performance_nfl,
    "Defensive Performance": extract_defensive_performance_nfl,
    "Measurables": extract_measurables_nfl,
}


In [14]:
# MLB
league = "MLB"

# Output column name (or tuple of names) -> function applied to each merged row
league_files[league] = {
    "Player": extract_player_generic,
    "Player-additional": extract_player_additional_generic,
    # Bind the league name as a default argument: a bare `lambda row: league`
    # would read whatever the global `league` holds when the lambda is called.
    'League': lambda row, league=league: league,
    # 'Migration Difficulty' is produced here; no separate placeholder entry
    # follows, since it would overwrite the computed value with NaN.
    ('Home Continent', 'Home Country', 'Home City', "Migration Difficulty"): extract_location_generic,
    "Overall Value": extract_overall_value_mlb,
    "Offensive Performance": extract_offensive_performance_mlb,
    "Defensive Performance": extract_defensive_performance_mlb,
    "Measurables": extract_measurables_mlb,
}

In [15]:
# NHL
league = "NHL"

# Output column name (or tuple of names) -> function applied to each merged row
league_files[league] = {
    "Player": extract_player_generic,
    "Player-additional": extract_player_additional_generic,
    # Bind the league name as a default argument: a bare `lambda row: league`
    # would read whatever the global `league` holds when the lambda is called.
    'League': lambda row, league=league: league,
    ('Home Continent', 'Home Country', 'Home City', "Migration Difficulty"): extract_location_generic,
    "Overall Value": extract_overall_value_nhl,
    "Offensive Performance": extract_offensive_performance_nhl,
    "Defensive Performance": extract_defensive_performance_nhl,
    "Measurables": extract_measurables_nhl,
}


In [16]:
# MLS
league = "MLS"

# Output column name (or tuple of names) -> function applied to each merged row
league_files[league] = {
    "Player": extract_player_generic,
    "Player-additional": extract_player_additional_generic,
    # Bind the league name as a default argument: a bare `lambda row: league`
    # would read whatever the global `league` holds when the lambda is called.
    'League': lambda row, league=league: league,
    # 'Migration Difficulty' is produced here; no separate placeholder entry
    # follows, since it would overwrite the computed value with NaN.
    ('Home Continent', 'Home Country', 'Home City', "Migration Difficulty"): extract_location_generic,
    "Overall Value": extract_overall_value_mls,
    "Offensive Performance": extract_offensive_performance_mls,
    "Defensive Performance": extract_defensive_performance_mls,
    "Measurables": extract_measurables_mls,
}


In [17]:
# Restricts which league_files entries the processing loop regenerates.
# Each item must equal a league_files key exactly (a string, or the full tuple
# for the location columns). None or an empty list regenerates every column.
# Uncomment exactly one of the alternatives below.
# columns_to_update = None
# columns_to_update = ['Player']
columns_to_update = [('Home Continent', 'Home Country', 'Home City', 'Migration Difficulty')]
# columns_to_update = ['Overall Value']

In [18]:
def adaptive_merge(dataframes):
    """
    Outer-merge a list of DataFrames into one, left to right.

    Frames are joined on ['Season', 'Player-additional'] when both sides have a
    'Season' column, otherwise on ['Player-additional'] alone. Columns present
    on both sides (other than the keys) are coalesced row by row: the left
    value is kept where present and the right value fills the gaps, so rows
    that exist only in the right frame keep their data.

    Parameters
    ----------
    dataframes : sequence of pandas DataFrames, each with a 'Player-additional' column.

    Returns
    -------
    The merged DataFrame. A single-element list returns that frame unchanged;
    an empty list returns an empty DataFrame.
    """
    if not dataframes:
        return pd.DataFrame()

    # Start with the first DataFrame as the base for merging
    df_merged = dataframes[0]

    for df in dataframes[1:]:
        # Determine the merge keys based on the 'Season' column's presence
        if 'Season' in df.columns and 'Season' in df_merged.columns:
            merge_keys = ['Season', 'Player-additional']
        else:
            merge_keys = ['Player-additional']

        # Work out the overlapping columns up front instead of searching for
        # "_dup" in names afterwards, which would also hit unrelated columns
        # that merely contain that text.
        shared_cols = [
            col for col in df.columns
            if col in df_merged.columns and col not in merge_keys
        ]

        df_merged = pd.merge(df_merged, df, on=merge_keys, how='outer', suffixes=('', '_dup'))

        # Coalesce each "<col>" / "<col>_dup" pair, then discard the duplicate
        for col in shared_cols:
            left_values = df_merged[col]
            df_merged[col] = left_values.where(left_values.notna(), df_merged[f"{col}_dup"])
        df_merged = df_merged.drop(columns=[f"{col}_dup" for col in shared_cols])

    return df_merged


In [19]:
# Ensure the output directory exists
output_directory = "./formatted"
os.makedirs(output_directory, exist_ok=True)

# Paths of the per-league formatted CSVs written by the loop; used for the combined file at the end
formatted_csv_paths = []

for league, filenames in files.items():
    print(f"Processing league: {league}")
    # Start from the previously written formatted CSV when there is one, so
    # columns that are not regenerated in this run keep their stored values
    output_path = os.path.join(output_directory, f"{league}_formatted.csv")
    if os.path.exists(output_path):
        df_final = pd.read_csv(output_path)
    else:
        # Initialize empty DataFrame with final columns if no file exists
        df_final = pd.DataFrame(columns=final_columns)

    # Read every input CSV listed for this league
    dataframes = []
    for filename in filenames:
        print(f"\tReading file: {filename}")
        df = pd.read_csv(filename)
        dataframes.append(df)

    # Combine the league's input files into a single frame
    df_merged = adaptive_merge(dataframes)

    # Build the output columns by applying each extractor to every merged row
    df_temp = pd.DataFrame()
    for column_names, function in league_files[league].items():
        # Skip columns that are not in the list of columns to update
        # (a falsy columns_to_update means "update everything")
        if columns_to_update and column_names not in columns_to_update:
            continue

        print(f"\t\tSynthesizing '{column_names}'...")
        # A tuple key means the extractor returns one value per listed column;
        # result_type='expand' spreads the returned tuple across those columns
        column_names = list(column_names) if isinstance(column_names, tuple) else column_names
        df_temp[column_names] = df_merged.apply(function, axis=1, result_type='expand')

    # Copy the freshly computed columns into the final frame.
    # NOTE(review): this assignment aligns on the row index, so a stored CSV
    # whose row count/order differs from the current merge would be matched by
    # position in the index rather than by player — confirm inputs stay in sync.
    df_final[df_temp.columns] = df_temp

    # Select specific columns to save and handle missing columns gracefully
    df_final = df_final.reindex(columns=final_columns)  # Reindex to ensure all desired columns are present
    df_final = df_final.fillna(np.nan)  # Replace missing values with NaN
    df_final = df_final.infer_objects()  # Infer data types to ensure consistent type handling

    # Persist the lookup caches first, then the league's formatted CSV
    print(f"\tSaving combined data to {output_path}...")
    save_geo_dataframes()
    df_final.to_csv(output_path, index=False)
    formatted_csv_paths.append(output_path)
    print()

# Combine all formatted CSV files into one big DataFrame and save it
combined_df = pd.concat([pd.read_csv(f) for f in formatted_csv_paths], ignore_index=True)
combined_output_path = os.path.join(output_directory, "combined_formatted.csv")
print(f"Saving all leagues combined data to {combined_output_path}...")
combined_df.to_csv(combined_output_path, index=False)


Processing league: NBA
	Reading file: NBA_foreign.csv
		Synthesizing '('Home Continent', 'Home Country', 'Home City', 'Migration Difficulty')'...
	Saving combined data to ./formatted/NBA_formatted.csv...

Processing league: NFL
	Reading file: NFL_all.csv
	Reading file: NFL_measure.csv
	Reading file: NFL_births.csv
		Synthesizing '('Home Continent', 'Home Country', 'Home City', 'Migration Difficulty')'...
	Saving combined data to ./formatted/NFL_formatted.csv...

Processing league: MLB
	Reading file: MLB_B_foreign.csv
	Reading file: MLB_P_foreign.csv
		Synthesizing '('Home Continent', 'Home Country', 'Home City', 'Migration Difficulty')'...
	Saving combined data to ./formatted/MLB_formatted.csv...

Processing league: NHL
	Reading file: NHL_foreign.csv
		Synthesizing '('Home Continent', 'Home Country', 'Home City', 'Migration Difficulty')'...
	Saving combined data to ./formatted/NHL_formatted.csv...

Processing league: MLS
	Reading file: MLS_foreign.csv
		Synthesizing '('Home Continent',