In [1]:
# import the libraries used throughout the notebook

import os
import pandas as pd

In [2]:
# Locations of the external datalake and of this project.
# The defaults are the original machine-specific paths; set the
# TRANSFERMARKT_DATALAKE / SEPARATION_GAME_ROOT environment variables to run
# the notebook on another machine without editing this cell.
datalake_path = os.environ.get(
    "TRANSFERMARKT_DATALAKE",
    "C:\\Users\\emirh\\Github\\football-datasets\\datalake\\transfermarkt",
)
project_root = os.environ.get(
    "SEPARATION_GAME_ROOT",
    "C:\\Users\\emirh\\Github\\The-Separation-Game",
)

# set working directory to project root so that relative paths used by later
# cells are resolved against it
os.chdir(project_root)


In [3]:
# Load the raw CSV extracts from the datalake.
# Each path component is passed to os.path.join separately so the separator
# always matches the host OS (a "\\" embedded in a component is only treated
# as a separator on Windows).
df_transfer_history = pd.read_csv(
    os.path.join(datalake_path, "transfer_history", "transfer_history.csv")
)

df_teammates_played_with = pd.read_csv(
    os.path.join(
        datalake_path,
        "player_teammates_played_with",
        "player_teammates_played_with.csv",
    )
)


In [4]:
# Summarise the transfer-history frame: number of rows, each column's dtype and
# non-null count, and the approximate memory footprint
df_transfer_history.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1101440 entries, 0 to 1101439
Data columns (total 10 columns):
 #   Column             Non-Null Count    Dtype 
---  ------             --------------    ----- 
 0   player_id          1101440 non-null  int64 
 1   season_name        1101438 non-null  object
 2   transfer_date      1100860 non-null  object
 3   from_team_id       1101440 non-null  int64 
 4   from_team_name     1101440 non-null  object
 5   to_team_id         1101440 non-null  int64 
 6   to_team_name       1101440 non-null  object
 7   transfer_type      1101440 non-null  object
 8   value_at_transfer  1101440 non-null  int64 
 9   transfer_fee       1101440 non-null  int64 
dtypes: int64(5), object(5)
memory usage: 84.0+ MB


In [5]:
# Summarise the teammates frame: number of rows, each column's dtype and
# non-null count, and the approximate memory footprint
df_teammates_played_with.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1257342 entries, 0 to 1257341
Data columns (total 6 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   player_id                 1257342 non-null  int64  
 1   teammate_player_id        1257342 non-null  int64  
 2   teammate_player_name      1257342 non-null  object 
 3   ppg_played_with           1220262 non-null  float64
 4   joint_goal_participation  222165 non-null   float64
 5   minutes_played_with       453189 non-null   float64
dtypes: float64(3), int64(2), object(1)
memory usage: 57.6+ MB


In [6]:
# Load datasets from local data folder (paths are relative to the working directory).
# NOTE(review): the output saved with this cell echoes the player_profiles read
# line, which looks like the tail of a pandas warning - presumably the
# mixed-type DtypeWarning; confirm. low_memory=False makes pandas infer each
# column's dtype from the whole file instead of chunk by chunk.
df_players = pd.read_csv("data/player_profiles.csv", low_memory=False)
df_teammates = pd.read_csv("data/player_teammates_played_with.csv")

print(f"Loaded {len(df_players):,} players")
print(f"Loaded {len(df_teammates):,} teammate relationships")

  df_players = pd.read_csv("data/player_profiles.csv")


Loaded 92,671 players
Loaded 1,257,342 teammate relationships


In [7]:
from collections import defaultdict

# Create a player lookup dictionary for quick name-to-id and id-to-name conversion
# (when a key occurs more than once, the dict keeps the last occurrence).
player_id_to_name = df_players.set_index('player_id')['player_name'].to_dict()
player_name_to_id = df_players.set_index('player_name')['player_id'].to_dict()

# Build the teammate graph (adjacency list): player id -> set of teammate ids
teammate_graph = defaultdict(set)

# Walk the two id columns in lockstep instead of DataFrame.iterrows(), which
# builds a Series object for every row and is very slow on a frame this size.
# .tolist() yields plain Python scalars for the graph keys.
for player_id, teammate_id in zip(
    df_teammates['player_id'].tolist(),
    df_teammates['teammate_player_id'].tolist(),
):
    # Add bidirectional edges (if A played with B, then B played with A)
    teammate_graph[player_id].add(teammate_id)
    teammate_graph[teammate_id].add(player_id)

print(f"Built graph with {len(teammate_graph):,} players")

Built graph with 183,519 players


In [8]:
from collections import deque

def find_separation_path(player1_name: str, player2_name: str) -> list | None:
    """
    Find the shortest path between two players using BFS.

    Both names are resolved against ``df_players`` by case-insensitive
    substring match; when several players match, up to ten candidates are
    printed and the first match is used. The search then runs over
    ``teammate_graph``.

    Args:
        player1_name: Full or partial name of the starting player.
        player2_name: Full or partial name of the target player.

    Returns:
        A list of player names from start to target (names come from
        ``player_id_to_name``, falling back to the id as a string), a
        one-element list when both names resolve to the same player, or
        None if either name cannot be resolved or no path exists.
    """
    # Search for players by partial name match (case-insensitive)
    def find_player_id(search_name: str) -> int | None:
        search_lower = search_name.lower()
        # An empty query is a substring of every name; report it as "not found"
        # instead of silently picking the first player in the table.
        if not search_lower.strip():
            print(f"No player found matching '{search_name}'")
            return None
        # regex=False: match the text literally, so queries containing regex
        # metacharacters such as "(" or "." are not misread as patterns.
        name_hits = df_players['player_name'].str.lower().str.contains(
            search_lower, na=False, regex=False
        )
        matches = df_players[name_hits]
        if len(matches) == 0:
            print(f"No player found matching '{search_name}'")
            return None
        elif len(matches) > 1:
            print(f"Multiple players found matching '{search_name}':")
            for _, row in matches.head(10).iterrows():
                print(f"  - {row['player_name']} (ID: {row['player_id']})")
            print("Using the first match...")
        return matches.iloc[0]['player_id']

    # Get player IDs
    start_id = find_player_id(player1_name)
    end_id = find_player_id(player2_name)

    if start_id is None or end_id is None:
        return None

    if start_id == end_id:
        return [player_id_to_name.get(start_id, str(start_id))]

    # BFS to find shortest path. `parents` doubles as the visited set and
    # records how each player was first reached, so the path is rebuilt once
    # at the end instead of copying a path list for every queued player.
    parents = {start_id: None}
    queue = deque([start_id])

    while queue:
        current_id = queue.popleft()

        for neighbor_id in teammate_graph.get(current_id, ()):
            if neighbor_id in parents:
                continue
            parents[neighbor_id] = current_id

            if neighbor_id == end_id:
                # Found the target - walk the parent links back to the start
                path_ids = []
                node = neighbor_id
                while node is not None:
                    path_ids.append(node)
                    node = parents[node]
                path_ids.reverse()
                return [player_id_to_name.get(pid, str(pid)) for pid in path_ids]

            queue.append(neighbor_id)

    print("No connection found between the players")
    return None


def display_separation_path(player1_name: str, player2_name: str):
    """Find and display the separation path between two players.

    Prints a header, then either the numbered chain of players together with
    the degrees of separation, or a "no connection" message.

    Args:
        player1_name: Name passed to find_separation_path as the start player.
        player2_name: Name passed to find_separation_path as the target player.

    Returns:
        The value returned by find_separation_path, unchanged.
    """
    print(f"\n🔍 Finding connection: {player1_name} ↔ {player2_name}")
    print("-" * 50)

    path = find_separation_path(player1_name, player2_name)

    if path:
        # N players in the chain are linked by N - 1 "played with" steps
        degrees = len(path) - 1
        print(f"\n✅ Found connection with {degrees} degree(s) of separation!\n")

        for i, player in enumerate(path):
            print(f"  {i+1}. {player}")
            # Draw a connector after every player except the last one
            if i < degrees:
                print("       ↓ played with")

        print(f"\n📊 Degrees of separation: {degrees}")
    else:
        print("\n❌ No connection found!")

    return path

In [None]:
# Example: Find the connection between two famous players
# Try with some well-known players from the dataset.
# The call prints the chain of players and also returns it as a list.
# NOTE(review): the output saved with this cell shows a different pair
# (Miroslav Klose / Lionel Messi) - re-run the cell to refresh it.

display_separation_path("Bukayo Saka", "Kevin De Bruyne")


üîç Finding connection: Miroslav Klose ‚Üî Lionel Messi
--------------------------------------------------

‚úÖ Found connection with 3 degree(s) of separation!

  1. Miroslav Klose (10)
       ‚Üì played with
  2. Thomas M√ºller (58358)
       ‚Üì played with
  3. Thiago Alc√°ntara (60444)
       ‚Üì played with
  4. Lionel Messi (28003)

üìä Degrees of separation: 3


['Miroslav Klose (10)',
 'Thomas M√ºller (58358)',
 'Thiago Alc√°ntara (60444)',
 'Lionel Messi (28003)']

In [None]:
# Interactive version - enter your own players!
def separation_game():
    """Interactive separation game - enter two player names to find their connection.

    Repeatedly prompts for two player names and shows their connection via
    display_separation_path. Typing 'quit' (any case) at either prompt ends
    the game; blank input is re-prompted instead of being used as a query.
    """
    def ask(prompt: str) -> str | None:
        """Prompt until a non-blank answer is given; None means the user quit."""
        while True:
            answer = input(prompt).strip()
            if answer.lower() == 'quit':
                return None
            if answer:
                return answer
            print("Please enter a player name.")

    print("=" * 60)
    print("⚽ THE SEPARATION GAME ⚽")
    print("=" * 60)
    print("Find the shortest connection between any two football players!")
    print("Type 'quit' to exit.\n")

    while True:
        player1 = ask("Enter first player name (or 'quit'): ")
        if player1 is None:
            break

        player2 = ask("Enter second player name: ")
        if player2 is None:
            break

        display_separation_path(player1, player2)
        print("\n")

    # The loop only exits when the user quits at one of the prompts
    print("Thanks for playing!")

# Uncomment the line below to play interactively:
# separation_game()