In [12]:
import pandas as pd
import numpy as np
from pathlib import Path
import glob

# Load the per-season match files named atp_matches_<year>.csv from MIN_YEAR onwards.
# Any file whose name does not end in a plain year (e.g. doubles or "amateur") is skipped.
data_path = Path("../data/tennis_atp-master")
MIN_YEAR = 2005  # earliest season (taken from the filename) to load
MATCH_FILE_PREFIX = "atp_matches_"
MATCH_FILE_SUFFIX = ".csv"


def _season_of(csv_path):
    """Return the year encoded in an ``atp_matches_<year>.csv`` filename.

    Parameters
    ----------
    csv_path : str or Path
        Path to a candidate match file; only the final path component is inspected.

    Returns
    -------
    int or None
        The parsed year, or None when the file is a doubles file or the part
        between the prefix and the extension is not an integer.
    """
    name = Path(csv_path).name
    if 'doubles' in name.lower():
        return None
    if not (name.startswith(MATCH_FILE_PREFIX) and name.endswith(MATCH_FILE_SUFFIX)):
        return None
    year_part = name[len(MATCH_FILE_PREFIX):-len(MATCH_FILE_SUFFIX)]
    try:
        return int(year_part)
    except ValueError:
        # Non-year suffix such as "amateur"
        return None


# sorted() keeps the load order (and therefore the row order of matches_df) deterministic
candidate_files = sorted(glob.glob(str(data_path / "atp_matches_*.csv")))
season_by_file = {f: _season_of(f) for f in candidate_files}
match_files = [
    f for f, season in season_by_file.items()
    if season is not None and season >= MIN_YEAR
]

# Fail with an actionable message instead of pd.concat's "No objects to concatenate"
if not match_files:
    raise FileNotFoundError(
        f"No {MATCH_FILE_PREFIX}<year>{MATCH_FILE_SUFFIX} files for {MIN_YEAR}+ "
        f"found under {data_path.resolve()}"
    )

dfs = [pd.read_csv(f, low_memory=False) for f in match_files]
matches_df = pd.concat(dfs, ignore_index=True)

print(f"Loaded {len(matches_df):,} matches from {len(match_files)} files (doubles excluded, {MIN_YEAR}+)")


Loaded 58,502 matches from 20 files (doubles excluded, 2005+)


In [13]:
# Build a match-level dataframe in which the two players appear in a
# result-independent order: player A has the smaller ID, player B the larger.
# The work is done column-wise (vectorized) rather than row by row.

# Match metadata columns
match_metadata = ['tourney_id', 'tourney_name', 'tourney_date', 'surface', 'round', 'draw_size']

# Player identifier columns (winner and loser), listed in matching order
winner_cols = ['winner_id', 'winner_name', 'winner_rank', 'winner_rank_points', 'winner_age', 'winner_ht']
loser_cols = ['loser_id', 'loser_name', 'loser_rank', 'loser_rank_points', 'loser_age', 'loser_ht']

# Skip matches where either player ID is missing; renumber rows from 0
valid_matches = matches_df.dropna(subset=['winner_id', 'loser_id']).reset_index(drop=True)

# True where the winner takes slot A. If the two IDs were ever equal, the
# loser's columns go to slot A, the same as a non-strict comparison would do.
winner_is_A = valid_matches['winner_id'] < valid_matches['loser_id']

# Columns are collected in the final output order:
# metadata, player A, player B, deltas, target.
feature_columns = {}

# Match metadata (a column absent from the input becomes an all-None column)
for col in match_metadata:
    feature_columns[col] = valid_matches[col] if col in valid_matches.columns else None

# Attribute suffixes shared by both sides: id, name, rank, rank_points, age, ht
player_attrs = [col[len('winner_'):] for col in winner_cols]

# Player A: the winner's value where the winner has the smaller ID, else the loser's
for attr in player_attrs:
    feature_columns[f'player_A_{attr}'] = valid_matches[f'winner_{attr}'].where(
        winner_is_A, valid_matches[f'loser_{attr}']
    )

# Player B: the mirror image of player A
for attr in player_attrs:
    feature_columns[f'player_B_{attr}'] = valid_matches[f'loser_{attr}'].where(
        winner_is_A, valid_matches[f'winner_{attr}']
    )

# Delta features (A - B); the result is NaN whenever either side is missing
for attr in ['rank', 'age', 'ht', 'rank_points']:
    feature_columns[f'delta_{attr}'] = (
        feature_columns[f'player_A_{attr}'] - feature_columns[f'player_B_{attr}']
    )

# Target column: 1 if player A won, 0 if player B won
feature_columns['target'] = (
    valid_matches['winner_id'] == feature_columns['player_A_id']
).astype('int64')

# Create DataFrame
match_level_df = pd.DataFrame(feature_columns)

print(f"Created match-level dataframe with {len(match_level_df):,} records")
print(f"Columns: {len(match_level_df.columns)}")
print("\nColumn names:")
for i, col in enumerate(match_level_df.columns, 1):
    print(f"{i:2}. {col}")


Created match-level dataframe with 58,502 records
Columns: 23

Column names:
 1. tourney_id
 2. tourney_name
 3. tourney_date
 4. surface
 5. round
 6. draw_size
 7. player_A_id
 8. player_A_name
 9. player_A_rank
10. player_A_rank_points
11. player_A_age
12. player_A_ht
13. player_B_id
14. player_B_name
15. player_B_rank
16. player_B_rank_points
17. player_B_age
18. player_B_ht
19. delta_rank
20. delta_age
21. delta_ht
22. delta_rank_points
23. target


In [14]:
# Show the first few rows of the match-level table as a quick visual check
n_preview_rows = 10
match_level_df.head(n_preview_rows)


Unnamed: 0,tourney_id,tourney_name,tourney_date,surface,round,draw_size,player_A_id,player_A_name,player_A_rank,player_A_rank_points,...,player_B_name,player_B_rank,player_B_rank_points,player_B_age,player_B_ht,delta_rank,delta_age,delta_ht,delta_rank_points,target
0,2005-1536,Madrid Masters,20051017,Hard,R64,48,102720,Tomas Zib,63.0,621.0,...,Victor Hanescu,42.0,827.0,24.2,198.0,21.0,5.5,-20.0,-206.0,0
1,2005-1536,Madrid Masters,20051017,Hard,R64,48,102845,Carlos Moya,33.0,1005.0,...,Filippo Volandri,41.0,836.0,24.1,183.0,-8.0,5.0,7.0,169.0,1
2,2005-1536,Madrid Masters,20051017,Hard,R64,48,102450,Tim Henman,26.0,1120.0,...,Taylor Dent,28.0,1115.0,24.4,188.0,-2.0,6.7,-3.0,5.0,1
3,2005-1536,Madrid Masters,20051017,Hard,R64,48,104022,Mikhail Youzhny,29.0,1090.0,...,Jose Acasuso,55.0,678.0,22.9,190.0,-26.0,0.4,-7.0,412.0,0
4,2005-1536,Madrid Masters,20051017,Hard,R64,48,103017,Nicolas Kiefer,30.0,1070.0,...,Karol Beck,45.0,771.0,23.5,180.0,-15.0,4.7,3.0,299.0,0
5,2005-1536,Madrid Masters,20051017,Hard,R64,48,103206,Sebastien Grosjean,34.0,1000.0,...,Feliciano Lopez,31.0,1030.0,24.0,188.0,3.0,3.3,-13.0,-30.0,1
6,2005-1536,Madrid Masters,20051017,Hard,R64,48,102610,Albert Costa,102.0,410.0,...,Agustin Calleri,61.0,632.0,29.0,183.0,41.0,1.3,-3.0,-222.0,0
7,2005-1536,Madrid Masters,20051017,Hard,R64,48,103018,Max Mirnyi,25.0,1125.0,...,Juan Carlos Ferrero,16.0,1425.0,25.6,183.0,9.0,2.6,13.0,-300.0,1
8,2005-1536,Madrid Masters,20051017,Hard,R64,48,102562,Jiri Novak,35.0,950.0,...,Alberto Martin,54.0,686.0,27.1,175.0,-19.0,3.4,15.0,264.0,0
9,2005-1536,Madrid Masters,20051017,Hard,R64,48,103694,Olivier Rochus,24.0,1135.0,...,Tomas Berdych,50.0,736.0,20.0,196.0,-26.0,4.7,-28.0,399.0,1


In [15]:
# Check the ordering invariant: on every row the ID in slot A is strictly
# smaller than the ID in slot B.
id_pairs = match_level_df[['player_A_id', 'player_B_id']]
a_id_is_smaller = id_pairs['player_A_id'] < id_pairs['player_B_id']

print("Verifying player A ID < player B ID:")
print(f"All records valid: {a_id_is_smaller.all()}")
print(f"\nSample comparison:")
print(id_pairs.head())


Verifying player A ID < player B ID:
All records valid: True

Sample comparison:
   player_A_id  player_B_id
0       102720       103812
1       102845       103835
2       102450       103758
3       104022       104076
4       103017       103971


In [16]:
# Check class balance of the label: count each target value and report the
# share of rows where player A won.
target_values = match_level_df['target']
counts_by_class = target_values.value_counts().sort_index()
share_player_A_wins = target_values.mean()

print("Target distribution:")
print(counts_by_class)
print(f"\nPercentage of records with target = 1: {share_player_A_wins:.2%}")
print(f"Expected: ~50% (since player A is assigned based on smaller ID, not win/loss)")


Target distribution:
target
0    29278
1    29224
Name: count, dtype: int64

Percentage of records with target = 1: 49.95%
Expected: ~50% (since player A is assigned based on smaller ID, not win/loss)


In [17]:
# Persist the match-level table as CSV (without the row index)
output_dir = Path("../data/clean-match")
output_path = output_dir / "atp_matches_match_level_1.csv"

# Create the target folder (and any missing parents) before writing
output_dir.mkdir(parents=True, exist_ok=True)
match_level_df.to_csv(output_path, index=False)

print(f"Saved to: {output_path}")
print(f"Records saved: {len(match_level_df):,}")


Saved to: ..\data\clean-match\atp_matches_match_level_1.csv
Records saved: 58,502
