# Load and Inspect ATP Match Data

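This notebook loads every `atp_matches_*.csv` file from the ATP data directory, combines them into a single DataFrame, and provides an initial exploration of the dataset structure: column names, a preview of the first rows, data types, and missing values.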


In [2]:
import pandas as pd
from pathlib import Path
import glob

# Directory that holds the ATP match CSV files (relative to this notebook)
data_path = Path("../data/tennis_atp-master")

# Collect every CSV whose name starts with "atp_matches_", ordered by path.
# NOTE(review): the wildcard accepts any suffix after "atp_matches_", not only
# a year, so confirm that every matched file is meant to be loaded.
csv_pattern = str(data_path / "atp_matches_*.csv")
match_files = glob.glob(csv_pattern)
match_files.sort()

# Report what was found, one file name per line
file_count = len(match_files)
print(f"Found {file_count} match files:")
for file_name in (Path(found).name for found in match_files):
    print(f"  - {file_name}")


Found 160 match files:
  - atp_matches_1968.csv
  - atp_matches_1969.csv
  - atp_matches_1970.csv
  - atp_matches_1971.csv
  - atp_matches_1972.csv
  - atp_matches_1973.csv
  - atp_matches_1974.csv
  - atp_matches_1975.csv
  - atp_matches_1976.csv
  - atp_matches_1977.csv
  - atp_matches_1978.csv
  - atp_matches_1979.csv
  - atp_matches_1980.csv
  - atp_matches_1981.csv
  - atp_matches_1982.csv
  - atp_matches_1983.csv
  - atp_matches_1984.csv
  - atp_matches_1985.csv
  - atp_matches_1986.csv
  - atp_matches_1987.csv
  - atp_matches_1988.csv
  - atp_matches_1989.csv
  - atp_matches_1990.csv
  - atp_matches_1991.csv
  - atp_matches_1992.csv
  - atp_matches_1993.csv
  - atp_matches_1994.csv
  - atp_matches_1995.csv
  - atp_matches_1996.csv
  - atp_matches_1997.csv
  - atp_matches_1998.csv
  - atp_matches_1999.csv
  - atp_matches_2000.csv
  - atp_matches_2001.csv
  - atp_matches_2002.csv
  - atp_matches_2003.csv
  - atp_matches_2004.csv
  - atp_matches_2005.csv
  - atp_matches_2006.csv
  

In [3]:
# Load and concatenate all match files into a single DataFrame
if not match_files:
    # pd.concat([]) fails with an opaque "No objects to concatenate" ValueError;
    # raise the same exception type with a message that names the real cause.
    raise ValueError(
        "No match files to load: `match_files` is empty. "
        "Check that the data directory path is correct."
    )

# One DataFrame per CSV, in the same order as `match_files`.
# low_memory=False parses each file in one pass rather than in chunks, so a
# column's dtype is inferred from the whole file.
dfs = [pd.read_csv(file, low_memory=False) for file in match_files]

# ignore_index=True discards the per-file indexes and numbers rows 0..n-1.
# Columns missing from a given file are filled with NaN in the combined frame.
# NOTE(review): if the files do not all share one schema, expect columns that
# are almost entirely null here — check the missing-value summary.
matches_df = pd.concat(dfs, ignore_index=True)

print(f"Combined DataFrame shape: {matches_df.shape}")
print(f"  - {matches_df.shape[0]:,} rows (matches)")
print(f"  - {matches_df.shape[1]} columns")


Combined DataFrame shape: (968048, 81)
  - 968,048 rows (matches)
  - 81 columns


In [4]:
# Print a numbered list (starting at 1) of every column in the combined frame
print("Column names:")
print("-" * 40)
numbered_columns = enumerate(matches_df.columns, start=1)
for position, column_name in numbered_columns:
    print(f"{position:2}. {column_name}")


Column names:
----------------------------------------
 1. tourney_id
 2. tourney_name
 3. surface
 4. draw_size
 5. tourney_level
 6. tourney_date
 7. match_num
 8. winner_id
 9. winner_seed
10. winner_entry
11. winner_name
12. winner_hand
13. winner_ht
14. winner_ioc
15. winner_age
16. loser_id
17. loser_seed
18. loser_entry
19. loser_name
20. loser_hand
21. loser_ht
22. loser_ioc
23. loser_age
24. score
25. best_of
26. round
27. minutes
28. w_ace
29. w_df
30. w_svpt
31. w_1stIn
32. w_1stWon
33. w_2ndWon
34. w_SvGms
35. w_bpSaved
36. w_bpFaced
37. l_ace
38. l_df
39. l_svpt
40. l_1stIn
41. l_1stWon
42. l_2ndWon
43. l_SvGms
44. l_bpSaved
45. l_bpFaced
46. winner_rank
47. winner_rank_points
48. loser_rank
49. loser_rank_points
50. winner1_id
51. winner2_id
52. loser1_id
53. loser2_id
54. winner1_name
55. winner1_hand
56. winner1_ht
57. winner1_ioc
58. winner1_age
59. winner2_name
60. winner2_hand
61. winner2_ht
62. winner2_ioc
63. winner2_age
64. loser1_name
65. loser1_hand
66. loser1_h

In [5]:
# Show the first five rows of the combined frame as the cell's rich output
matches_df.head(n=5)


Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,loser2_ioc,loser2_age,winner1_rank,winner1_rank_points,winner2_rank,winner2_rank_points,loser1_rank,loser1_rank_points,loser2_rank,loser2_rank_points
0,1968-2029,Dublin,Grass,32.0,A,19680708.0,270,112411.0,,,...,,,,,,,,,,
1,1968-2029,Dublin,Grass,32.0,A,19680708.0,271,126914.0,,,...,,,,,,,,,,
2,1968-2029,Dublin,Grass,32.0,A,19680708.0,272,209523.0,,,...,,,,,,,,,,
3,1968-2029,Dublin,Grass,32.0,A,19680708.0,273,100084.0,,,...,,,,,,,,,,
4,1968-2029,Dublin,Grass,32.0,A,19680708.0,274,100132.0,,,...,,,,,,,,,,


In [6]:
# Per-column overview: dtype, number of present values, number and share of nulls
print("Data types and missing values:")
print("-" * 60)

row_total = len(matches_df)
missing_per_column = matches_df.isnull().sum()
# Share of missing values per column, as a percentage rounded to two decimals
missing_share = (missing_per_column / row_total * 100).round(2)

info_df = pd.DataFrame(
    {
        'dtype': matches_df.dtypes,
        'non_null': matches_df.count(),
        'null_count': missing_per_column,
        'null_pct': missing_share,
    }
)
info_df


Data types and missing values:
------------------------------------------------------------


Unnamed: 0,dtype,non_null,null_count,null_pct
tourney_id,object,965625,2423,0.25
tourney_name,object,968048,0,0.00
surface,object,963355,4693,0.48
draw_size,object,967213,835,0.09
tourney_level,object,968048,0,0.00
...,...,...,...,...
winner2_rank_points,float64,17955,950093,98.15
loser1_rank,float64,17955,950093,98.15
loser1_rank_points,float64,17944,950104,98.15
loser2_rank,float64,17944,950104,98.15
