In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [18]:
# df10 = pd.read_csv("ATP_Matches/atp_matches_2010.csv")
# df11 = pd.read_csv("ATP_Matches/atp_matches_2011.csv")

# dfs = {}

# # Loop through years from 2010 to 2023
# for year in range(2010, 2024):
#     filename = f"ATP_Matches/atp_matches_{year}.csv"
#     dfs[f'df{year}'] = pd.read_csv(filename)


# Initialize an empty list to store DataFrames
dfs = []

# Loop through years from 2010 to 2023
for year in range(2022, 2023):
    filename = f"ATP_Matches/atp_matches_{year}.csv"
    df = pd.read_csv(filename)
    dfs.append(df)

# Load the merged DataFrame
merged_df = pd.concat(dfs, ignore_index=True)

# Initialize an empty dictionary to store player profiles
player_profiles = {}

# Iterate through each row in the DataFrame
for index, row in merged_df.iterrows():
    # Extract player IDs and match details from the row
    winner_id = row['winner_id']
    loser_id = row['loser_id']
    match_details = {
        'tournament': row['tourney_name'],
        'surface': row['surface'],
        'score': row['score'],
        'minutes': row['minutes'],
        # Add more match details as needed
    }

    # Calculate total aces and double faults, handling missing values
    total_w_aces = row['w_ace'] if not pd.isnull(row['w_ace']) else 0
    total_w_df = row['w_df'] if not pd.isnull(row['w_df']) else 0
    total_l_aces = row['l_ace'] if not pd.isnull(row['l_ace']) else 0
    total_l_df = row['l_df'] if not pd.isnull(row['l_df']) else 0

    # Update winner's profile or create a new profile if it doesn't exist
    if winner_id in player_profiles:
        player_profiles[winner_id]['matches'].append(match_details)
        player_profiles[winner_id]['total_w_aces'] += total_w_aces
        player_profiles[winner_id]['total_w_df'] += total_w_df
    else:
        player_profiles[winner_id] = {
            'name': row['winner_name'],
            'hand': row['winner_hand'],
            'height': row['winner_ht'],
            'country': row['winner_ioc'],
            'age': row['winner_age'],
            'matches': [match_details],
            'total_w_aces': total_w_aces,
            'total_w_df': total_w_df,
            'total_l_aces': 0,
            'total_l_df': 0
        }

    # Update loser's profile or create a new profile if it doesn't exist
    if loser_id in player_profiles:
        player_profiles[loser_id]['matches'].append(match_details)
        player_profiles[loser_id]['total_l_aces'] += total_l_aces
        player_profiles[loser_id]['total_l_df'] += total_l_df
    else:
        player_profiles[loser_id] = {
            'name': row['loser_name'],
            'hand': row['loser_hand'],
            'height': row['loser_ht'],
            'country': row['loser_ioc'],
            'age': row['loser_age'],
            'matches': [match_details],
            'total_w_aces': 0,
            'total_w_df': 0,
            'total_l_aces': total_l_aces,
            'total_l_df': total_l_df
        }

# Example: Print profiles of the first 5 players
for player_id, profile in list(player_profiles.items())[:5]:
    print(f"Player ID: {player_id}")
    print(f"Name: {profile['name']}")
    print(f"Hand: {profile['hand']}")
    print(f"Height: {profile['height']}")
    print(f"Country: {profile['country']}")
    print(f"Age: {profile['age']}")
    print(f"Total Winning Aces: {profile['total_w_aces']}")
    print(f"Total Winning Double Faults: {profile['total_w_df']}")
    print(f"Total Losing Aces: {profile['total_l_aces']}")
    print(f"Total Losing Double Faults: {profile['total_l_df']}")
    print("Matches:")
    for match in profile['matches']:
        print(f"\tTournament: {match['tournament']}, Surface: {match['surface']}, Score: {match['score']}, Minutes: {match['minutes']}")
    print()



Player ID: 200000
Name: Felix Auger Aliassime
Hand: R
Height: 193.0
Country: CAN
Age: 21.4
Total Winning Aces: 604.0
Total Winning Double Faults: 171.0
Total Losing Aces: 270.0
Total Losing Double Faults: 101.0
Matches:
	Tournament: Atp Cup, Surface: Hard, Score: 7-6(3) 6-3, Minutes: 129.0
	Tournament: Atp Cup, Surface: Hard, Score: 6-4 6-0, Minutes: 68.0
	Tournament: Atp Cup, Surface: Hard, Score: 6-4 4-6 6-3, Minutes: 151.0
	Tournament: Atp Cup, Surface: Hard, Score: 7-6(4) 6-3, Minutes: 119.0
	Tournament: Atp Cup, Surface: Hard, Score: 6-7(6) 6-4 6-4, Minutes: 147.0
	Tournament: Australian Open, Surface: Hard, Score: 6-4 0-6 3-6 6-3 6-4, Minutes: 220.0
	Tournament: Australian Open, Surface: Hard, Score: 7-6(4) 6-7(4) 7-6(5) 7-6(4), Minutes: 259.0
	Tournament: Australian Open, Surface: Hard, Score: 6-4 6-1 6-1, Minutes: 113.0
	Tournament: Australian Open, Surface: Hard, Score: 2-6 7-6(7) 6-2 7-6(4), Minutes: 215.0
	Tournament: Australian Open, Surface: Hard, Score: 6-7(4) 3-6 7-6(2) 

In [8]:
df.columns

Index(['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand',
       'loser_ht', 'loser_ioc', 'loser_age', 'score', 'best_of', 'round',
       'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
       'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt',
       'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
       'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points'],
      dtype='object')

In [9]:
importantData = df[['tourney_name', 'surface', 'tourney_level', 
                    'winner_name', 'winner_age', 
                    'loser_name', 'loser_age', 
                    'score', ]]
importantData = importantData.dropna()
importantData

Unnamed: 0,tourney_name,surface,tourney_level,winner_name,winner_age,loser_name,loser_age,score
0,United Cup,Hard,A,Taylor Fritz,25.1,Matteo Berrettini,26.7,7-6(4) 7-6(6)
1,United Cup,Hard,A,Frances Tiafoe,24.9,Lorenzo Musetti,20.8,6-2 0-0 RET
2,United Cup,Hard,A,Taylor Fritz,25.1,Hubert Hurkacz,25.8,7-6(5) 7-6(5)
3,United Cup,Hard,A,Frances Tiafoe,24.9,Kacper Zuk,23.9,6-3 6-3
4,United Cup,Hard,A,Stefanos Tsitsipas,24.3,Matteo Berrettini,26.7,4-6 7-6(2) 6-4
...,...,...,...,...,...,...,...,...
2951,Davis Cup WG2 R1: SLO vs LUX,Clay,D,Blaz Rola,32.9,Alex Knaff,25.7,6-4 6-2
2952,Davis Cup WG2 R1: SLO vs LUX,Clay,D,Chris Rodesch,22.1,Bor Artnak,19.2,6-2 7-6(3)
2953,Davis Cup WG2 R1: URU vs EGY,Clay,D,Mohamed Safwat,32.9,Ignacio Carou,24.1,7-5 6-3
2954,Davis Cup WG2 R1: URU vs EGY,Clay,D,Karim Mohamed Maamoun,32.4,Franco Roncadelli,23.5,6-1 4-6 7-5


In [6]:
X = importantData[['tourney_name', 'surface', 'tourney_level']]
Y = importantData[['winner_name']]

In [22]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [23]:
logModel = LogisticRegression()
logModel.fit(X_train, Y_train)

ValueError: could not convert string to float: 'South Orange'