In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder

In [2]:
# df10 = pd.read_csv("ATP_Matches/atp_matches_2010.csv")
# df11 = pd.read_csv("ATP_Matches/atp_matches_2011.csv")

# dfs = {}

# # Loop through years from 2010 to 2023
# for year in range(2010, 2024):
#     filename = f"ATP_Matches/atp_matches_{year}.csv"
#     dfs[f'df{year}'] = pd.read_csv(filename)


# Initialize an empty list to store DataFrames
dfs = []

# Loop through years from 2010 to 2023
for year in range(2010, 2024):
    filename = f"ATP_Matches/atp_matches_{year}.csv"
    df = pd.read_csv(filename)
    dfs.append(df)

dfs

[     tourney_id                tourney_name surface  draw_size tourney_level  \
 0      2010-339                    Brisbane    Hard         32             A   
 1      2010-339                    Brisbane    Hard         32             A   
 2      2010-339                    Brisbane    Hard         32             A   
 3      2010-339                    Brisbane    Hard         32             A   
 4      2010-339                    Brisbane    Hard         32             A   
 ...         ...                         ...     ...        ...           ...   
 3025  2010-D045  Davis Cup WG F: SRB vs FRA    Hard          4             D   
 3026  2010-D045  Davis Cup WG F: SRB vs FRA    Hard          4             D   
 3027   2010-615                  Dusseldorf    Clay         32             A   
 3028   2010-615                  Dusseldorf    Clay         32             A   
 3029   2010-615                  Dusseldorf    Clay         32             A   
 
       tourney_date  match

In [3]:
# Load the merged DataFrame
merged_df = pd.concat(dfs, ignore_index=True)
# merged_df = merged_df.dropna()
merged_df

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,2010-339,Brisbane,Hard,32,A,20100103,1,104053,1.0,,...,34.0,29.0,11.0,10.0,3.0,5.0,7.0,4410.0,77.0,598.0
1,2010-339,Brisbane,Hard,32,A,20100103,2,104958,,WC,...,34.0,22.0,14.0,9.0,7.0,10.0,134.0,400.0,78.0,590.0
2,2010-339,Brisbane,Hard,32,A,20100103,3,104755,,,...,58.0,38.0,14.0,14.0,7.0,11.0,52.0,850.0,88.0,568.0
3,2010-339,Brisbane,Hard,32,A,20100103,4,105051,,Q,...,29.0,16.0,15.0,9.0,2.0,5.0,285.0,151.0,28.0,1260.0
4,2010-339,Brisbane,Hard,32,A,20100103,5,104607,4.0,,...,41.0,26.0,14.0,9.0,6.0,9.0,20.0,1655.0,251.0,179.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39490,2023-M-DC-2023-WG2-PO-RSA-LUX-01,Davis Cup WG2 PO: RSA vs LUX,,4,D,20230204,5,202335,,,...,,,,,,,,,1717.0,1.0
39491,2023-M-DC-2023-WG2-PO-TUN-CYP-01,Davis Cup WG2 PO: TUN vs CYP,,4,D,20230203,1,117365,,,...,,,,,,,990.0,11.0,279.0,190.0
39492,2023-M-DC-2023-WG2-PO-TUN-CYP-01,Davis Cup WG2 PO: TUN vs CYP,,4,D,20230203,2,121411,,,...,,,,,,,364.0,131.0,894.0,15.0
39493,2023-M-DC-2023-WG2-PO-TUN-CYP-01,Davis Cup WG2 PO: TUN vs CYP,,4,D,20230203,4,144949,,,...,,,,,,,894.0,15.0,285.0,184.0


In [4]:
# Initialize an empty dictionary to store player profiles
player_profiles = {}

# Iterate through each row in the DataFrame
for index, row in merged_df.iterrows():
    # Extract player IDs and match details from the row
    winner_id = row['winner_id']
    loser_id = row['loser_id']
    match_details = {
        'tournament': row['tourney_name'],
        'surface': row['surface'],
        'score': row['score'],
        'minutes': row['minutes'],
        'winner': winner_id,
        'loser': loser_id
        # Add more match details as needed
    }

    # Calculate total aces and double faults, handling missing values
    total_w_aces = row['w_ace'] if not pd.isnull(row['w_ace']) else 0
    total_w_df = row['w_df'] if not pd.isnull(row['w_df']) else 0
    total_l_aces = row['l_ace'] if not pd.isnull(row['l_ace']) else 0
    total_l_df = row['l_df'] if not pd.isnull(row['l_df']) else 0

    # Update winner's profile or create a new profile if it doesn't exist
    if winner_id in player_profiles:
        player_profiles[winner_id]['matches'].append(match_details)
        player_profiles[winner_id]['total_w_aces'] += total_w_aces
        player_profiles[winner_id]['total_w_df'] += total_w_df
    else:
        player_profiles[winner_id] = {
            'name': row['winner_name'],
            #'hand': row['winner_hand'],
            'height': row['winner_ht'],
            #'country': row['winner_ioc'],
            'age': row['winner_age'],
            'matches': [match_details],
            'total_w_aces': total_w_aces,
            'total_w_df': total_w_df,
            'total_l_aces': 0,
            'total_l_df': 0
        }

    # Update loser's profile or create a new profile if it doesn't exist
    if loser_id in player_profiles:
        player_profiles[loser_id]['matches'].append(match_details)
        player_profiles[loser_id]['total_l_aces'] += total_l_aces
        player_profiles[loser_id]['total_l_df'] += total_l_df
    else:
        player_profiles[loser_id] = {
            'name': row['loser_name'],
            #'hand': row['loser_hand'],
            'height': row['loser_ht'],
            #'country': row['loser_ioc'],
            'age': row['loser_age'],
            'matches': [match_details],
            'total_w_aces': 0,
            'total_w_df': 0,
            'total_l_aces': total_l_aces,
            'total_l_df': total_l_df
        }

# Example: Print profiles of the first 5 players
"""for player_id, profile in list(player_profiles.items())[:5]:
    print(f"Player ID: {player_id}")
    print(f"Name: {profile['name']}")
    print(f"Hand: {profile['hand']}")
    print(f"Height: {profile['height']}")
    print(f"Country: {profile['country']}")
    print(f"Age: {profile['age']}")
    print(f"Total Winning Aces: {profile['total_w_aces']}")
    print(f"Total Winning Double Faults: {profile['total_w_df']}")
    print(f"Total Losing Aces: {profile['total_l_aces']}")
    print(f"Total Losing Double Faults: {profile['total_l_df']}")
    print("Matches:")
    for match in profile['matches']:
        print(f"\tTournament: {match['tournament']}, Surface: {match['surface']}, Score: {match['score']}, Minutes: {match['minutes']}")
    print()"""
    
list(player_profiles.items())[:5]

[(104053,
  {'name': 'Andy Roddick',
   'height': 188.0,
   'age': 27.3,
   'matches': [{'tournament': 'Brisbane',
     'surface': 'Hard',
     'score': '7-6(5) 6-2',
     'minutes': 84.0,
     'winner': 104053,
     'loser': 103429},
    {'tournament': 'Brisbane',
     'surface': 'Hard',
     'score': '7-6(0) 6-3',
     'minutes': 81.0,
     'winner': 104053,
     'loser': 104958},
    {'tournament': 'Brisbane',
     'surface': 'Hard',
     'score': '6-3 7-6(5)',
     'minutes': 85.0,
     'winner': 104053,
     'loser': 104755},
    {'tournament': 'Brisbane',
     'surface': 'Hard',
     'score': '1-6 6-3 6-4',
     'minutes': 128.0,
     'winner': 104053,
     'loser': 104607},
    {'tournament': 'Brisbane',
     'surface': 'Hard',
     'score': '7-6(2) 7-6(7)',
     'minutes': 125.0,
     'winner': 104053,
     'loser': 103285},
    {'tournament': 'Australian Open',
     'surface': 'Hard',
     'score': '6-1 6-4 6-4',
     'minutes': 124.0,
     'winner': 104053,
     'loser': 1052

In [5]:
X = []
Y = []
for player_id, attributes in list(player_profiles.items()):
    matchesList = attributes['matches']
    for match in matchesList:
        x_components = []
        # Player 1
        x_components.append(player_id)
        #x_components.append(attributes['hand'])
        x_components.append(attributes['height'])
        #x_components.append(attributes['country'])
        x_components.append(attributes['age'])
        
        # Player 2
        if match['winner'] == player_id:
            player2_id = match['loser']
        else:
            player2_id = match['winner']
        player2_profile = player_profiles[player2_id]
        x_components.append(player2_id)
        #x_components.append(player2_profile['hand'])
        x_components.append(player2_profile['height'])
        #x_components.append(player2_profile['country'])
        x_components.append(player2_profile['age'])
        
        X.append(x_components)
        Y.append(match['winner'])
    

In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [7]:
#encoder = OneHotEncoder(drop='first', sparse=False)
#X_encoded = encoder.fit_transform(X_train)

logModel = LogisticRegression()
logModel.fit(X_train, Y_train)

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
#X_test_encoded = encoder.transform(X_test)
Y_pred = logModel.predict(X_test)
Y_pred