In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
import requests
from bs4 import BeautifulSoup

In [None]:
# df10 = pd.read_csv("ATP_Matches/atp_matches_2010.csv")
# df11 = pd.read_csv("ATP_Matches/atp_matches_2011.csv")

# dfs = {}

# # Loop through years from 2010 to 2023
# for year in range(2010, 2024):
#     filename = f"ATP_Matches/atp_matches_{year}.csv"
#     dfs[f'df{year}'] = pd.read_csv(filename)


# Initialize an empty list to store DataFrames
dfs = []

# Loop through years from 2010 to 2023
for year in range(2018, 2024):
    filename = f"ATP_Matches/atp_matches_{year}.csv"
    df = pd.read_csv(filename)
    dfs.append(df)

dfs

In [None]:
# Load the merged DataFrame
merged_df = pd.concat(dfs, ignore_index=True)
# merged_df = merged_df.dropna()
merged_df

In [None]:
# Dictionary of all players
player_profiles = {}

for index, row in merged_df.iterrows():
    winner_id = row['winner_id']
    loser_id = row['loser_id']
    match_details = {
        'tournament': row['tourney_name'],
        'surface': row['surface'],
        'score': row['score'],
        'minutes': row['minutes'],
        'winner': winner_id,
        'loser': loser_id
        # Add more match details as needed
    }

    total_w_aces = row['w_ace'] if not pd.isnull(row['w_ace']) else 0
    total_w_df = row['w_df'] if not pd.isnull(row['w_df']) else 0
    total_l_aces = row['l_ace'] if not pd.isnull(row['l_ace']) else 0
    total_l_df = row['l_df'] if not pd.isnull(row['l_df']) else 0

    if winner_id in player_profiles:
        player_profiles[winner_id]['matches'].append(match_details)
        player_profiles[winner_id]['total_w_aces'] += total_w_aces
        player_profiles[winner_id]['total_w_df'] += total_w_df
    else:
        player_profiles[winner_id] = {
            'name': row['winner_name'],
            #'hand': row['winner_hand'],
            'height': row['winner_ht'],
            #'country': row['winner_ioc'],
            'age': row['winner_age'],
            'matches': [match_details],
            'total_w_aces': total_w_aces,
            'total_w_df': total_w_df,
            'total_l_aces': 0,
            'total_l_df': 0,
            'rank': -1
        }

    if loser_id in player_profiles:
        player_profiles[loser_id]['matches'].append(match_details)
        player_profiles[loser_id]['total_l_aces'] += total_l_aces
        player_profiles[loser_id]['total_l_df'] += total_l_df
    else:
        player_profiles[loser_id] = {
            'name': row['loser_name'],
            #'hand': row['loser_hand'],
            'height': row['loser_ht'],
            #'country': row['loser_ioc'],
            'age': row['loser_age'],
            'matches': [match_details],
            'total_w_aces': 0,
            'total_w_df': 0,
            'total_l_aces': total_l_aces,
            'total_l_df': total_l_df,
            'rank': -1
        }

for player_id, profile in player_profiles.items(:5):
    print(profile['name'])

In [None]:
# Now to add the rankings
def get_player_ranking(player_name):
    url = 'https://www.atptour.com/en/rankings/singles'
    
    response = requests.get(url)
    
    if response.status_code == 200:
        
        soup = BeautifulSoup(response.content, 'html.parser')

        
        rankings_table = soup.find('table', class_='mega-table')
        
        # Check if the rankings table was found
        if rankings_table:
            rows = rankings_table.find_all('tr')[1:]

            
            for row in rows:
                # Extract player name and ranking from the row
                player_cell = row.find('td', class_='player-cell')
                rank_cell = row.find('td', class_='rank-cell')
                
                if player_cell and rank_cell:
                    player_name_atp = player_cell.text.strip()
                    player_rank = rank_cell.text.strip()
                    
                    # Check if the player name matches
                    if player_name_atp == player_name:
                        return player_rank
                else:
                    print("Error: Unable to extract player or rank cell.")

            return "Not Ranked"
        else:
            print("Error: Unable to find rankings table.")
            return None
    
    else:
        print('Failed to retrieve ATP rankings:', response.status_code)
        return None


# Iterate through each player in player_profiles
for player_id, profile in player_profiles.items():
    # Get the player's name
    player_name = profile['name']
    print(player_name)
    
    # Get the player's ATP ranking
    ranking = get_player_ranking(player_name)
    
    # Update the player's profile with the ATP ranking
    profile['rank'] = ranking

# Example: Print profiles of the first 5 players with their ATP ranking
for player_id, profile in list(player_profiles.items())[:5]:
    print(f"Player Name: {profile['name']}")
    print(f"ATP Ranking: {profile['rank']}")
    print()

In [None]:
X = []
Y = []
for player_id, attributes in list(player_profiles.items()):
    matchesList = attributes['matches']
    for match in matchesList:
        x_components = []
        # Player 1
        x_components.append(player_id)
        #x_components.append(attributes['hand'])
        x_components.append(attributes['height'])
        #x_components.append(attributes['country'])
        x_components.append(attributes['age'])
        
        # Player 2
        if match['winner'] == player_id:
            player2_id = match['loser']
        else:
            player2_id = match['winner']
        player2_profile = player_profiles[player2_id]
        x_components.append(player2_id)
        #x_components.append(player2_profile['hand'])
        x_components.append(player2_profile['height'])
        #x_components.append(player2_profile['country'])
        x_components.append(player2_profile['age'])
        
        X.append(x_components)
        Y.append(match['winner'])
    

In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [7]:
#encoder = OneHotEncoder(drop='first', sparse=False)
#X_encoded = encoder.fit_transform(X_train)

logModel = LogisticRegression()
logModel.fit(X_train, Y_train)

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
#X_test_encoded = encoder.transform(X_test)
Y_pred = logModel.predict(X_test)
Y_pred