In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
import requests
from bs4 import BeautifulSoup

In [2]:

dfs = []

# Loop through years from 2010 to 2023
for year in range(2018, 2024):
    filename = f"ATP_Matches/atp_matches_{year}.csv"
    df = pd.read_csv(filename)
    dfs.append(df)

dfs

[     tourney_id tourney_name surface  draw_size tourney_level  tourney_date  \
 0     2018-M020     Brisbane    Hard         32             A      20180101   
 1     2018-M020     Brisbane    Hard         32             A      20180101   
 2     2018-M020     Brisbane    Hard         32             A      20180101   
 3     2018-M020     Brisbane    Hard         32             A      20180101   
 4     2018-M020     Brisbane    Hard         32             A      20180101   
 ...         ...          ...     ...        ...           ...           ...   
 2892  2018-9210    Laver Cup    Hard          8             A      20180921   
 2893  2018-9210    Laver Cup    Hard          8             A      20180921   
 2894  2018-9210    Laver Cup    Hard          8             A      20180921   
 2895  2018-9210    Laver Cup    Hard          8             A      20180921   
 2896  2018-9210    Laver Cup    Hard          8             A      20180921   
 
       match_num  winner_id  winner_se

In [3]:
# Load the merged DataFrame
merged_df = pd.concat(dfs, ignore_index=True)
# merged_df = merged_df.dropna()
merged_df

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,2018-M020,Brisbane,Hard,32,A,20180101,271,105992,,,...,47.0,33.0,19.0,14.0,1.0,4.0,47.0,1010.0,52.0,909.0
1,2018-M020,Brisbane,Hard,32,A,20180101,272,111577,,,...,41.0,25.0,7.0,9.0,7.0,11.0,54.0,890.0,94.0,593.0
2,2018-M020,Brisbane,Hard,32,A,20180101,273,104797,,,...,53.0,37.0,29.0,15.0,10.0,16.0,63.0,809.0,30.0,1391.0
3,2018-M020,Brisbane,Hard,32,A,20180101,275,200282,,WC,...,43.0,33.0,17.0,11.0,4.0,6.0,208.0,245.0,44.0,1055.0
4,2018-M020,Brisbane,Hard,32,A,20180101,276,111581,,Q,...,35.0,28.0,5.0,9.0,0.0,2.0,175.0,299.0,68.0,755.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15796,2023-M-DC-2023-WG2-PO-RSA-LUX-01,Davis Cup WG2 PO: RSA vs LUX,,4,D,20230204,5,202335,,,...,,,,,,,,,1717.0,1.0
15797,2023-M-DC-2023-WG2-PO-TUN-CYP-01,Davis Cup WG2 PO: TUN vs CYP,,4,D,20230203,1,117365,,,...,,,,,,,990.0,11.0,279.0,190.0
15798,2023-M-DC-2023-WG2-PO-TUN-CYP-01,Davis Cup WG2 PO: TUN vs CYP,,4,D,20230203,2,121411,,,...,,,,,,,364.0,131.0,894.0,15.0
15799,2023-M-DC-2023-WG2-PO-TUN-CYP-01,Davis Cup WG2 PO: TUN vs CYP,,4,D,20230203,4,144949,,,...,,,,,,,894.0,15.0,285.0,184.0


In [4]:
# Dictionary of all players
player_profiles = {}

for index, row in merged_df.iterrows():
    winner_id = row['winner_id']
    loser_id = row['loser_id']
    match_details = {
        'tournament': row['tourney_name'],
        'surface': row['surface'],
        'score': row['score'],
        'minutes': row['minutes'],
        'winner': winner_id,
        'loser': loser_id
        # Add more match details as needed
    }

    total_w_aces = row['w_ace'] if not pd.isnull(row['w_ace']) else 0
    total_w_df = row['w_df'] if not pd.isnull(row['w_df']) else 0
    total_l_aces = row['l_ace'] if not pd.isnull(row['l_ace']) else 0
    total_l_df = row['l_df'] if not pd.isnull(row['l_df']) else 0

    if winner_id in player_profiles:
        player_profiles[winner_id]['matches'].append(match_details)
        player_profiles[winner_id]['total_w_aces'] += total_w_aces
        player_profiles[winner_id]['total_w_df'] += total_w_df
    else:
        player_profiles[winner_id] = {
            'name': row['winner_name'],
            'hand': 1 if row['winner_hand'] == 'R' else 0,
            'height': row['winner_ht'],
            'country': row['winner_ioc'],
            'age': row['winner_age'],
            'matches': [match_details],
            'total_w_aces': total_w_aces,
            'total_w_df': total_w_df,
            'total_l_aces': 0,
            'total_l_df': 0,
            'rank': -1
        }

    if loser_id in player_profiles:
        player_profiles[loser_id]['matches'].append(match_details)
        player_profiles[loser_id]['total_l_aces'] += total_l_aces
        player_profiles[loser_id]['total_l_df'] += total_l_df
    else:
        player_profiles[loser_id] = {
            'name': row['loser_name'],
            'hand': 1 if row['loser_hand'] == 'R' else 0,
            'height': row['loser_ht'],
            'country': row['loser_ioc'],
            'age': row['loser_age'],
            'matches': [match_details],
            'total_w_aces': 0,
            'total_w_df': 0,
            'total_l_aces': total_l_aces,
            'total_l_df': total_l_df,
            'rank': -1
        }

first_5_players = list(player_profiles.items())[:5]

for player_id, profile in first_5_players:
    print(profile['name'])

Ryan Harrison
Leonardo Mayer
Jared Donaldson
Jordan Thompson
Denis Istomin


In [5]:
def get_player_ranking(player_name):
    url = 'https://www.atptour.com/en/rankings/singles'
    
    response = requests.get(url)
    
    if response.status_code == 200:
        
        soup = BeautifulSoup(response.content, 'html.parser')

        
        rankings_table = soup.find('table', class_='mega-table')
        
        if rankings_table:
            rows = rankings_table.find_all('tr')[1:]

            
            for row in rows:
                player_cell = row.find('td', class_='player-cell')
                rank_cell = row.find('td', class_='rank-cell')
                
                if player_cell and rank_cell:
                    player_name_atp = player_cell.text.strip()
                    player_rank = rank_cell.text.strip()
                    
                    if player_name_atp == player_name:
                        return player_rank
                else:
                    print("Error: Unable to extract player or rank cell.")

            return "Not Ranked"
        else:
            print("Error: Unable to find rankings table.")
            return None
    
    else:
        print('Failed to retrieve ATP rankings:', response.status_code)
        return None


"""for player_id, profile in player_profiles.items():
    player_name = profile['name']
    print(player_name)
    
    ranking = get_player_ranking(player_name)
    
    profile['rank'] = ranking"""

"""for player_id, profile in list(player_profiles.items())[:5]:
    print(f"Player Name: {profile['name']}")
    print(f"ATP Ranking: {profile['rank']}")
    print()"""

'for player_id, profile in list(player_profiles.items())[:5]:\n    print(f"Player Name: {profile[\'name\']}")\n    print(f"ATP Ranking: {profile[\'rank\']}")\n    print()'

In [6]:
X = []
Y = []
for player_id, attributes in list(player_profiles.items()):
    matchesList = attributes['matches']
    for match in matchesList:
        x_components = []
        
        if match['winner'] == player_id:
            player2_id = match['loser']
        else:
            player2_id = match['winner']
        
        player2_profile = player_profiles[player2_id]
            
        if np.isnan(attributes['hand']) or np.isnan(attributes['height']) or np.isnan(attributes['age']) or np.isnan(player2_profile['hand']) or np.isnan(player2_profile['height']) or np.isnan(player2_profile['age']):
            continue
        else:
            # Player 1
            x_components.append(player_id)
            x_components.append(attributes['hand'])
            x_components.append(attributes['height'])
            #x_components.append(attributes['country'])
            x_components.append(attributes['age'])
        
            # Player 2
            x_components.append(player2_id)
            x_components.append(player2_profile['hand'])
            x_components.append(player2_profile['height'])
            #x_components.append(player2_profile['country'])
            x_components.append(player2_profile['age'])
        
            X.append(x_components)
            Y.append(match['winner'])
    

    
print("X:")
print(X)
print("Y:")
print(Y)
"""X = np.array(X)
Y = np.array(Y)"""

X:
[[105992, 1, 183.0, 25.6, 104919, 1, 188.0, 30.6], [105992, 1, 183.0, 25.6, 105870, 1, 193.0, 26.1], [105992, 1, 183.0, 25.6, 104797, 1, 188.0, 31.3], [105992, 1, 183.0, 25.6, 200282, 1, 183.0, 18.8], [105992, 1, 183.0, 25.6, 106401, 1, 193.0, 22.6], [105992, 1, 183.0, 25.6, 104534, 1, 175.0, 32.7], [105992, 1, 183.0, 25.6, 104655, 1, 180.0, 32.0], [105992, 1, 183.0, 25.6, 105227, 1, 198.0, 29.2], [105992, 1, 183.0, 25.6, 105385, 0, 183.0, 28.4], [105992, 1, 183.0, 25.6, 103333, 1, 208.0, 38.8], [105992, 1, 183.0, 25.6, 124187, 1, 211.0, 20.4], [105992, 1, 183.0, 25.6, 104545, 1, 206.0, 32.7], [105992, 1, 183.0, 25.6, 106043, 1, 170.0, 25.3], [105992, 1, 183.0, 25.6, 100644, 1, 198.0, 20.7], [105992, 1, 183.0, 25.6, 105643, 0, 190.0, 27.2], [105992, 1, 183.0, 25.6, 105311, 1, 185.0, 28.7], [105992, 1, 183.0, 25.6, 105074, 0, 183.0, 29.9], [105992, 1, 183.0, 25.6, 200175, 1, 183.0, 18.5], [105992, 1, 183.0, 25.6, 126203, 1, 193.0, 20.3], [105992, 1, 183.0, 25.6, 104198, 1, 188.0, 34.

'X = np.array(X)\nY = np.array(Y)'

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [8]:
logModel = LogisticRegression()
logModel.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
Y_pred = logModel.predict(X_test)
Y_pred

array([126774, 126774, 126774, ..., 126774, 126774, 126774])

In [10]:
accuracyScore = accuracy_score(Y_test, Y_pred)
accuracyScore * 100

2.2103148024112524

In [11]:
for y_element in Y_pred:
    print(y_element)

126774
126774
126774
106421
126774
106421
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
106421
126774
126774
106421
126774
126774
126774
126774
106421
126774
126774
126774
106421
126774
106421
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
106421
126774
126774
126774
106421
126774
126774
126774
126774
126774
126774
126774
106421
126774
126774
106421
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
106421
126774
126774
126774
126774
126774
126774
126774
106421
126774
126774
126774
106421
106421
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774
126774