In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
import requests
from bs4 import BeautifulSoup

In [2]:
dfs = []

# Loop through years from 2021 to 2023
for year in range(2021, 2024):
    filename = f"ATP_Matches/atp_matches_{year}.csv"
    df = pd.read_csv(filename)
    dfs.append(df)

dfs

[     tourney_id    tourney_name surface  draw_size tourney_level  \
 0     2021-0096  Tokyo Olympics    Hard         64             A   
 1     2021-0096  Tokyo Olympics    Hard         64             A   
 2     2021-0096  Tokyo Olympics    Hard         64             A   
 3     2021-0096  Tokyo Olympics    Hard         64             A   
 4     2021-0096  Tokyo Olympics    Hard         64             A   
 ...         ...             ...     ...        ...           ...   
 2728  2021-9210       Laver Cup    Hard          8             A   
 2729  2021-9210       Laver Cup    Hard          8             A   
 2730  2021-9210       Laver Cup    Hard          8             A   
 2731  2021-9210       Laver Cup    Hard          8             A   
 2732  2021-9210       Laver Cup    Hard          8             A   
 
       tourney_date  match_num  winner_id  winner_seed winner_entry  ...  \
 0         20210724        237     126207          NaN          NaN  ...   
 1         2021072

In [3]:
# Load the merged DataFrame
merged_df = pd.concat(dfs, ignore_index=True)
# merged_df = merged_df.dropna()
merged_df

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,2021-0096,Tokyo Olympics,Hard,64,A,20210724,237,126207,,,...,,,,,,,53.0,1228.0,71.0,996.0
1,2021-0096,Tokyo Olympics,Hard,64,A,20210724,238,105526,,,...,,,,,,,48.0,1410.0,95.0,829.0
2,2021-0096,Tokyo Olympics,Hard,64,A,20210724,239,111576,,,...,,,,,,,160.0,471.0,197.0,354.0
3,2021-0096,Tokyo Olympics,Hard,64,A,20210724,240,105357,,,...,,,,,,,44.0,1476.0,61.0,1106.0
4,2021-0096,Tokyo Olympics,Hard,64,A,20210724,241,207830,,,...,,,,,,,145.0,525.0,137.0,570.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8631,2023-M-DC-2023-WG2-PO-RSA-LUX-01,Davis Cup WG2 PO: RSA vs LUX,,4,D,20230204,5,202335,,,...,,,,,,,,,1717.0,1.0
8632,2023-M-DC-2023-WG2-PO-TUN-CYP-01,Davis Cup WG2 PO: TUN vs CYP,,4,D,20230203,1,117365,,,...,,,,,,,990.0,11.0,279.0,190.0
8633,2023-M-DC-2023-WG2-PO-TUN-CYP-01,Davis Cup WG2 PO: TUN vs CYP,,4,D,20230203,2,121411,,,...,,,,,,,364.0,131.0,894.0,15.0
8634,2023-M-DC-2023-WG2-PO-TUN-CYP-01,Davis Cup WG2 PO: TUN vs CYP,,4,D,20230203,4,144949,,,...,,,,,,,894.0,15.0,285.0,184.0


In [4]:
# Dictionary of all players
player_profiles = {}

for index, row in merged_df.iterrows():
    winner_id = row['winner_id']
    loser_id = row['loser_id']
    match_details = {
        'tournament': row['tourney_name'],
        'surface': row['surface'],
        'score': row['score'],
        'minutes': row['minutes'],
        'winner': winner_id,
        'loser': loser_id
        # Add more match details as needed
    }

    total_w_aces = row['w_ace'] if not pd.isnull(row['w_ace']) else 0
    total_w_df = row['w_df'] if not pd.isnull(row['w_df']) else 0
    total_l_aces = row['l_ace'] if not pd.isnull(row['l_ace']) else 0
    total_l_df = row['l_df'] if not pd.isnull(row['l_df']) else 0

    if winner_id in player_profiles:
        player_profiles[winner_id]['matches'].append(match_details)
        player_profiles[winner_id]['total_w_aces'] += total_w_aces
        player_profiles[winner_id]['total_w_df'] += total_w_df
    else:
        player_profiles[winner_id] = {
            'name': row['winner_name'],
            'hand': 1 if row['winner_hand'] == 'R' else 0,
            'height': row['winner_ht'],
            'country': row['winner_ioc'],
            'age': row['winner_age'],
            'matches': [match_details],
            'total_w_aces': total_w_aces,
            'total_w_df': total_w_df,
            'total_l_aces': 0,
            'total_l_df': 0,
            'rank': -1
        }

    if loser_id in player_profiles:
        player_profiles[loser_id]['matches'].append(match_details)
        player_profiles[loser_id]['total_l_aces'] += total_l_aces
        player_profiles[loser_id]['total_l_df'] += total_l_df
    else:
        player_profiles[loser_id] = {
            'name': row['loser_name'],
            'hand': 1 if row['loser_hand'] == 'R' else 0,
            'height': row['loser_ht'],
            'country': row['loser_ioc'],
            'age': row['loser_age'],
            'matches': [match_details],
            'total_w_aces': 0,
            'total_w_df': 0,
            'total_l_aces': total_l_aces,
            'total_l_df': total_l_df,
            'rank': -1
        }

first_5_players = list(player_profiles.items())[:5]

for player_id, profile in first_5_players:
    print(profile['name'])

Frances Tiafoe
Soon Woo Kwon
Jan Lennard Struff
Thiago Monteiro
Sumit Nagal


In [5]:
# def get_player_ranking(player_name):
#     url = 'https://www.atptour.com/en/rankings/singles'
    
#     # Send a GET request to the ATP rankings page
#     response = requests.get(url)
    
#     # Check if the request was successful
#     if response.status_code == 200:
#         # Parse the HTML content of the page
#         soup = BeautifulSoup(response.content, 'html.parser')
        
#         # Find the table containing the rankings
#         rankings_table = soup.find('table', class_='mobile-table mega-table non-live')
        
#         # Check if the rankings table was found
#         if rankings_table:
#             # Find all rows in the table (skipping the header row)
#             rows = rankings_table.find_all('tr')[1:]
            
#             # Iterate over each row
#             for row in rows:
#                 # Extract player name and ranking from the row
#                 player_name_atp = row.find('td', class_='player-cell').find('a').text.strip()
#                 player_rank = row.find('td', class_='rank-cell').text.strip()
                
#                 # Check if the player name matches
#                 if player_name_atp == player_name:
#                     return player_rank
            
#             # If the player's name is not found in the rankings
#             return "Not Ranked"
#         else:
#             print("Error: Unable to find rankings table.")
#             return None
    
#     else:
#         print('Failed to retrieve ATP rankings:', response.status_code)
#         return None


# for player_id, profile in player_profiles.items():
#     player_name = profile['name']
#     print(player_name)
    
#     ranking = get_player_ranking(player_name)
    
#     profile['rank'] = ranking

# for player_id, profile in list(player_profiles.items())[:5]:
#     print(f"Player Name: {profile['name']}")
#     print(f"ATP Ranking: {profile['rank']}")
#     print()

def get_player_ranking(player_name):
    url = 'https://live-tennis.eu/en/atp-live-ranking'
    
    response = requests.get(url)
    
    if response.status_code == 200:
        
        soup = BeautifulSoup(response.content, 'html.parser')
        
        rankings_table = soup.find('table', class_='mobile-table mega-table non-live')
        
        if rankings_table:
            
            rows = rankings_table.find_all('tr')[1:]
            
            for row in rows:
                player_name_atp = row.find('td', class_='player-cell').find('a').text.strip()
                player_rank = row.find('td', class_='rank-cell').text.strip()
                
                if player_name_atp == player_name:
                    return player_rank
            
            return "Not Ranked"
        else:
            print("Error: Unable to find rankings table.")
            return None
    
    else:
        print('Failed to retrieve ATP rankings:', response.status_code)
        return None


In [6]:
X = []
Y = []
for player_id, attributes in list(player_profiles.items()):
    matchesList = attributes['matches']
    for match in matchesList:
        x_components = []
        
        if match['winner'] == player_id:
            player2_id = match['loser']
        else:
            player2_id = match['winner']
        
        player2_profile = player_profiles[player2_id]
            
        if np.isnan(attributes['hand']) or np.isnan(attributes['height']) or np.isnan(attributes['age']) or np.isnan(player2_profile['hand']) or np.isnan(player2_profile['height']) or np.isnan(player2_profile['age']):
            continue
        else:
            # Player 1
            x_components.append(player_id)
            x_components.append(attributes['hand'])
            x_components.append(attributes['height'])
            #x_components.append(attributes['country'])
            x_components.append(attributes['age'])
        
            # Player 2
            x_components.append(player2_id)
            x_components.append(player2_profile['hand'])
            x_components.append(player2_profile['height'])
            #x_components.append(player2_profile['country'])
            x_components.append(player2_profile['age'])
        
            X.append(x_components)
            Y.append(match['winner'])
    

    
print("X:")
print(X)
print("Y:")
print(Y)
"""X = np.array(X)
Y = np.array(Y)"""

X:
[[126207, 1, 188.0, 23.4, 126952, 1, 180.0, 23.6], [126207, 1, 188.0, 23.4, 126774, 1, 193.0, 22.9], [126207, 1, 188.0, 23.4, 105379, 1, 181.0, 31.9], [126207, 1, 188.0, 23.4, 104678, 1, 193.0, 35.3], [126207, 1, 188.0, 23.4, 133430, 0, 185.0, 22.1], [126207, 1, 188.0, 23.4, 105583, 1, 180.0, 30.8], [126207, 1, 188.0, 23.4, 126774, 1, 193.0, 22.9], [126207, 1, 188.0, 23.4, 106043, 1, 170.0, 28.9], [126207, 1, 188.0, 23.4, 206173, 1, 188.0, 19.8], [126207, 1, 188.0, 23.4, 100644, 1, 198.0, 24.2], [126207, 1, 188.0, 23.4, 106218, 1, 180.0, 27.9], [126207, 1, 188.0, 23.4, 105882, 1, 185.0, 29.4], [126207, 1, 188.0, 23.4, 105554, 1, 175.0, 31.0], [126207, 1, 188.0, 23.4, 105583, 1, 180.0, 30.8], [126207, 1, 188.0, 23.4, 106421, 1, 198.0, 25.4], [126207, 1, 188.0, 23.4, 105332, 1, 196.0, 32.1], [126207, 1, 188.0, 23.4, 200624, 1, 196.0, 20.8], [126207, 1, 188.0, 23.4, 128034, 1, 196.0, 24.4], [126207, 1, 188.0, 23.4, 202385, 1, 188.0, 20.7], [126207, 1, 188.0, 23.4, 106415, 0, 170.0, 25.

'X = np.array(X)\nY = np.array(Y)'

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [8]:
logModel = LogisticRegression()
logModel.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
Y_pred = logModel.predict(X_test)
Y_pred

array([206173, 206173, 206173, ..., 206173, 206173, 206173])

In [10]:
accuracyScore = accuracy_score(Y_test, Y_pred)
accuracyScore * 100

1.831386169876855

In [11]:
for y_element in Y_pred:
    print(y_element)

206173
206173
206173
206173
206173
206173
207989
206173
207989
206173
206173
206173
206173
206173
206173
206173
206173
206173
206173
207989
206173
206173
206173
206173
207989
206173
206173
206173
206173
206173
206173
206173
206173
206173
206173
206173
206173
206173
206173
206173
206173
206173
206173
206173
207989
206173
206173
206173
206173
206173
207989
206173
207989
206173
206173
206173
206173
206173
206173
206173
206173
206173
207989
207989
207989
206173
206173
206173
206173
206173
206173
207989
206173
206173
206173
206173
206173
206173
206173
207989
206173
206173
206173
207989
206173
206173
206173
206173
206173
206173
206173
206173
206173
206173
206173
206173
206173
206173
206173
206173
207989
206173
207989
206173
206173
206173
207989
206173
206173
206173
207989
206173
206173
206173
206173
206173
206173
207989
206173
206173
206173
206173
206173
207989
206173
206173
206173
206173
206173
206173
207989
206173
207989
206173
206173
207989
206173
206173
207989
206173
206173
206173
206173