In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
import requests
from bs4 import BeautifulSoup

In [7]:
dfs = []

# Loop through years from 2021 to 2023
for year in range(2021, 2024):
    filename = f"ATP_Matches/atp_matches_{year}.csv"
    df = pd.read_csv(filename)
    dfs.append(df)

dfs

[     tourney_id    tourney_name surface  draw_size tourney_level  \
 0     2021-0096  Tokyo Olympics    Hard         64             A   
 1     2021-0096  Tokyo Olympics    Hard         64             A   
 2     2021-0096  Tokyo Olympics    Hard         64             A   
 3     2021-0096  Tokyo Olympics    Hard         64             A   
 4     2021-0096  Tokyo Olympics    Hard         64             A   
 ...         ...             ...     ...        ...           ...   
 2728  2021-9210       Laver Cup    Hard          8             A   
 2729  2021-9210       Laver Cup    Hard          8             A   
 2730  2021-9210       Laver Cup    Hard          8             A   
 2731  2021-9210       Laver Cup    Hard          8             A   
 2732  2021-9210       Laver Cup    Hard          8             A   
 
       tourney_date  match_num  winner_id  winner_seed winner_entry  ...  \
 0         20210724        237     126207          NaN          NaN  ...   
 1         2021072

In [8]:
# Load the merged DataFrame
merged_df = pd.concat(dfs, ignore_index=True)
# merged_df = merged_df.dropna()
merged_df

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,2021-0096,Tokyo Olympics,Hard,64,A,20210724,237,126207,,,...,,,,,,,53.0,1228.0,71.0,996.0
1,2021-0096,Tokyo Olympics,Hard,64,A,20210724,238,105526,,,...,,,,,,,48.0,1410.0,95.0,829.0
2,2021-0096,Tokyo Olympics,Hard,64,A,20210724,239,111576,,,...,,,,,,,160.0,471.0,197.0,354.0
3,2021-0096,Tokyo Olympics,Hard,64,A,20210724,240,105357,,,...,,,,,,,44.0,1476.0,61.0,1106.0
4,2021-0096,Tokyo Olympics,Hard,64,A,20210724,241,207830,,,...,,,,,,,145.0,525.0,137.0,570.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8631,2023-M-DC-2023-WG2-PO-RSA-LUX-01,Davis Cup WG2 PO: RSA vs LUX,,4,D,20230204,5,202335,,,...,,,,,,,,,1717.0,1.0
8632,2023-M-DC-2023-WG2-PO-TUN-CYP-01,Davis Cup WG2 PO: TUN vs CYP,,4,D,20230203,1,117365,,,...,,,,,,,990.0,11.0,279.0,190.0
8633,2023-M-DC-2023-WG2-PO-TUN-CYP-01,Davis Cup WG2 PO: TUN vs CYP,,4,D,20230203,2,121411,,,...,,,,,,,364.0,131.0,894.0,15.0
8634,2023-M-DC-2023-WG2-PO-TUN-CYP-01,Davis Cup WG2 PO: TUN vs CYP,,4,D,20230203,4,144949,,,...,,,,,,,894.0,15.0,285.0,184.0


In [9]:
# Dictionary of all players
player_profiles = {}

for index, row in merged_df.iterrows():
    winner_id = row['winner_id']
    loser_id = row['loser_id']
    match_details = {
        'tournament': row['tourney_name'],
        'surface': row['surface'],
        'score': row['score'],
        'minutes': row['minutes'],
        'winner': winner_id,
        'loser': loser_id
        # Add more match details as needed
    }

    total_w_aces = row['w_ace'] if not pd.isnull(row['w_ace']) else 0
    total_w_df = row['w_df'] if not pd.isnull(row['w_df']) else 0
    total_l_aces = row['l_ace'] if not pd.isnull(row['l_ace']) else 0
    total_l_df = row['l_df'] if not pd.isnull(row['l_df']) else 0

    if winner_id in player_profiles:
        player_profiles[winner_id]['matches'].append(match_details)
        player_profiles[winner_id]['total_w_aces'] += total_w_aces
        player_profiles[winner_id]['total_w_df'] += total_w_df
    else:
        player_profiles[winner_id] = {
            'name': row['winner_name'],
            'hand': 1 if row['winner_hand'] == 'R' else 0,
            'height': row['winner_ht'],
            'country': row['winner_ioc'],
            'age': row['winner_age'],
            'matches': [match_details],
            'total_w_aces': total_w_aces,
            'total_w_df': total_w_df,
            'total_l_aces': 0,
            'total_l_df': 0,
            'rank': -1
        }

    if loser_id in player_profiles:
        player_profiles[loser_id]['matches'].append(match_details)
        player_profiles[loser_id]['total_l_aces'] += total_l_aces
        player_profiles[loser_id]['total_l_df'] += total_l_df
    else:
        player_profiles[loser_id] = {
            'name': row['loser_name'],
            'hand': 1 if row['loser_hand'] == 'R' else 0,
            'height': row['loser_ht'],
            'country': row['loser_ioc'],
            'age': row['loser_age'],
            'matches': [match_details],
            'total_w_aces': 0,
            'total_w_df': 0,
            'total_l_aces': total_l_aces,
            'total_l_df': total_l_df,
            'rank': -1
        }

first_5_players = list(player_profiles.items())[:5]

for player_id, profile in first_5_players:
    print(profile['name'])

Frances Tiafoe
Soon Woo Kwon
Jan Lennard Struff
Thiago Monteiro
Sumit Nagal


In [12]:
# def get_player_ranking(player_name):
#     url = 'https://www.atptour.com/en/rankings/singles'
    
#     # Send a GET request to the ATP rankings page
#     response = requests.get(url)
    
#     # Check if the request was successful
#     if response.status_code == 200:
#         # Parse the HTML content of the page
#         soup = BeautifulSoup(response.content, 'html.parser')
        
#         # Find the table containing the rankings
#         rankings_table = soup.find('table', class_='mobile-table mega-table non-live')
        
#         # Check if the rankings table was found
#         if rankings_table:
#             # Find all rows in the table (skipping the header row)
#             rows = rankings_table.find_all('tr')[1:]
            
#             # Iterate over each row
#             for row in rows:
#                 # Extract player name and ranking from the row
#                 player_name_atp = row.find('td', class_='player-cell').find('a').text.strip()
#                 player_rank = row.find('td', class_='rank-cell').text.strip()
                
#                 # Check if the player name matches
#                 if player_name_atp == player_name:
#                     return player_rank
            
#             # If the player's name is not found in the rankings
#             return "Not Ranked"
#         else:
#             print("Error: Unable to find rankings table.")
#             return None
    
#     else:
#         print('Failed to retrieve ATP rankings:', response.status_code)
#         return None


# for player_id, profile in player_profiles.items():
#     player_name = profile['name']
#     print(player_name)
    
#     ranking = get_player_ranking(player_name)
    
#     profile['rank'] = ranking

# for player_id, profile in list(player_profiles.items())[:5]:
#     print(f"Player Name: {profile['name']}")
#     print(f"ATP Ranking: {profile['rank']}")
#     print()

def get_player_ranking(player_name):
    url = 'https://www.atptour.com/en/rankings/singles?RankRange=0-5000&Region=all&DateWeek=Current%20Week'
    
    response = requests.get(url)
    
    if response.status_code == 200:
        
        soup = BeautifulSoup(response.text, 'html.parser') 
        
        rankings_table = soup.find('table', class_='mobile-table mega-table non-live')
        
        if rankings_table:
            
            rows = rankings_table.find_all('tr')[1:]
            
            for row in rows:
                last_name = row.find('span', class_='lastName')
                if last_name is not None:
                    player_name_atp = last_name.text.strip()
                    rank = row.find('td', class_='rank bold heavy tiny-cell')
                    if rank is not None:
                        player_rank = rank.text.strip()
                        print(player_name_atp, ':', player_rank)
                
                if player_name_atp == player_name:
                    return player_rank
            
            return "Not Ranked"
        else:
            print("Error: Unable to find rankings table.")
            return None
    
    else:
        print('Failed to retrieve ATP rankings:', response.status_code)
        return None

print(get_player_ranking("A. Kim"))


N. Djokovic : 1
C. Alcaraz : 2
J. Sinner : 3
D. Medvedev : 4
A. Zverev : 5
A. Rublev : 6
H. Rune : 7
C. Ruud : 8
H. Hurkacz : 9
A. de Minaur : 10
S. Tsitsipas : 11
G. Dimitrov : 12
T. Fritz : 13
T. Paul : 14
U. Humbert : 15
K. Khachanov : 16
B. Shelton : 17
A. Bublik : 18
S. Baez : 19
A. Mannarino : 20
F. Cerundolo : 21
F. Tiafoe : 22
N. Jarry : 23
L. Musetti : 24
J. Struff : 25
T. Griekspoor : 26
J. Lehecka : 27
A. Davidovich Fokina : 28
S. Korda : 29
T. Etcheverry : 30
C. Norrie : 31
C. Eubanks : 32
B. Coric : 33
J. Thompson : 34
L. Djere : 35
F. Auger-Aliassime : 36
A. Fils : 37
M. Arnaldi : 38
R. Safiullin : 39
S. Ofner : 40
A. Karatsev : 41
J. Draper : 42
D. Evans : 43
A. Tabilo : 44
A. Popyrin : 45
M. Kecmanovic : 46
G. Monfils : 47
E. Ruusuvuori : 48
Z. Zhang : 49
D. Koepfer : 50
M. Giron : 51
D. Lajovic : 52
L. Sonego : 53
D. Altmaier : 54
F. Diaz Acosta : 55
Y. Hanfmann : 56
F. Marozsan : 57
A. Shevchenko : 58
M. Navone : 59
T. Machac : 60
N. Borges : 61
A. Murray : 62
F. Cobo

In [None]:
X = []
Y = []
for player_id, attributes in list(player_profiles.items()):
    matchesList = attributes['matches']
    for match in matchesList:
        x_components = []
        
        if match['winner'] == player_id:
            player2_id = match['loser']
        else:
            player2_id = match['winner']
        
        player2_profile = player_profiles[player2_id]
            
        if np.isnan(attributes['hand']) or np.isnan(attributes['height']) or np.isnan(attributes['age']) or np.isnan(player2_profile['hand']) or np.isnan(player2_profile['height']) or np.isnan(player2_profile['age']):
            continue
        else:
            # Player 1
            x_components.append(player_id)
            x_components.append(attributes['hand'])
            x_components.append(attributes['height'])
            #x_components.append(attributes['country'])
            x_components.append(attributes['age'])
        
            # Player 2
            x_components.append(player2_id)
            x_components.append(player2_profile['hand'])
            x_components.append(player2_profile['height'])
            #x_components.append(player2_profile['country'])
            x_components.append(player2_profile['age'])
        
            X.append(x_components)
            Y.append(match['winner'])
    

    
print("X:")
print(X)
print("Y:")
print(Y)
"""X = np.array(X)
Y = np.array(Y)"""

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [None]:
logModel = LogisticRegression()
logModel.fit(X_train, Y_train)

In [None]:
Y_pred = logModel.predict(X_test)
Y_pred

In [None]:
accuracyScore = accuracy_score(Y_test, Y_pred)
accuracyScore * 100

In [None]:
for y_element in Y_pred:
    print(y_element)