In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from tabulate import tabulate

In [2]:
matches = pd.read_csv('../data/raw/atp_matches_till_2022.csv')

In [3]:
df = matches[['tourney_id', 'surface', 'draw_size', 'tourney_level', 'tourney_date', 'match_num', 
        'winner_id', 'winner_seed', 'winner_entry', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
        'loser_id', 'loser_seed', 'loser_entry', 'loser_hand', 'loser_ht', 'loser_ioc', 'loser_age', 
        'best_of', 'round',
        'winner_rank', 'loser_rank']]

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Laden der Daten
matches = pd.read_csv('../data/raw/atp_matches_till_2022.csv')

# Korrektur der initialen Datenbereinigung und -auswahl, um die IOC-Spalten einzubeziehen
relevant_columns_including_ioc = ['surface', 'tourney_level', 'winner_id', 'loser_id', 
                                  'winner_rank', 'loser_rank', 'winner_rank_points', 'loser_rank_points', 
                                  'winner_ioc', 'loser_ioc']
matches_cleaned_including_ioc = matches[relevant_columns_including_ioc].dropna()

# Funktion zur Symmetrisierung des Datensatzes
def symmetrize_dataset_corrected(data):
    sym_data = data.copy()
    sym_data['rank_difference'] = sym_data['winner_rank'] - sym_data['loser_rank']
    sym_data['points_difference'] = sym_data['winner_rank_points'] - sym_data['loser_rank_points']
    sym_data['player_wins'] = 1

    sym_data_opponent = sym_data.copy()
    sym_data_opponent[['winner_rank', 'loser_rank']] = sym_data_opponent[['loser_rank', 'winner_rank']]
    sym_data_opponent[['winner_rank_points', 'loser_rank_points']] = sym_data_opponent[['loser_rank_points', 'winner_rank_points']]
    sym_data_opponent[['winner_ioc', 'loser_ioc']] = sym_data_opponent[['loser_ioc', 'winner_ioc']]
    sym_data_opponent['rank_difference'] = -sym_data_opponent['rank_difference']
    sym_data_opponent['points_difference'] = -sym_data_opponent['points_difference']
    sym_data_opponent['player_wins'] = 0

    combined_sym_data = pd.concat([sym_data, sym_data_opponent], ignore_index=True)
    return combined_sym_data

# Anwendung der Funktion
symmetric_matches_corrected_including_ioc = symmetrize_dataset_corrected(matches_cleaned_including_ioc)

# Auswahl der Features und der Zielvariable
X_sym_corrected_including_ioc = symmetric_matches_corrected_including_ioc.drop(['winner_id', 'loser_id', 'player_wins'], axis=1)
y_sym_corrected_including_ioc = symmetric_matches_corrected_including_ioc['player_wins']

# Umwandlung von kategorischen Daten in numerische
X_sym_corrected_including_ioc = pd.get_dummies(X_sym_corrected_including_ioc, columns=['surface', 'tourney_level', 'winner_ioc', 'loser_ioc'])

# Aufteilung der Daten in Trainings- und Testsets
X_train_sym_corr_inc_ioc, X_test_sym_corr_inc_ioc, y_train_sym_corr_inc_ioc, y_test_sym_corr_inc_ioc = train_test_split(X_sym_corrected_including_ioc, y_sym_corrected_including_ioc, test_size=0.3, random_state=42)

# Skalierung der Features
scaler_sym_corr_inc_ioc = StandardScaler()
X_train_scaled_sym_corr_inc_ioc = scaler_sym_corr_inc_ioc.fit_transform(X_train_sym_corr_inc_ioc)
X_test_scaled_sym_corr_inc_ioc = scaler_sym_corr_inc_ioc.transform(X_test_sym_corr_inc_ioc)

# Modellerstellung und Training
model_sym_corr_inc_ioc = RandomForestClassifier(random_state=42)
model_sym_corr_inc_ioc.fit(X_train_scaled_sym_corr_inc_ioc, y_train_sym_corr_inc_ioc)

# Modellbewertung
accuracy_sym_corr_inc_ioc = model_sym_corr_inc_ioc.score(X_test_scaled_sym_corr_inc_ioc, y_test_sym_corr_inc_ioc)


In [6]:
print(f"Accuracy Trainingsdaten: {model_sym_corr_inc_ioc.score(X_train_sym_corr_inc_ioc, y_train_sym_corr_inc_ioc)}")
print(f"Accuracy Testdaten: {model_sym_corr_inc_ioc.score(X_test_sym_corr_inc_ioc, y_test_sym_corr_inc_ioc)}")



Accuracy Trainingsdaten: 0.6492097859664607




Accuracy Testdaten: 0.6486147311516556


In [5]:
# Ermittlung der Feature-Importance-Werte
feature_importances = model_sym_corr_inc_ioc.feature_importances_

# Zuordnung der Feature-Namen zu ihren Importance-Werten
features = X_sym_corrected_including_ioc.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})

# Sortierung der Features nach ihrer Bedeutung
importance_df = importance_df.sort_values(by='Importance', ascending=False)

importance_df

Unnamed: 0,Feature,Importance
5,points_difference,1.265456e-01
4,rank_difference,1.250532e-01
3,loser_rank_points,1.062899e-01
2,winner_rank_points,1.054560e-01
0,winner_rank,1.005678e-01
...,...,...
134,loser_ioc_AZE,1.793270e-07
237,loser_ioc_ZAM,1.498182e-07
125,winner_ioc_ZAM,9.251935e-08
138,loser_ioc_BEN,0.000000e+00
