In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
# Remove the column display limit so wide DataFrames are shown with every column.
pd.set_option('display.max_columns', None)

In [None]:
def get_all_atp_matches():
    """Load every yearly ATP match CSV into a single DataFrame.

    Reads all files matching ``../Data/tennis_atp/atp_matches_????.csv``
    (relative to the current working directory) in sorted filename order and
    stacks them row-wise. Each file keeps its own row index, so index labels
    repeat across files.

    Returns:
        pd.DataFrame: the rows of all matched files concatenated together.

    Raises:
        ValueError: if no file matches the pattern. This used to surface as
            pandas' generic "No objects to concatenate" message.
    """
    pattern = "../Data/tennis_atp/atp_matches_" + "????.csv"
    paths = sorted(glob.glob(pattern))
    if not paths:
        raise ValueError(f"No ATP match files found for pattern {pattern!r}")

    # The column at position 5 is parsed as a date
    # (presumably the tournament date — TODO confirm against the CSV header).
    frames = [pd.read_csv(path, parse_dates=[5]) for path in paths]
    return pd.concat(frames)

def plot_nans(df, get_null_cols=False):
    """Report and plot the number of missing values per column.

    Prints the frame's shape and the shape of the per-column NaN counts
    (restricted to columns with at least one NaN), then draws a horizontal
    seaborn bar chart of those counts.

    Args:
        df: DataFrame to inspect.
        get_null_cols: when truthy, return the Series of NaN counts.

    Returns:
        The Series of non-zero NaN counts indexed by column name when
        ``get_null_cols`` is truthy, otherwise ``None``.
    """
    print("df: ", df.shape)

    nan_counts = df.isna().sum()
    null_cols = nan_counts.loc[nan_counts.gt(0)]
    print('nNulls: ', null_cols.shape)

    # set_theme changes seaborn's global theme, not only this figure.
    sns.set_theme(font_scale=0.8)
    sns.barplot(x=null_cols.values, y=null_cols.index)
    plt.ylabel('Columns has NaN')
    plt.show()

    return null_cols if get_null_cols else None

# Load all yearly match files into one frame; the cells below filter it.
df = get_all_atp_matches()

In [None]:
# Preview the first two rows of the combined frame.
df.head(2)

<span style="margin-left:38vw"></span>**Grand Slams 1968 - 2024**

In [None]:
# Keep only best-of-five matches whose tourney_level is "G", renumbering rows from 0.
grand_slams_df = df.loc[
    (df['tourney_level'] == "G") & (df['best_of'] == 5)
].reset_index(drop=True)
# Number of matches won per player within that subset.
grand_slams_df['winner_name'].value_counts()

In [None]:
# Lowest winner_age recorded on any match won by Novak Djokovic in this subset.
grand_slams_df.loc[grand_slams_df['winner_name'] == "Novak Djokovic", 'winner_age'].min()

In [None]:
# Minimum recorded age per player, once on the winner side and once on the loser side.
# NOTE(review): winners are keyed by id while losers are keyed by name — confirm
# this asymmetry is intended before combining the two Series.
winner_min_age = grand_slams_df['winner_age'].groupby(grand_slams_df['winner_id']).min()
loser_min_age = grand_slams_df['loser_age'].groupby(grand_slams_df['loser_name']).min()

grand_slams_df.loc[:, ['winner_id', 'winner_name', 'winner_age']]


In [None]:
# Per-match feature frame derived from grand_slams_df.
# height/age differences are winner minus loser; rank/seed differences are
# loser minus winner. The *_exp columns are age minus 17
# (presumably a proxy for years of experience — TODO confirm the 17 baseline).
gdf = pd.DataFrame({
    'surface': grand_slams_df['surface'],
    'round': grand_slams_df['round'],
    'height_diff': grand_slams_df['winner_ht'] - grand_slams_df['loser_ht'],
    'age_diff': grand_slams_df['winner_age'] - grand_slams_df['loser_age'],
    'rank_diff': grand_slams_df['loser_rank'] - grand_slams_df['winner_rank'],
    'seed_diff': grand_slams_df['loser_seed'] - grand_slams_df['winner_seed'],
    'winner_exp': grand_slams_df['winner_age'] - 17,
    'loser_exp': grand_slams_df['loser_age'] - 17,
})

plot_nans(gdf)

## Handling NaNs

In [None]:
# Plot the NaN counts for the full grand-slam frame and keep them (a Series of
# counts indexed by column name) for the row-dropping step below.
null_cols = plot_nans(grand_slams_df, get_null_cols=True)

In [None]:
# Drop every row that has a NaN in any column whose NaN count is below 10500,
# then renumber the rows. Columns at or above that count are not used as drop criteria.
# NOTE(review): 10500 looks hand-picked from the NaN plot above — confirm the threshold.
grand_slams_df = grand_slams_df.dropna(subset=null_cols[null_cols < 10500].index).reset_index(drop=True)

In [None]:
# Re-plot the NaN counts that remain after the row drop.
plot_nans(grand_slams_df )

In [None]:
# Missing seeds become 0 (author's rationale: newer players are not seeded).
grand_slams_df = grand_slams_df.fillna({'winner_seed': 0, 'loser_seed': 0})
plot_nans(grand_slams_df)

*Checking how similar the NaN counts in the winner/loser entry columns are*

In [None]:
# Frequency of each entry value, NaN included, on the winner and loser side.
count1 = grand_slams_df['winner_entry'].value_counts(dropna=False)
count2 = grand_slams_df['loser_entry'].value_counts(dropna=False)

# Align both counts on the entry value: one row per value, one column per side.
counts_df = pd.concat([count1, count2], axis=1).set_axis(
    ['winner_entry', 'loser_entry'], axis=1
)

# Grouped bars, winner and loser side by side for each entry value.
counts_df.plot.bar()
plt.title('Value Counts of winner/loser entry')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.show()


In [None]:
# Remove the entry columns from the frame.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the columns are already gone.
grand_slams_df = grand_slams_df.drop(columns=['winner_entry', 'loser_entry'], errors='ignore')

#### Analyzing Minutes

In [None]:
# Summary of the minutes column: missing count, kurtosis (pandas reports
# Fisher's definition, normal == 0), descriptive statistics and a density plot.
print('nNulls: ',grand_slams_df['minutes'].isna().sum())
print('kurtosis: ', grand_slams_df['minutes'].kurtosis())  # author's reading: not many outliers
print(grand_slams_df['minutes'].describe())
grand_slams_df['minutes'].plot.kde()
plt.show()

In [None]:
import plotly.express as px

# Box plot of the minutes column at a fixed 600x400 size (autosize disabled).
fig = px.box(grand_slams_df, y="minutes")
fig.update_layout(autosize=False, width=600, height=400)
fig.show()

In [None]:
# Replace missing values in minutes with the mean of the observed values.
grand_slams_df = grand_slams_df.fillna({'minutes': grand_slams_df['minutes'].mean()})
grand_slams_df.isna().any().any()  # True if any NaN is left anywhere in the frame

<span style="margin-left:40vw"></span>**Haha! Data finally cleansed**

In [None]:
import numpy as np

def random_impute(col, seed=None):
    """Fill missing values in ``col`` by sampling from its observed values.

    Each missing entry is replaced with a value drawn at random, with
    replacement, from the non-missing entries of the same Series.

    Args:
        col: pandas Series to impute. It is modified in place.
        seed: optional seed (anything accepted by ``np.random.default_rng``)
            for a reproducible draw. When ``None`` the global ``np.random``
            state is used, which is the previous behaviour.

    Returns:
        The same Series object, with its missing values filled.

    Raises:
        ValueError: if values are missing but there is no observed value to
            sample from.

    Note:
        When ``col`` is a column selected from a DataFrame (``df['x']``), the
        parent frame is not guaranteed to see the in-place change (it never
        does under pandas Copy-on-Write). Assign the result back:
        ``df['x'] = random_impute(df['x'])``.
    """
    missing = col.isna()
    n_missing = int(missing.sum())
    if n_missing == 0:
        # Nothing to fill; also covers an empty Series.
        return col

    observed = col[~missing].to_numpy()
    if observed.size == 0:
        raise ValueError("cannot impute: the column has no observed values to sample from")

    # Fall back to the legacy global state so an earlier np.random.seed(...) is still honoured.
    sampler = np.random if seed is None else np.random.default_rng(seed)
    col.loc[missing] = sampler.choice(observed, size=n_missing, replace=True)
    return col
# Assign the result back: mutating the Series returned by grand_slams_df['minutes']
# is not guaranteed to update the frame (and never does under pandas Copy-on-Write).
# NOTE(review): if 'minutes' was already mean-imputed in an earlier cell, no NaNs
# remain and this step changes nothing — confirm which imputation is intended.
grand_slams_df['minutes'] = random_impute(grand_slams_df['minutes'])
grand_slams_df['minutes']
