In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker


In [None]:
# Name given to the column that the CSV exposes as 'Unnamed: 6'.
playerNames = 'playerNames'

# Load the match table in one chain: read the file, discard the two
# columns that are not used, drop every row that has a missing value,
# and give the unnamed column its readable name.
df = (
    pd.read_csv("processed/zheng_career_matches.csv")
    .drop(columns=['BPSaved', 'Time'])
    .dropna()
    .rename(columns={'Unnamed: 6': playerNames})
)
df

In [None]:
# --- Columns read from the loaded match table ---
dateStr = 'Date'
surface = 'Surface'
score = 'Score'
vRk = 'vRk'  # opponents' rank
aceRate = 'A%'
dfRate = 'DF%'
firstServeIn = '1stIn'
firstServeWin = '1st%'
secondServeWin = '2nd%'

# --- Columns this notebook adds ---
dateObj = 'dateObj'            # datetime parsed from the date string column
serveNetGain = 'serveNetGain'  # A% minus DF% (DF presumably = double faults)
win = 'win'
sampleSize = 'sampleSize'      # number of rows behind each grouped point

# --- Values matched against cell contents ---
playerOfInterest = 'zheng'                   # player name
grass, hard, clay = 'Grass', 'Hard', 'Clay'  # surface types
retire = 'RET'                               # marker searched for in the score column

# --- Grouping frequency (pandas offset alias passed to pd.Grouper) ---
freq = 'M'

In [None]:
# filter data
# Replace U+2011 (non-breaking hyphen) with an ASCII '-' so the date strings
# match the '%d-%b-%Y' format used for parsing below.
df[dateStr] = df[dateStr].str.replace('\u2011', '-')
df[dateObj] = pd.to_datetime(df[dateStr], format='%d-%b-%Y')

# Keep matches from 2022 onwards, excluding any whose score contains the
# RET marker and any whose surface contains 'Grass'.
keepRows = (
    (df[dateObj].dt.year >= 2022)
    & ~df[score].str.contains(retire)
    & ~df[surface].str.contains(grass)
)

# Take an explicit copy: later cells assign new columns to `df`, and doing
# that on a boolean-indexed slice triggers pandas' SettingWithCopyWarning.
df = df.loc[keepRows].copy()
df

In [None]:
# convert to float
# These columns hold strings with a trailing '%'; strip it and scale to a
# 0-1 fraction.
# NOTE(review): this cell is not re-runnable on the same `df` -- once the
# columns are floats the `.str` accessor raises.
cols_to_convert = [aceRate, dfRate, firstServeIn, firstServeWin, secondServeWin]

for col in cols_to_convert:
    df[col] = df[col].str.rstrip('%').astype('float') / 100.0

# add more attributes
df[serveNetGain] = df[aceRate] - df[dfRate]

# The text before the "d." separator is checked for the player of interest.
# The pattern must be a regex that matches a literal, standalone "d.":
# the previous pattern 'd.' was interpreted as "d followed by ANY character"
# and could therefore split in the middle of a name containing a 'd'.
# NOTE(review): the containment check is case-sensitive -- confirm the names
# in the data are spelled the same way as `playerOfInterest`.
df[win] = df[playerNames].str.split(r'\bd\.').str[0].str.contains(playerOfInterest)

# partition by opponents' rank: <=15, 16-50, >50
top15df = df[df[vRk] <= 15]
top50df = df[(df[vRk] <= 50) & (df[vRk] > 15)]
othersDf = df[df[vRk] > 50]

print(f"Num of matches: hard: {len(df[df[surface] == hard])}, clay: {len(df[df[surface] == clay])}")
for partLabel, partDf in (("top 15", top15df), ("top 50", top50df), ("others", othersDf)):
    hardCount = len(partDf[partDf[surface] == hard])
    clayCount = len(partDf[partDf[surface] == clay])
    print(f"vs. {partLabel}: {len(partDf)}, hard: {hardCount}, clay: {clayCount}")
df


In [None]:
# group by month
def getGroupedDf(df, freq=freq):
    """Average the numeric columns of ``df`` per time bucket.

    Rows are bucketed on the ``dateObj`` column with ``pd.Grouper`` at the
    given frequency.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a datetime column named by ``dateObj``.
    freq : str
        Pandas offset alias for the bucket size; defaults to the
        notebook-level ``freq``.

    Returns
    -------
    pandas.DataFrame
        One row per bucket that survives ``dropna()`` (buckets with no rows
        have NaN means and are dropped), holding the per-bucket means, plus
        a ``dateObj`` column with the bucket timestamp and a ``sampleSize``
        column with the number of source rows in the bucket.
    """
    buckets = df.groupby(pd.Grouper(key=dateObj, freq=freq))

    # numeric_only=True: the frame also carries string columns, and on
    # pandas >= 2.0 averaging those raises TypeError instead of being
    # silently skipped.
    groupedDf = buckets.mean(numeric_only=True).dropna()

    # The group index already is a DatetimeIndex, so use it directly. The
    # previous string round trip through to_datetime(format='%Y-%m') breaks
    # under strict format matching because the strings are full dates.
    groupedDf[dateObj] = groupedDf.index

    # size() is indexed by bucket, so the assignment aligns on the index and
    # only the surviving buckets receive a value.
    groupedDf[sampleSize] = buckets.size()
    return groupedDf


# Grouped averages per opponent-rank partition: all surfaces, clay only, hard only.
grouped15 = getGroupedDf(top15df)
grouped15Clay = getGroupedDf(top15df[top15df[surface] == clay])
grouped15Hard = getGroupedDf(top15df[top15df[surface] == hard])

# The all-surface result was previously assigned to grouped50Clay and then
# immediately overwritten by the clay-only result on the next line.
grouped50 = getGroupedDf(top50df)
grouped50Clay = getGroupedDf(top50df[top50df[surface] == clay])
grouped50Hard = getGroupedDf(top50df[top50df[surface] == hard])

# Same fix for the remaining opponents (previously groupedOthersClay twice).
groupedOthers = getGroupedDf(othersDf)
groupedOthersClay = getGroupedDf(othersDf[othersDf[surface] == clay])
groupedOthersHard = getGroupedDf(othersDf[othersDf[surface] == hard])


In [None]:
def plotScatterByGroups(groupedDfArr, attributeOfInterest, title, hline=False):
    """Plot one attribute over time for up to three grouped frames.

    Each frame is drawn as a scatter series, with marker area proportional
    to its ``sampleSize`` column, overlaid with a dashed line through the
    same points. Values are multiplied by 100 before plotting.

    Parameters
    ----------
    groupedDfArr : sequence of pandas.DataFrame
        Frames providing the ``dateObj``, ``sampleSize`` and
        ``attributeOfInterest`` columns. They are labelled by position as
        'vs. top 15', 'vs. top 50', 'vs. others'; passing more than three
        frames raises IndexError.
    attributeOfInterest : str
        Name of the column to plot on the y-axis.
    title : str
        Plot title.
    hline : bool
        When true, draw a black horizontal line at y = 0.
    """
    labelArr = ['vs. top 15', 'vs. top 50', 'vs. others']
    xLabel = dateStr
    yLabel = 'Percentage'
    dotScale = 10  # marker area per row in the group
    PERC = 100     # scale factor applied to the plotted values

    fig, ax = plt.subplots(figsize=(10, 5))
    for i, groupedDf in enumerate(groupedDfArr):
        xValues = groupedDf[dateObj]
        yValues = groupedDf[attributeOfInterest] * PERC
        ax.scatter(xValues, yValues, label=labelArr[i], s=dotScale * groupedDf[sampleSize])
        ax.plot(xValues, yValues, '--')
    ax.set_xlabel(xLabel)
    ax.set_ylabel(yLabel)
    ax.set_title(title)
    if hline:
        ax.axhline(0, color='black')
    ax.legend()
    plt.show()
    

In [None]:
# First-serve-in rate on clay, one series per opponent-rank partition.
# Title fixed: the possessive was missing its "s" ("Zheng'").
title = "Qinwen Zheng's 1st serve in: Clay"
attributeOfInterest = firstServeIn
groupedDfArr = [grouped15Clay, grouped50Clay, groupedOthersClay]
plotScatterByGroups(groupedDfArr, attributeOfInterest, title)

In [None]:
# First-serve-in rate on hard courts, one series per opponent-rank partition.
# Title fixed: the possessive was missing its "s" ("Zheng'").
title = "Qinwen Zheng's 1st serve in: Hard"
attributeOfInterest = firstServeIn
groupedDfArr = [grouped15Hard, grouped50Hard, groupedOthersHard]
plotScatterByGroups(groupedDfArr, attributeOfInterest, title)

In [None]:
# First-serve win rate on clay, one series per opponent-rank partition.
# Title fixed: the possessive was missing its "s" ("Zheng'").
title = "Qinwen Zheng's 1st serve win: Clay"
attributeOfInterest = firstServeWin
groupedDfArr = [grouped15Clay, grouped50Clay, groupedOthersClay]
plotScatterByGroups(groupedDfArr, attributeOfInterest, title)

In [None]:
# First-serve win rate on hard courts, one series per opponent-rank partition.
# Title fixed: the possessive was missing its "s" ("Zheng'").
title = "Qinwen Zheng's 1st serve win: Hard"
attributeOfInterest = firstServeWin
groupedDfArr = [grouped15Hard, grouped50Hard, groupedOthersHard]
plotScatterByGroups(groupedDfArr, attributeOfInterest, title)

In [None]:
# Second-serve win rate on clay, one series per opponent-rank partition.
# Title fixed: the possessive was missing its "s" ("Zheng'").
title = "Qinwen Zheng's 2nd serve win: Clay"
attributeOfInterest = secondServeWin
groupedDfArr = [grouped15Clay, grouped50Clay, groupedOthersClay]
plotScatterByGroups(groupedDfArr, attributeOfInterest, title)

In [None]:
# Second-serve win rate on hard courts, one series per opponent-rank partition.
# Title fixed: the possessive was missing its "s" ("Zheng'").
title = "Qinwen Zheng's 2nd serve win: Hard"
attributeOfInterest = secondServeWin
groupedDfArr = [grouped15Hard, grouped50Hard, groupedOthersHard]
plotScatterByGroups(groupedDfArr, attributeOfInterest, title)