# Imports

In [158]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Dataframes

In [159]:
PATH_EVENTS = "data/events.csv"
PATH_POINTS = "data/points.csv"
PATH_RALLIES = "data/rallies.csv"
PATH_SERVES = "data/serves.csv"

df_events = pd.read_csv(PATH_EVENTS)
df_points = pd.read_csv(PATH_POINTS)
df_rallies = pd.read_csv(PATH_RALLIES)
df_serves = pd.read_csv(PATH_SERVES)

df_events.head(3)

Unnamed: 0.1,Unnamed: 0,rallyid,frameid,strokeid,hitter,receiver,isserve,serve,type,stroke,hitter_x,hitter_y,receiver_x,receiver_y,time
0,0,1,70877,1,Djokovic,Nadal,True,first,serve,forehand,6.5,-0.24,1.03,27.44,0.0
1,1,1,70900,2,Nadal,Djokovic,False,first,slice,backhand,0.05,25.59,6.17,1.11,0.92
2,2,1,70950,3,Djokovic,Nadal,False,first,topspin,forehand,1.42,2.33,4.75,26.45,2.92


# Questions:
- How does the velocity of the ball change per game/set per player? 
- What is the distribution like?

## Calculating Velocity

### Create new column for elapsed time during the single rally (time it took for ball to go to receiver from hitter)
- It subtracts the time from the previous row

In [160]:
df_events["elapsed_time"] = df_events["time"].diff()

### Calculate single velocity instance by dividing change in position (Euclidian distance) by change in time.

In [192]:
df_events["velocity (yard/second) "] = np.sqrt(np.square(df_events["hitter_x"] - df_events["receiver_x"]) + np.square(df_events["hitter_y"] - df_events["receiver_y"])) / df_events["elapsed_time"]
df_events["elapsed_time"].fillna(0, inplace=True)
df_events.head(3)

Unnamed: 0.1,Unnamed: 0,rallyid,frameid,strokeid,hitter,receiver,isserve,serve,type,stroke,hitter_x,hitter_y,receiver_x,receiver_y,time,elapsed_time,velocity (yard/second)
0,0,1,70877,1,Djokovic,Nadal,True,first,serve,forehand,6.5,-0.24,1.03,27.44,0.0,0.0,inf
1,1,1,70900,2,Nadal,Djokovic,False,first,slice,backhand,0.05,25.59,6.17,1.11,0.92,0.92,27.427616
2,2,1,70950,3,Djokovic,Nadal,False,first,topspin,forehand,1.42,2.33,4.75,26.45,2.92,2.0,12.174392


In [162]:
df_events.head(3)

Unnamed: 0.1,Unnamed: 0,rallyid,frameid,strokeid,hitter,receiver,isserve,serve,type,stroke,hitter_x,hitter_y,receiver_x,receiver_y,time,elapsed_time,velocity (yard/second)
0,0,1,70877,1,Djokovic,Nadal,True,first,serve,forehand,6.5,-0.24,1.03,27.44,0.0,,
1,1,1,70900,2,Nadal,Djokovic,False,first,slice,backhand,0.05,25.59,6.17,1.11,0.92,0.92,27.427616
2,2,1,70950,3,Djokovic,Nadal,False,first,topspin,forehand,1.42,2.33,4.75,26.45,2.92,2.0,12.174392


### Dropping some columns

In [193]:
df_events_velocity = df_events.drop(["hitter_x","hitter_y","receiver_x","receiver_y"], axis=1)
df_events_velocity.head(3)

Unnamed: 0.1,Unnamed: 0,rallyid,frameid,strokeid,hitter,receiver,isserve,serve,type,stroke,time,elapsed_time,velocity (yard/second)
0,0,1,70877,1,Djokovic,Nadal,True,first,serve,forehand,0.0,0.0,inf
1,1,1,70900,2,Nadal,Djokovic,False,first,slice,backhand,0.92,0.92,27.427616
2,2,1,70950,3,Djokovic,Nadal,False,first,topspin,forehand,2.92,2.0,12.174392


**Note: We can ignore the velocities of services because we instantiate the initial velocity is 0. From this data, we have no way of calculating the velocity of a service**

In [194]:
ARG_VELOCITY_NO_SERVE = df_events_velocity["isserve"] == False
df_events_velocity_rally = df_events_velocity[ARG_VELOCITY_NO_SERVE]
df_events_velocity_rally.head(3)

Unnamed: 0.1,Unnamed: 0,rallyid,frameid,strokeid,hitter,receiver,isserve,serve,type,stroke,time,elapsed_time,velocity (yard/second)
1,1,1,70900,2,Nadal,Djokovic,False,first,slice,backhand,0.92,0.92,27.427616
2,2,1,70950,3,Djokovic,Nadal,False,first,topspin,forehand,2.92,2.0,12.174392
5,5,3,71820,2,Nadal,Djokovic,False,second,slice,forehand,37.72,0.96,25.80612


In [190]:
df_points.head(3)

Unnamed: 0.1,Unnamed: 0,rallyid,server,returner,winner,reason,serve,strokes,totaltime,x,y,score
0,0,1,Djokovic,Nadal,Djokovic,winner,first,3,0.92,1.92,21.96,"0:0, 15:0"
1,2,3,Djokovic,Nadal,Djokovic,out,second,4,4.16,3.33,-0.39,"0:0, 30:0"
2,4,5,Djokovic,Nadal,Djokovic,ace,second,2,0.4,1.62,17.18,"0:0, 40:0"


In [201]:
df_points.tail(15)

Unnamed: 0.1,Unnamed: 0,rallyid,server,returner,winner,reason,serve,strokes,totaltime,x,y,score
127,185,186,Nadal,Djokovic,Nadal,net,second,2,0.8,6.39,13.0,"6:3 6:2 4:3, 0:0"
128,187,188,Djokovic,Nadal,Nadal,out,second,3,1.08,1.23,5.55,"6:3 6:2 4:3, 0:15"
129,188,189,Djokovic,Nadal,Djokovic,winner,first,7,7.44,6.57,5.75,"6:3 6:2 4:3, 15:15"
130,189,190,Djokovic,Nadal,Djokovic,out,first,2,0.8,-1.48,22.8,"6:3 6:2 4:3, 30:15"
131,190,191,Djokovic,Nadal,Nadal,out,first,7,3.32,10.23,3.78,"6:3 6:2 4:3, 30:30"
132,192,193,Djokovic,Nadal,Djokovic,net,first,4,0.76,6.48,13.42,"6:3 6:2 4:3, 40:30"
133,193,194,Djokovic,Nadal,Nadal,out,first,17,14.08,9.36,0.78,"6:3 6:2 4:3, 40:40"
134,194,195,Djokovic,Nadal,Djokovic,out,first,2,0.88,-2.11,21.55,"6:3 6:2 4:3, Ad:40"
135,195,196,Djokovic,,Djokovic,winner,first,7,7.96,6.58,12.03,"6:3 6:2 5:3, 0:0"
136,197,198,Nadal,Djokovic,Nadal,out,first,2,0.68,5.53,-0.02,"6:3 6:2 5:3, 0:15"


## Adding new columns for game number and set number

In [None]:
def get_set(string):
    """ 
        Returns the set number
    """
    
    for i in range(len(string)):
        if string[i] == ",":
            if len(string[:i]) == 3:
                return 1
            elif len(string[:i]) == 7:
                return 2
            elif len(string[:i]) == 11:
                return 3
            

def get_set(string):
    """
        Returns the game number
    """
    
    for i in range(len(string)):
        if string[i] == ",":
            sub_string = string[:i]
            
            game_num
            
            for j in sub_string:
                
                

In [165]:
df_rallies.head(5)

Unnamed: 0.1,Unnamed: 0,rallyid,server,returner,winner,reason,serve,strokes,totaltime,x,y
0,0,1,Djokovic,Nadal,Djokovic,winner,first,3,0.92,1.92,21.96
1,1,2,Djokovic,Nadal,__undefined__,second_serve,first,1,0.0,7.42,12.1
2,2,3,Djokovic,Nadal,Djokovic,out,second,4,4.16,3.33,-0.39
3,3,4,Djokovic,Nadal,__undefined__,second_serve,first,1,0.0,4.64,17.69
4,4,5,Djokovic,Nadal,Djokovic,ace,second,2,0.4,1.62,17.18


In [166]:
df_serves.head(2)

Unnamed: 0.1,Unnamed: 0,rallyid,server,x,y
0,0,1,Djokovic,1.86,16.8
1,1,3,Djokovic,7.05,16.97


# Data sanitisation

No need :)

# Analysis

The eventual winner was Djokovic, lets have a look at the discrepencies in stats