# Imports

In [257]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Dataframes

In [258]:
# Relative locations of the four CSV exports used throughout this notebook.
PATH_EVENTS, PATH_POINTS, PATH_RALLIES, PATH_SERVES = (
    "data/events.csv",
    "data/points.csv",
    "data/rallies.csv",
    "data/serves.csv",
)

# Read every file into its own DataFrame, in the same order as the paths above.
df_events, df_points, df_rallies, df_serves = map(
    pd.read_csv,
    (PATH_EVENTS, PATH_POINTS, PATH_RALLIES, PATH_SERVES),
)

# Preview the first three rows of the events table.
df_events.head(3)

Unnamed: 0.1,Unnamed: 0,rallyid,frameid,strokeid,hitter,receiver,isserve,serve,type,stroke,hitter_x,hitter_y,receiver_x,receiver_y,time
0,0,1,70877,1,Djokovic,Nadal,True,first,serve,forehand,6.5,-0.24,1.03,27.44,0.0
1,1,1,70900,2,Nadal,Djokovic,False,first,slice,backhand,0.05,25.59,6.17,1.11,0.92
2,2,1,70950,3,Djokovic,Nadal,False,first,topspin,forehand,1.42,2.33,4.75,26.45,2.92


# Data sanitisation

No need :)

# Questions:
- How does the velocity of the ball change per game/set per player? 
- What is the distribution like, if any?

## Calculating Velocity

### Create a new column for the time elapsed between consecutive strokes of a rally (the time the ball took to travel from hitter to receiver)
- It subtracts the previous row's time from the current row's time

In [259]:
# Time elapsed since the previous stroke of the *same* rally.
# In the previewed rows `time` keeps increasing from one rally to the next, so
# a plain row-to-row diff would charge the idle gap between two rallies to the
# first stroke of the following rally. Differencing inside each `rallyid`
# group leaves the first stroke of every rally as NaN instead.
# Assumes rows are in chronological order within each rally — TODO confirm.
df_events["elapsed_time"] = df_events.groupby("rallyid")["time"].diff()

### Calculate the velocity of a single stroke by dividing the change in position (Euclidean distance) by the change in time.

In [260]:
# Straight-line (Euclidean) distance between the hitter and receiver
# coordinates recorded on each row.
stroke_distance = np.sqrt(
    np.square(df_events["hitter_x"] - df_events["receiver_x"])
    + np.square(df_events["hitter_y"] - df_events["receiver_y"])
)

# Velocity = distance / elapsed time. Rows whose elapsed time is still NaN at
# this point get a NaN velocity, which is why the fill below comes afterwards.
# NOTE: the column label ends with a space; it is kept byte-for-byte so that
# lookups elsewhere in the notebook keep working.
df_events["velocity (yard/second) "] = stroke_distance / df_events["elapsed_time"]

# Assign the filled column back rather than calling fillna(inplace=True) on the
# selection df_events["elapsed_time"]: the chained in-place form emits a
# FutureWarning on pandas 2.x and no longer updates df_events once
# Copy-on-Write is active.
df_events["elapsed_time"] = df_events["elapsed_time"].fillna(0)
df_events.head(3)

Unnamed: 0.1,Unnamed: 0,rallyid,frameid,strokeid,hitter,receiver,isserve,serve,type,stroke,hitter_x,hitter_y,receiver_x,receiver_y,time,elapsed_time,velocity (yard/second)
0,0,1,70877,1,Djokovic,Nadal,True,first,serve,forehand,6.5,-0.24,1.03,27.44,0.0,0.0,
1,1,1,70900,2,Nadal,Djokovic,False,first,slice,backhand,0.05,25.59,6.17,1.11,0.92,0.92,27.427616
2,2,1,70950,3,Djokovic,Nadal,False,first,topspin,forehand,1.42,2.33,4.75,26.45,2.92,2.0,12.174392


In [261]:
# Show the first three rows of df_events again (same preview as the cell above).
df_events.head(3)

Unnamed: 0.1,Unnamed: 0,rallyid,frameid,strokeid,hitter,receiver,isserve,serve,type,stroke,hitter_x,hitter_y,receiver_x,receiver_y,time,elapsed_time,velocity (yard/second)
0,0,1,70877,1,Djokovic,Nadal,True,first,serve,forehand,6.5,-0.24,1.03,27.44,0.0,0.0,
1,1,1,70900,2,Nadal,Djokovic,False,first,slice,backhand,0.05,25.59,6.17,1.11,0.92,0.92,27.427616
2,2,1,70950,3,Djokovic,Nadal,False,first,topspin,forehand,1.42,2.33,4.75,26.45,2.92,2.0,12.174392


### Dropping some columns

In [262]:
# Velocity-focused copy of the events: the raw court coordinates and the
# leftover "Unnamed: 0" column are left out. df_events itself is not modified.
df_events_velocity = df_events.drop(
    columns=["hitter_x", "hitter_y", "receiver_x", "receiver_y", "Unnamed: 0"]
)
df_events_velocity.head(3)

Unnamed: 0,rallyid,frameid,strokeid,hitter,receiver,isserve,serve,type,stroke,time,elapsed_time,velocity (yard/second)
0,1,70877,1,Djokovic,Nadal,True,first,serve,forehand,0.0,0.0,
1,1,70900,2,Nadal,Djokovic,False,first,slice,backhand,0.92,0.92,27.427616
2,1,70950,3,Djokovic,Nadal,False,first,topspin,forehand,2.92,2.0,12.174392


**Note: We can ignore the velocities of serves because we treat their initial velocity as 0. From this data, we have no way of calculating the velocity of a serve.**

In [263]:
# Boolean mask that is True for every stroke whose `isserve` flag equals False.
ARG_VELOCITY_NO_SERVE = df_events_velocity["isserve"].eq(False)

# Keep only the non-serve strokes selected by the mask.
df_events_velocity_rally = df_events_velocity.loc[ARG_VELOCITY_NO_SERVE]
df_events_velocity_rally.head(3)

Unnamed: 0,rallyid,frameid,strokeid,hitter,receiver,isserve,serve,type,stroke,time,elapsed_time,velocity (yard/second)
1,1,70900,2,Nadal,Djokovic,False,first,slice,backhand,0.92,0.92,27.427616
2,1,70950,3,Djokovic,Nadal,False,first,topspin,forehand,2.92,2.0,12.174392
5,3,71820,2,Nadal,Djokovic,False,second,slice,forehand,37.72,0.96,25.80612


In [267]:
# Drop the leftover "Unnamed: 0" column from the points table.
# This cell rebinds df_points, so with a plain drop a second run would raise
# KeyError because the column is already gone; errors="ignore" turns a re-run
# into a no-op.
df_points = df_points.drop(columns=["Unnamed: 0"], errors="ignore")
df_points.head(3)

Unnamed: 0,rallyid,server,returner,winner,reason,serve,strokes,totaltime,x,y,score
0,1,Djokovic,Nadal,Djokovic,winner,first,3,0.92,1.92,21.96,"0:0, 15:0"
1,3,Djokovic,Nadal,Djokovic,out,second,4,4.16,3.33,-0.39,"0:0, 30:0"
2,5,Djokovic,Nadal,Djokovic,ace,second,2,0.4,1.62,17.18,"0:0, 40:0"


In [265]:
# Show the last three rows of the points table.
df_points.tail(3)

Unnamed: 0.1,Unnamed: 0,rallyid,server,returner,winner,reason,serve,strokes,totaltime,x,y,score
139,201,202,Nadal,Djokovic,Djokovic,winner,first,4,3.08,1.88,3.36,"6:3 6:2 5:3, 40:15"
140,203,204,Nadal,Djokovic,Nadal,out,first,12,0.84,8.45,-0.56,"6:3 6:2 5:3, 40:30"
141,205,206,Nadal,Djokovic,Djokovic,out,second,5,4.6,0.59,23.86,"6:3 6:2 6:3, 0:0"


## Adding new columns for game number and set number

In [278]:
def get_set_num(item):
    """
        Returns the set number encoded in a score string.

        A score looks like "<set scores>, <point score>", for example
        "6:3 6:2 5:3, 40:15". The part before the comma holds one
        "games:games" pair per set started so far, so the number of ":" in
        that part is the number of the set in progress. Counting the pairs
        (rather than special-casing 1 and 2) also handles the fourth and
        fifth set of a best-of-five match.

        NOTE(review): the sample rows suggest "score" is the score *after*
        the point was played (the first point reads "0:0, 15:0"); if so, the
        point that closes a set is attributed to the following set - confirm.

        Args:
            item: score string as found in the "score" column.

        Returns:
            The 1-based set number as an int.

        Raises:
            ValueError: if the part before the comma holds no "games:games" pair.
    """
    set_scores = item.split(",")[0]
    set_num = set_scores.count(":")
    if set_num == 0:
        raise ValueError(f"no set score found in {item!r}")
    return set_num
            

def get_game_num(item):
    """
        Returns the game number within the set in progress.

        A score looks like "<set scores>, <point score>", for example
        "6:3 6:2 5:3, 40:15". The last "games:games" pair before the comma
        belongs to the set in progress; the game number is the sum of both
        players' games in that pair plus one.

        NOTE(review): the sample rows suggest "score" is the score *after*
        the point was played (the first point reads "0:0, 15:0"); if so, the
        point that closes a game is attributed to the following game - confirm.

        Args:
            item: score string as found in the "score" column.

        Returns:
            The 1-based game number as an int.

        Raises:
            ValueError: if no "games:games" pair with integer counts is found.
    """
    set_scores = item.split(",")[0].split()
    if not set_scores:
        raise ValueError(f"no set score found in {item!r}")

    # Take the last whitespace-separated pair instead of the last three
    # characters: a fixed-width slice misreads two-digit game counts
    # ("10:8" would be parsed as "0:8").
    first_game, second_game = set_scores[-1].split(":")
    return int(first_game) + int(second_game) + 1

# Build the "set" and "game" columns by running every "score" string through
# the matching parser function.
for column, parser in (("set", get_set_num), ("game", get_game_num)):
    df_points[column] = df_points["score"].apply(parser)

df_points.tail()


0:0, 15:0
0:0, 30:0
0:0, 40:0
1:0, 0:0
1:0, 15:0
1:0, 30:0
1:0, 30:15
1:0, 40:15
2:0, 0:0
2:0, 15:0
2:0, 30:0
2:0, 40:0
3:0, 0:0
3:0, 15:0
3:0, 15:15
3:0, 30:15
3:0, 30:30
3:0, 40:30
3:0, 40:40
3:0, 40:Ad
3:1, 0:0
3:1, 15:0
3:1, 30:0
3:1, 40:0
4:1, 0:0
4:1, 15:0
4:1, 15:15
4:1, 15:30
4:1, 30:30
4:1, 30:40
4:2, 0:0
4:2, 15:0
4:2, 30:0
4:2, 40:0
5:2, 0:0
5:2, 0:15
5:2, 0:30
5:2, 0:40
5:3, 0:0
5:3, 0:15
5:3, 15:15
5:3, 30:15
5:3, 40:15
6:3 0:0, 0:0
6:3 0:0, 15:0
6:3 0:0, 15:15
6:3 0:0, 15:30
6:3 0:0, 30:30
6:3 0:0, 30:40
6:3 0:1, 0:0
6:3 0:1, 15:0
6:3 0:1, 30:0
6:3 0:1, 40:0
6:3 1:1, 0:0
6:3 1:1, 0:15
6:3 1:1, 15:15
6:3 1:1, 30:15
6:3 1:1, 30:30
6:3 1:1, 30:40
6:3 1:2, 0:0
6:3 1:2, 15:0
6:3 1:2, 15:15
6:3 1:2, 30:15
6:3 1:2, 40:15
6:3 2:2, 0:0
6:3 2:2, 15:0
6:3 2:2, 30:0
6:3 2:2, 30:15
6:3 2:2, 40:15
6:3 3:2, 0:0
6:3 3:2, 0:15
6:3 3:2, 15:15
6:3 3:2, 15:30
6:3 3:2, 30:30
6:3 3:2, 40:30
6:3 3:2, 40:40
6:3 3:2, Ad:40
6:3 3:2, 40:40
6:3 3:2, Ad:40
6:3 4:2, 0:0
6:3 4:2, 0:15
6:3 4:2, 15:15
6:

Unnamed: 0,rallyid,server,returner,winner,reason,serve,strokes,totaltime,x,y,score,set,game
137,199,Nadal,Djokovic,Djokovic,winner,first,6,6.6,2.44,4.71,"6:3 6:2 5:3, 15:15",3,9
138,201,Nadal,Djokovic,Djokovic,net,first,9,9.0,4.26,11.57,"6:3 6:2 5:3, 30:15",3,9
139,202,Nadal,Djokovic,Djokovic,winner,first,4,3.08,1.88,3.36,"6:3 6:2 5:3, 40:15",3,9
140,204,Nadal,Djokovic,Nadal,out,first,12,0.84,8.45,-0.56,"6:3 6:2 5:3, 40:30",3,9
141,206,Nadal,Djokovic,Djokovic,out,second,5,4.6,0.59,23.86,"6:3 6:2 6:3, 0:0",3,10


In [272]:
# TODO:
# - Merge the DataFrames so that every rallyid gets its game and set number
# - Separate the data by set and by game
# - Plot the average velocity of the ball per game for both players
# - Plot the average velocity of the ball per set for both players
# - Compare the differences and the distributions