# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Dataframes

In [2]:
# Locations of the four match CSVs, relative to the notebook's working directory.
PATH_EVENTS, PATH_POINTS = "data/events.csv", "data/points.csv"
PATH_RALLIES, PATH_SERVES = "data/rallies.csv", "data/serves.csv"

# Read every CSV with pandas' defaults; the order of the targets on the left
# matches the order of the paths on the right.
df_events, df_points, df_rallies, df_serves = [
    pd.read_csv(csv_path)
    for csv_path in (PATH_EVENTS, PATH_POINTS, PATH_RALLIES, PATH_SERVES)
]

# Preview the first two rows of the events table.
df_events.head(2)

Unnamed: 0.1,Unnamed: 0,rallyid,frameid,strokeid,hitter,receiver,isserve,serve,type,stroke,hitter_x,hitter_y,receiver_x,receiver_y,time
0,0,1,70877,1,Djokovic,Nadal,True,first,serve,forehand,6.5,-0.24,1.03,27.44,0.0
1,1,1,70900,2,Nadal,Djokovic,False,first,slice,backhand,0.05,25.59,6.17,1.11,0.92


In [3]:
# Preview the first two rows of the points table to see which columns it has.
df_points.head(2)

Unnamed: 0.1,Unnamed: 0,rallyid,server,returner,winner,reason,serve,strokes,totaltime,x,y,score
0,0,1,Djokovic,Nadal,Djokovic,winner,first,3,0.92,1.92,21.96,"0:0, 15:0"
1,2,3,Djokovic,Nadal,Djokovic,out,second,4,4.16,3.33,-0.39,"0:0, 30:0"


In [4]:
# Preview the first two rows of the rallies table to see which columns it has.
df_rallies.head(2)

Unnamed: 0.1,Unnamed: 0,rallyid,server,returner,winner,reason,serve,strokes,totaltime,x,y
0,0,1,Djokovic,Nadal,Djokovic,winner,first,3,0.92,1.92,21.96
1,1,2,Djokovic,Nadal,__undefined__,second_serve,first,1,0.0,7.42,12.1


In [5]:
# Preview the first two rows of the serves table to see which columns it has.
df_serves.head(2)

Unnamed: 0.1,Unnamed: 0,rallyid,server,x,y
0,0,1,Djokovic,1.86,16.8
1,1,3,Djokovic,7.05,16.97


# Data sanitisation

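No sanitisation is applied :) The CSVs do carry leftover index columns (`Unnamed: 0`, `Unnamed: 0.1`), and `df_rallies` has `__undefined__` winners (e.g. on rows whose reason is `second_serve`), but the serve analysis below only reads the `serve`, `server` and `reason` columns of `df_points`.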

# Analysis

The eventual winner was Djokovic; let's have a look at the discrepancies in the stats.

## Questions (Part 1) - Serves
- How many aces did Nadal get?
- How many points did Nadal win off his own serve?
- How does this compare with Djokovic?
- **Was serving even a significant factor in determining the outcome of this match?**
    
## Questions (Part 2) - Serves
- If serving was significant, in what way? Just the power of the serve? The way the returner tried to receive? 
    - **=> Part 3A**
- If serving was NOT significant, were the points then largely determined by rallies?
    - How many strokes are required for it to be a win by rally? I will consider it to be 3. **WHY?** 
        - **=> Part 3B**

## Questions (Part 3A) - Serves & Returns
- What were the positions of returns from serves where the server won the point?
- What were the positions of returns from serves where the receiver won the point?
- **Do these have any correlation with the resulting win/loss of the match?**

## Questions (Part 3B) - Rallies
- What were the common types of returns for Nadal and Djokovic?
- **What were the "winning" positions (x,y) for the points won?**
- **What were the "winning" hands? i.e. were they mainly forehands or backhands?** 


In [7]:
# Keep only the points whose `serve` column is "first".
ARG_FIRST_SERVE = df_points["serve"].eq("first")
df_first_serve = df_points.loc[ARG_FIRST_SERVE]

# Boolean masks over the first-serve points, one per server.
ARG_SERVER_NADAL = df_first_serve["server"].eq("Nadal")
ARG_SERVER_DJOK = df_first_serve["server"].eq("Djokovic")

# Per-player first-serve points.
df_first_serve_nadal = df_first_serve.loc[ARG_SERVER_NADAL]
df_first_serve_djok = df_first_serve.loc[ARG_SERVER_DJOK]

# Show the first five of Nadal's first-serve points.
df_first_serve_nadal.head(5)

Unnamed: 0.1,Unnamed: 0,rallyid,server,returner,winner,reason,serve,strokes,totaltime,x,y,score
5,8,9,Nadal,Djokovic,Djokovic,winner,first,12,0.92,3.65,9.5,"1:0, 30:0"
6,10,11,Nadal,Djokovic,Nadal,winner,first,4,1.08,7.34,18.76,"1:0, 30:15"
8,13,14,Nadal,Djokovic,Djokovic,out,first,3,2.64,1.07,25.85,"2:0, 0:0"
13,19,20,Nadal,Djokovic,Djokovic,winner,first,12,12.92,0.53,21.45,"3:0, 15:0"
15,23,24,Nadal,Djokovic,Djokovic,winner,first,4,0.72,7.3,22.58,"3:0, 30:15"


In [10]:
# Show the first five of Djokovic's first-serve points.
df_first_serve_djok.head(5)

Unnamed: 0.1,Unnamed: 0,rallyid,server,returner,winner,reason,serve,strokes,totaltime,x,y,score
0,0,1,Djokovic,Nadal,Djokovic,winner,first,3,0.92,1.92,21.96,"0:0, 15:0"
3,5,6,Djokovic,Nadal,Djokovic,net,first,10,10.4,6.41,12.78,"1:0, 0:0"
9,14,15,Djokovic,Nadal,Djokovic,out,first,2,0.8,4.39,27.45,"2:0, 15:0"
10,16,17,Djokovic,Nadal,Djokovic,out,first,4,4.0,5.45,30.23,"2:0, 30:0"
11,17,18,Djokovic,Nadal,Djokovic,out,first,6,6.4,8.55,22.48,"2:0, 40:0"


In [8]:
# Nadal's points whose `reason` is "ace".
# NOTE(review): the input frame holds first-serve points only, so any ace hit
# on a second serve would not be counted here -- confirm that is intended.
ARG_ACE_NADAL = df_first_serve_nadal["reason"].eq("ace")
df_nadal_ace = df_first_serve_nadal.loc[ARG_ACE_NADAL]

# Display every matching row.
df_nadal_ace

Unnamed: 0.1,Unnamed: 0,rallyid,server,returner,winner,reason,serve,strokes,totaltime,x,y,score
67,97,98,Nadal,Djokovic,Nadal,ace,first,1,0.0,-1.11,26.94,"6:3 2:2, 30:15"
111,162,163,Nadal,Djokovic,Nadal,ace,first,1,0.0,11.29,25.49,"6:3 6:2 3:1, 0:30"
124,179,180,Nadal,Djokovic,Nadal,ace,first,1,0.0,4.62,6.79,"6:3 6:2 4:2, 15:15"


In [12]:
# Djokovic's points whose `reason` is "ace".
# NOTE(review): the input frame holds first-serve points only, so any ace hit
# on a second serve would not be counted here -- confirm that is intended.
ARG_ACE_DJOK = df_first_serve_djok["reason"].eq("ace")
df_djok_ace = df_first_serve_djok.loc[ARG_ACE_DJOK]

# Display every matching row.
df_djok_ace

Unnamed: 0.1,Unnamed: 0,rallyid,server,returner,winner,reason,serve,strokes,totaltime,x,y,score
12,18,19,Djokovic,Nadal,Djokovic,ace,first,1,0.0,1.62,10.23,"3:0, 0:0"
21,30,31,Djokovic,Nadal,Djokovic,ace,first,1,0.0,4.63,16.1,"3:1, 15:0"
23,32,33,Djokovic,Nadal,Djokovic,ace,first,1,0.0,4.23,18.13,"3:1, 40:0"
52,77,78,Djokovic,Nadal,Djokovic,ace,first,1,0.0,2.91,29.27,"6:3 0:1, 40:0"
86,125,126,Djokovic,Nadal,Djokovic,ace,first,1,0.0,1.78,8.86,"6:3 5:2, 30:0"
87,126,127,Djokovic,Nadal,Djokovic,ace,first,1,0.0,4.61,7.91,"6:3 5:2, 40:0"
88,127,128,Djokovic,Nadal,Djokovic,ace,first,1,0.0,1.4,8.41,"6:3 6:2 0:0, 0:0"


## Visualisation

In [18]:
# Bar chart #1 (simple): counts per value of a categorical variable (disabled -- uncomment the line below to plot)
# sns.countplot(x="server", palette="spring", data=df_serves)