# Tennis analysis

### Question or problem definition

We want to see if we can create a prediction model that can predict whether a player will win a match or not, based on different statistics

### Acquire training and testing data

import librairies :

In [4]:
# dataframe manipulation
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

Acquire data from csv :

We want to collect a list of matches with the following information :
- match_id
- date
- surface
- best_of (3 sets win => best_of = 5)
- winner_rank
- loser_rank
- winner_rank_points
- loser_rank_points
- winner_elo_rating
- loser_elo_rating
- winner_age
- loser_age
- winner_height
- loser_height

>SELECT match_id, date, surface, best_of, winner_rank, loser_rank, winner_rank_points, loser_rank_points, winner_elo_rating, loser_elo_rating, winner_age, loser_age, winner_height, loser_height FROM match;

We want to add stats before matches to it :

For example if we want to get aces from nadal given a date we have to do :
> SELECT AVG(ps.p_ace), AVG(ps.o_ace) FROM (SELECT p_matches, p_ace, o_matches, o_ace, match_id, date, player_id FROM player_match_stats_v WHERE player_id=4742 AND date <= '2016/01/01') ps;

final statement to get 2018 matches with their stats :

>SELECT pms.match_id, pms.date, pms.surface, pms.level, pms.round, pms.best_of, pms.player_id, pms.opponent_id, pms.player_rank, pms.opponent_rank, pms.player_elo_rating, pms.opponent_elo_rating, pms.player_age, pms.opponent_age, pms.player_height, pms.opponent_height, pms.outcome, pms.p_matches, pms.o_matches, pms.p_sets, pms.o_sets, pms.p_games, pms.o_games, pms.p_tbs, pms.o_tbs, pms.p_ace, pms.p_df, pms.p_sv_pt, pms.p_1st_in, pms.p_1st_won, pms.p_2nd_won, pms.p_sv_gms, pms.p_bp_sv, pms.p_bp_fc, pms.o_ace, pms.o_df, pms.o_sv_pt, pms.o_1st_in, pms.o_1st_won, pms.o_2nd_won, pms.o_sv_gms, pms.o_bp_sv, pms.o_bp_fc, pms.minutes, pms.p_upsets, pms.o_upsets, ps.p_avg_ace, ps.p_avg_df, ps.p_avg_sv_pt, ps.p_avg_1st_in, ps.p_avg_1st_won, ps.p_avg_2nd_won, ps.p_avg_sv_gms, ps.p_avg_bp_sv, ps.p_avg_bp_fc, ps.o_avg_ace, ps.o_avg_df, ps.o_avg_sv_pt, ps.o_avg_1st_in, ps.o_avg_1st_won, ps.o_avg_2nd_won, ps.o_avg_sv_gms, ps.o_avg_bp_sv, ps.o_avg_bp_fc FROM player_match_stats_v pms LEFT JOIN LATERAL (SELECT ROUND(AVG(p_ace), 2) p_avg_ace, ROUND(AVG(p_df), 2) p_avg_df, ROUND(AVG(p_sv_pt), 2) p_avg_sv_pt, ROUND(AVG(p_1st_in), 2) p_avg_1st_in, ROUND(AVG(p_1st_won), 2) p_avg_1st_won, ROUND(AVG(p_2nd_won), 2) p_avg_2nd_won, ROUND(AVG(p_sv_gms), 2) p_avg_sv_gms, ROUND(AVG(p_bp_sv), 2) p_avg_bp_sv, ROUND(AVG(p_bp_fc), 2) p_avg_bp_fc, ROUND(AVG(o_ace), 2) o_avg_ace, ROUND(AVG(o_df), 2) o_avg_df, ROUND(AVG(o_sv_pt), 2) o_avg_sv_pt, ROUND(AVG(o_1st_in), 2) o_avg_1st_in, ROUND(AVG(o_1st_won), 2) o_avg_1st_won, ROUND(AVG(o_2nd_won), 2) o_avg_2nd_won, ROUND(AVG(o_sv_gms), 2) o_avg_sv_gms, ROUND(AVG(o_bp_sv), 2) o_avg_bp_sv, ROUND(AVG(o_bp_fc), 2) o_avg_bp_fc, player_id FROM player_match_stats_v WHERE date <= pms.date GROUP BY player_id) ps ON (pms.player_id=ps.player_id) WHERE pms.date >= '2018-01-01' AND pms.date <='2018-12-31';

In [5]:
tennis = pd.read_csv('2018 matches/matches.csv', delimiter=';')

In [6]:
tennis.head()

Unnamed: 0,match_id,date,surface,level,round,best_of,player_id,opponent_id,player_rank,opponent_rank,...,p_avg_bp_fc,o_avg_ace,o_avg_df,o_avg_sv_pt,o_avg_1st_in,o_avg_1st_won,o_avg_2nd_won,o_avg_sv_gms,o_avg_bp_sv,o_avg_bp_fc
0,166985,06/04/2018,C,D,RR,3,6030,11769,87.0,64.0,...,7.58,6.2,2.6,76.18,46.7,33.05,14.95,12.28,3.33,6.03
1,167084,09/04/2018,C,B,R32,3,6364,6196,26.0,65.0,...,6.69,7.55,3.74,82.57,49.97,36.63,16.29,12.95,4.18,6.77
2,167085,09/04/2018,C,B,R32,3,5420,4311,93.0,62.0,...,9.05,6.82,3.03,85.92,50.68,36.2,17.91,13.09,4.97,7.82
3,167086,09/04/2018,C,B,R32,3,4291,5902,91.0,61.0,...,7.98,7.96,3.22,80.63,48.48,35.95,16.59,12.79,4.2,6.56
4,167087,09/04/2018,C,B,R32,3,4994,3893,55.0,57.0,...,7.27,5.12,2.7,77.29,48.82,34.54,14.85,12.05,3.75,6.16


In [7]:
tennis.columns

Index(['match_id', 'date', 'surface', 'level', 'round', 'best_of', 'player_id',
       'opponent_id', 'player_rank', 'opponent_rank', 'player_elo_rating',
       'opponent_elo_rating', 'player_age', 'opponent_age', 'player_height',
       'opponent_height', 'outcome', 'p_matches', 'o_matches', 'p_sets',
       'o_sets', 'p_games', 'o_games', 'p_tbs', 'o_tbs', 'p_ace', 'p_df',
       'p_sv_pt', 'p_1st_in', 'p_1st_won', 'p_2nd_won', 'p_sv_gms', 'p_bp_sv',
       'p_bp_fc', 'o_ace', 'o_df', 'o_sv_pt', 'o_1st_in', 'o_1st_won',
       'o_2nd_won', 'o_sv_gms', 'o_bp_sv', 'o_bp_fc', 'minutes', 'p_upsets',
       'o_upsets', 'p_avg_ace', 'p_avg_df', 'p_avg_sv_pt', 'p_avg_1st_in',
       'p_avg_1st_won', 'p_avg_2nd_won', 'p_avg_sv_gms', 'p_avg_bp_sv',
       'p_avg_bp_fc', 'o_avg_ace', 'o_avg_df', 'o_avg_sv_pt', 'o_avg_1st_in',
       'o_avg_1st_won', 'o_avg_2nd_won', 'o_avg_sv_gms', 'o_avg_bp_sv',
       'o_avg_bp_fc'],
      dtype='object')

In [90]:
tennis[tennis['match_id'] == 166985]

Unnamed: 0,match_id,date,surface,level,round,best_of,player_id,opponent_id,player_rank,opponent_rank,...,p_avg_bp_fc,o_avg_ace,o_avg_df,o_avg_sv_pt,o_avg_1st_in,o_avg_1st_won,o_avg_2nd_won,o_avg_sv_gms,o_avg_bp_sv,o_avg_bp_fc
0,166985,06/04/2018,C,D,RR,3,6030,11769,87.0,64.0,...,7.58,6.2,2.6,76.18,46.7,33.05,14.95,12.28,3.33,6.03
390,166985,06/04/2018,C,D,RR,3,11769,6030,64.0,87.0,...,6.85,5.12,3.7,85.24,51.61,38.42,18.42,13.55,3.88,5.94


In [13]:
tennis['match_id'].nunique()

2943

In [67]:
tennis_non_duplicate = tennis.drop_duplicates(subset='match_id')

In [68]:
tennis_non_duplicate.shape

(2943, 64)

In [69]:
from sklearn.model_selection import train_test_split

In [77]:
train_df, test_df = train_test_split(tennis_non_duplicate)

In [78]:
print("train shape : ", train_df.shape)
print("train test : ", test_df.shape)

train shape :  (2207, 64)
train test :  (736, 64)


Save those to csv :

In [79]:
train_df.to_csv('2018 matches/tennis_matches_stats_train.csv', index=False)
test_df.to_csv('2018 matches/tennis_matches_stats_test.csv', index=False)

Declare new dataframes :

In [80]:
train_df = pd.read_csv('2018 matches/tennis_matches_stats_train.csv')
test_df = pd.read_csv('2018 matches/tennis_matches_stats_test.csv')

### Analyze by describing data

Which features are available in the dataset?

In [81]:
print(train_df.columns.values)

['match_id' 'date' 'surface' 'level' 'round' 'best_of' 'player_id'
 'opponent_id' 'player_rank' 'opponent_rank' 'player_elo_rating'
 'opponent_elo_rating' 'player_age' 'opponent_age' 'player_height'
 'opponent_height' 'outcome' 'p_matches' 'o_matches' 'p_sets' 'o_sets'
 'p_games' 'o_games' 'p_tbs' 'o_tbs' 'p_ace' 'p_df' 'p_sv_pt' 'p_1st_in'
 'p_1st_won' 'p_2nd_won' 'p_sv_gms' 'p_bp_sv' 'p_bp_fc' 'o_ace' 'o_df'
 'o_sv_pt' 'o_1st_in' 'o_1st_won' 'o_2nd_won' 'o_sv_gms' 'o_bp_sv'
 'o_bp_fc' 'minutes' 'p_upsets' 'o_upsets' 'p_avg_ace' 'p_avg_df'
 'p_avg_sv_pt' 'p_avg_1st_in' 'p_avg_1st_won' 'p_avg_2nd_won'
 'p_avg_sv_gms' 'p_avg_bp_sv' 'p_avg_bp_fc' 'o_avg_ace' 'o_avg_df'
 'o_avg_sv_pt' 'o_avg_1st_in' 'o_avg_1st_won' 'o_avg_2nd_won'
 'o_avg_sv_gms' 'o_avg_bp_sv' 'o_avg_bp_fc']


Categorical :
- Nominal :
    - Surface
- Ordinal :
    - o_matches
    - Level
    - p_matches

Numerical :
- Continous :
    - player_rank, opponent_rank
    - player_age, opponent_age
- Discrete
    - Best_of
    - player_elo_rating, opponent_elo_rating
    - player_height, opponent_height
    
Mixed data types :
- Date
- Round

In [91]:
train_df.info()
print('_'*40)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2207 entries, 0 to 2206
Data columns (total 64 columns):
match_id               2207 non-null int64
date                   2207 non-null object
surface                2207 non-null object
level                  2207 non-null object
round                  2207 non-null object
best_of                2207 non-null int64
player_id              2207 non-null int64
opponent_id            2207 non-null int64
player_rank            2202 non-null float64
opponent_rank          2189 non-null float64
player_elo_rating      2207 non-null int64
opponent_elo_rating    2207 non-null int64
player_age             2207 non-null float64
opponent_age           2204 non-null float64
player_height          1483 non-null float64
opponent_height        1296 non-null float64
outcome                50 non-null object
p_matches              2207 non-null int64
o_matches              2207 non-null int64
p_sets                 2207 non-null int64
o_sets            

Features contain blank, null or empty values :
- train :
    - opponent_height
    - player_height
    - outcome
- test :
    - opponent_height
    - player_height
    - outcome

### Verify how representative is training dataset

In [92]:
train_df.describe()

Unnamed: 0,match_id,best_of,player_id,opponent_id,player_rank,opponent_rank,player_elo_rating,opponent_elo_rating,player_age,opponent_age,...,p_avg_bp_fc,o_avg_ace,o_avg_df,o_avg_sv_pt,o_avg_1st_in,o_avg_1st_won,o_avg_2nd_won,o_avg_sv_gms,o_avg_bp_sv,o_avg_bp_fc
count,2207.0,2207.0,2207.0,2207.0,2202.0,2189.0,2207.0,2207.0,2207.0,2204.0,...,2205.0,2205.0,2205.0,2205.0,2205.0,2205.0,2205.0,2205.0,2205.0,2205.0
mean,167540.609878,3.384232,8506.51246,9789.689624,84.681653,115.254454,1951.570005,1878.778885,27.714443,27.480259,...,6.681837,5.978014,3.034308,80.573252,49.119234,35.03824,15.931102,12.598077,4.127728,6.850766
std,854.439751,0.788106,8883.014754,10508.704583,163.937932,190.016918,179.000171,152.807228,4.682528,4.690043,...,1.314429,1.469343,0.494564,5.319698,3.522233,3.160558,1.518068,0.829204,0.652633,1.124125
min,166073.0,3.0,644.0,644.0,1.0,1.0,1463.0,1455.0,14.94,16.28,...,0.0,0.0,0.5,32.0,21.0,6.0,2.0,6.0,0.0,2.0
25%,166791.0,3.0,4868.0,4921.0,19.25,37.0,1844.0,1785.5,23.78,23.8875,...,5.85,4.98,2.8,78.52,47.49,33.37,15.23,12.25,3.81,6.17
50%,167543.0,3.0,5486.0,5630.0,48.0,68.0,1926.0,1882.0,28.04,27.635,...,6.66,5.84,2.97,80.42,48.83,34.92,15.85,12.53,4.14,6.84
75%,168267.5,3.0,6284.0,6364.0,88.0,113.0,2053.0,1957.0,31.23,30.8,...,7.48,6.98,3.14,82.56,50.58,36.35,16.61,12.93,4.46,7.48
max,169046.0,5.0,52643.0,52587.0,1821.0,1821.0,2472.0,2473.0,39.838467,39.838467,...,15.0,12.67,10.0,114.5,71.75,54.0,31.0,18.5,11.0,15.0


In [94]:
train_df[train_df['best_of']==3]['match_id'].count() / (train_df['match_id'].count())

0.8078840054372451

We have 80% of the matches that were in best of 3.

In [95]:
train_df['match_id'].count()

2207

We have 2943 matches in 2018

In [96]:
train_df.describe(include=['O'])

Unnamed: 0,date,surface,level,round,outcome
count,2207,2207,2207,2207,50
unique,44,3,6,8,1
top,15/01/2018,H,B,R32,RET
freq,101,1288,890,714,50


- 3 different values for surface (with top H)
- 6 different values for level (with top == B)
- There were 71 withdrawals

In [97]:
train_df[train_df['surface'] == 'H']['match_id'].count() / train_df['match_id'].count()

0.583597643860444

We have 58% of the matches that were in Hard

### Asumptions based on data analysis

- Correlating : We want to know how well does each feature correlate with Winning a game. We want to do this early in our project and match these quick correlations with modelled correlations later in the project.
- Completing : 
    - We may want to complete Age feature as it is definitely correlated to survival.
    - We may want to complete the Embarked feature as it may also correlate with survival or another important feature.
- Correcting : 
    - opponent_height and player_height features may be dropped as it is highly incomplete or contains many null values both in training and test dataset.
    - player_id and opponent_id features may be dropped from training dataset as it does not contribute to survival.
- Creating : 
    - We may want to create a new feature called p_serve based on ace...

- Classifying :


In [55]:
for dataset in train_df:
    dataset['ace_match'] = dataset['p_ace'] - dataset['o_ace']

TypeError: string indices must be integers