In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
import seaborn as sns
import warnings

In [2]:
#US Open point-by-point
pts = pd.read_csv('./data/2019-usopen-points.csv')

In [3]:
pts.head()

Unnamed: 0,match_id,ElapsedTime,SetNo,P1GamesWon,P2GamesWon,SetWinner,GameNo,GameWinner,PointNumber,PointWinner,...,P2TurningPoint,ServeNumber,WinnerType,WinnerShotType,P1DistanceRun,P2DistanceRun,RallyCount,ServeWidth,ServeDepth,ReturnDepth
0,2019-usopen-1101,0:00:00,1,0,0,0,1,0,0X,0,...,,0,0,0,0.0,0.0,0,,,
1,2019-usopen-1101,0:00:00,1,0,0,0,1,0,0Y,0,...,,0,0,0,0.0,0.0,0,,,
2,2019-usopen-1101,0:00:00,1,0,0,0,1,0,1,1,...,,2,0,0,29.095,33.48,9,BC,NCTL,D
3,2019-usopen-1101,0:00:55,1,0,0,0,1,0,2,2,...,,0,0,0,0.577,0.51,0,,,
4,2019-usopen-1101,0:01:22,1,0,0,0,1,0,3,1,...,,1,0,0,14.642,8.791,5,C,NCTL,ND


In [4]:
pts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47893 entries, 0 to 47892
Data columns (total 65 columns):
match_id              47893 non-null object
ElapsedTime           47893 non-null object
SetNo                 47893 non-null int64
P1GamesWon            47893 non-null int64
P2GamesWon            47893 non-null int64
SetWinner             47893 non-null int64
GameNo                47893 non-null int64
GameWinner            47893 non-null int64
PointNumber           47893 non-null object
PointWinner           47893 non-null int64
PointServer           47893 non-null int64
Speed_KMH             47893 non-null int64
Rally                 0 non-null float64
P1Score               47893 non-null object
P2Score               47893 non-null object
P1Momentum            47893 non-null int64
P2Momentum            47893 non-null int64
P1PointsWon           47893 non-null int64
P2PointsWon           47893 non-null int64
P1Ace                 47893 non-null int64
P2Ace                 47893 

In [5]:
#columns where all values are null
cols_to_drop = ['Serve_Direction','Winner_FH','Winner_BH','ServingTo',
                'P1TurningPoint','P2TurningPoint','P1FirstSrvIn',
                'P2FirstSrvIn','P1FirstSrvWon','P2FirstSrvWon',
               'P1SecondSrvIn','P2SecondSrvIn','P1SecondSrvWon',
               'P2SecondSrvWon','P1ForcedError','P2ForcedError',
               'Rally',]

In [6]:
#drop columns where all values are null
for col in cols_to_drop:
    pts = pts.drop(col, axis=1)

In [7]:
#drop rows where no points are played
pts = pts[pts['PointWinner'] != 0]

In [8]:
#create unique id based on point number AND match id
pts['match_id_simplified'] = [str(i.split('-')[2]) for i in np.array(pts.match_id)]
type(pts.match_id_simplified)
pts['unique_id'] = np.array(pts.match_id_simplified.values)+"-"+np.array(pts.PointNumber.values)

In [9]:
pts['ServeNumber']=[2 if value==0 else value for value in pts['ServeNumber']]

In [10]:
#elminate matches where there is no information other than who wins each point
matches_to_eliminate = []
for group in set(pts['match_id_simplified']):
    zeroes = pts[pts['Speed_MPH']==0].groupby('match_id_simplified').groups[group]
    if len(zeroes) > 55:
        matches_to_eliminate.append(group)


In [11]:
for group in matches_to_eliminate:
    pts = pts[pts['match_id_simplified'] != group]

In [37]:
#adding gender feature
pts['gender'] = ['W' if int(m_id)>2100 else 'M' for m_id in pts['match_id_simplified']]

In [38]:
pts.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28677 entries, 2 to 47892
Data columns (total 52 columns):
match_id               28677 non-null object
ElapsedTime            28677 non-null object
SetNo                  28677 non-null int64
P1GamesWon             28677 non-null int64
P2GamesWon             28677 non-null int64
SetWinner              28677 non-null int64
GameNo                 28677 non-null int64
GameWinner             28677 non-null int64
PointNumber            28677 non-null object
PointWinner            28677 non-null int64
PointServer            28677 non-null int64
Speed_KMH              28677 non-null int64
P1Score                28677 non-null object
P2Score                28677 non-null object
P1Momentum             28677 non-null int64
P2Momentum             28677 non-null int64
P1PointsWon            28677 non-null int64
P2PointsWon            28677 non-null int64
P1Ace                  28677 non-null int64
P2Ace                  28677 non-null int64
P1Winn

In [45]:
pts_test = pts.set_index('unique_id')

In [46]:
pts_test.head()

Unnamed: 0_level_0,match_id,ElapsedTime,SetNo,P1GamesWon,P2GamesWon,SetWinner,GameNo,GameWinner,PointNumber,PointWinner,...,WinnerShotType,P1DistanceRun,P2DistanceRun,RallyCount,ServeWidth,ServeDepth,ReturnDepth,match_id_simplified,gender,avg_speed_mph
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1101-1,2019-usopen-1101,0:00:00,1,0,0,0,1,0,1,1,...,0,29.095,33.48,9,BC,NCTL,D,1101,M,
1101-2,2019-usopen-1101,0:00:55,1,0,0,0,1,0,2,2,...,0,0.577,0.51,0,,,,1101,M,
1101-3,2019-usopen-1101,0:01:22,1,0,0,0,1,0,3,1,...,0,14.642,8.791,5,C,NCTL,ND,1101,M,
1101-4,2019-usopen-1101,0:01:46,1,0,0,0,1,0,4,2,...,0,8.739,13.514,6,C,NCTL,ND,1101,M,
1101-5,2019-usopen-1101,0:02:14,1,0,0,0,1,0,5,1,...,F,5.314,9.403,3,W,CTL,ND,1101,M,


In [47]:
pts_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28677 entries, 1101-1 to 2701-145
Data columns (total 51 columns):
match_id               28677 non-null object
ElapsedTime            28677 non-null object
SetNo                  28677 non-null int64
P1GamesWon             28677 non-null int64
P2GamesWon             28677 non-null int64
SetWinner              28677 non-null int64
GameNo                 28677 non-null int64
GameWinner             28677 non-null int64
PointNumber            28677 non-null object
PointWinner            28677 non-null int64
PointServer            28677 non-null int64
Speed_KMH              28677 non-null int64
P1Score                28677 non-null object
P2Score                28677 non-null object
P1Momentum             28677 non-null int64
P2Momentum             28677 non-null int64
P1PointsWon            28677 non-null int64
P2PointsWon            28677 non-null int64
P1Ace                  28677 non-null int64
P2Ace                  28677 non-null int64
P1W

In [51]:
pts_test[['Speed_MPH','ServeNumber','PointServer','PointNumber','match_id_simplified']]
pts_test.match_id_simplified.value_counts()

1123    372
1311    357
1108    347
1133    344
1145    343
1701    341
1155    326
1503    324
1310    321
1162    311
1313    303
1304    294
1231    292
1315    289
1225    280
1206    279
1502    275
1149    274
1105    270
1306    261
1407    258
1208    257
1154    254
1119    247
1302    247
1303    246
1230    245
1203    243
1501    240
1117    239
       ... 
2408    123
1403    122
2110    119
1150    114
2303    114
2160    113
2220    111
2133    110
2602    110
2304    110
2313    108
2156    108
2108    107
2143    106
2140    104
2116    103
2144    103
2201    103
2308    102
2311    102
2224    100
2301     91
2404     87
2134     87
2149     84
2309     83
2221     80
1132     68
2504     65
1217     52
Name: match_id_simplified, Length: 158, dtype: int64

In [13]:
matches = pd.read_csv('./data/2019-usopen-matches.csv')

In [None]:
matches[125:253]

In [117]:
pts_1['PointServer'][0]

1

In [145]:
#avg serve speed for all of the serves precdeding that point
#where serve is of the same type (i.e. first or second serve) 
#and where server == current server
pts_1['avg_speed_MPH'] = 0
pts_1['last_5_serves'] = 0
for match_id in set(pts_1['match_id_simplified']):
    for i,value in enumerate(pts_1['Speed_MPH']):
#         print(i)
#         print(pts_1['PointServer'][i])
        if pts_1['PointServer'][i] == 1:
            
            if pts_1['ServeNumber'][i] == 1:
#                 print(pts_1.iloc[0:i][(pts['PointServer']==1)&
#                                                  (pts_1['ServeNumber']==1)&
#                                                  (pts_1['match_id_simplified']==match_id)])
                moving_avg_serve_speed = np.mean(pts_1.iloc[0:i][(pts['PointServer']==1)&
                                                 (pts_1['ServeNumber']==1)&
                                                 (pts_1['match_id_simplified']==match_id)]['Speed_MPH'])
                pts_1['avg_speed_MPH'][i] = moving_avg_serve_speed
                
#                 last_5_serves = np.mean(pts[i-5:i][(pts['PointServer']==1)&
#                                                  (pts['ServeNumber']==1)&
#                                                  (pts['match_id_simplified']==match_id)]['Speed_MPH'])
#                 pts['last_5_serves'][i] = last_5_serves
                
            elif pts_1['ServeNumber'][i] == 2:
#                 print(pts_1.iloc[0:i])
#                 print(pts[pts['PointServer']==1])
            
#                 #print yhis query
#                 print('servenumber == 2')
#                 print(i)
#                 print(pts_1.iloc[0:3])

                moving_avg_serve_speed = np.mean(pts_1.iloc[0:i][(pts_1['PointServer']==1)&
                                                 (pts_1['ServeNumber']==2)&
                                                 (pts_1['match_id_simplified']==match_id)]['Speed_MPH'])
#                 print(moving_avg_serve_speed)
                pts_1['avg_speed_MPH'][i] = moving_avg_serve_speed
                
#                 last_5_serves = np.mean(pts[i-5:i][(pts['PointServer']==1)&
#                                                  (pts['ServeNumber']==2)&
#                                                  (pts['match_id_simplified']==match_id)]['Speed_MPH'])
#                 pts['last_5_serves'] = last_5_serves
        elif pts_1['PointServer'][i] == 2:
            if pts_1['ServeNumber'][i] == 1:
                moving_avg_serve_speed = np.mean(pts_1.iloc[0:i][(pts['PointServer']==2)&
                                                 (pts_1['ServeNumber']==1)&
                                                 (pts_1['match_id_simplified']==match_id)]['Speed_MPH'])
                pts_1['avg_speed_MPH'][i] = moving_avg_serve_speed
                
#                 last_5_serves = np.mean(pts[i-5:i][(pts['PointServer']==2)&
#                                                  (pts['ServeNumber']==1)&
#                                                  (pts['match_id_simplified']==match_id)]['Speed_MPH'])
#                 pts['last_5_serves'] = last_5_serves
                
            elif pts_1['ServeNumber'][i] == 2:
                moving_avg_serve_speed = np.mean(pts_1.iloc[0:i][(pts['PointServer']==2)&
                                                 (pts_1['ServeNumber']==2)&
                                                 (pts_1['match_id_simplified']==match_id)]['Speed_MPH'])
                pts_1['avg_speed_MPH'][i] = moving_avg_serve_speed
                
#                 last_5_serves = np.mean(pts[i-5:i][(pts['PointServer']==2)&
#                                                  (pts['ServeNumber']==2)&
#                                                  (pts['match_id_simplified']==match_id)]['Speed_MPH'])
#                 pts['last_5_serves'] = last_5_serves
                
                
                
                
                
                

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


ValueError: cannot convert float NaN to integer

In [141]:
pts_1.iloc[2]

match_id               2019-usopen-1101
ElapsedTime                     0:01:22
SetNo                                 1
P1GamesWon                            0
P2GamesWon                            0
SetWinner                             0
GameNo                                1
GameWinner                            0
PointNumber                           3
PointWinner                           1
PointServer                           1
Speed_KMH                           191
P1Score                              30
P2Score                              15
P1Momentum                            3
P2Momentum                            1
P1PointsWon                           2
P2PointsWon                           1
P1Ace                                 0
P2Ace                                 0
P1Winner                              0
P2Winner                              0
P1DoubleFault                         0
P2DoubleFault                         0
P1UnfErr                              0


In [None]:
pts[['PointNumber','PointServer','ServeNumber','P1Ace','P2Ace','P1DoubleFault','P2DoubleFault','Speed_MPH']][pts['Speed_MPH']==0]

In [None]:
match_groups = pts.groupby(['match_id_simplified','PointServer','ServeNumber'])['Speed_MPH']
match_groups

In [None]:
#get unique values for categories of interest
def get_col_unique_vals(df, col):
        return set(df[col])

unique_col_vals = [get_col_unique_vals(pts, col)for col in ['match_id_simplified','PointServer','ServeNumber']]
unique_col_vals[1] 
range(len(unique_col_vals))
# match_ids = set(pts['match_id_simplified'])
# point_server_vals = set(pts['PointServer'])
# serve_number_vals = set(pts['ServeNumber'])

In [66]:
len(pts)
pts.iloc[2]

match_id               2019-usopen-1101
ElapsedTime                     0:01:22
SetNo                                 1
P1GamesWon                            0
P2GamesWon                            0
SetWinner                             0
GameNo                                1
GameWinner                            0
PointNumber                           3
PointWinner                           1
PointServer                           1
Speed_KMH                           191
P1Score                              30
P2Score                              15
P1Momentum                            3
P2Momentum                            1
P1PointsWon                           2
P2PointsWon                           1
P1Ace                                 0
P2Ace                                 0
P1Winner                              0
P2Winner                              0
P1DoubleFault                         0
P2DoubleFault                         0
P1UnfErr                              0


In [None]:
pts.groupby('match_ids')

In [None]:
master_avgs = pd.Series()

In [59]:
pts['PointServer']

2        1
3        1
4        1
5        1
6        1
7        1
8        2
9        2
10       2
11       2
12       2
13       2
14       2
15       2
16       2
17       2
18       2
19       2
20       2
21       2
22       2
23       2
24       1
25       1
26       1
27       1
28       2
29       2
30       2
31       2
        ..
47863    1
47864    1
47865    1
47866    2
47867    2
47868    2
47869    2
47870    2
47871    1
47872    1
47873    1
47874    1
47875    2
47876    2
47877    2
47878    2
47879    2
47880    2
47881    1
47882    1
47883    1
47884    1
47885    1
47886    1
47887    2
47888    2
47889    2
47890    2
47891    2
47892    2
Name: PointServer, Length: 28677, dtype: int64

In [55]:
serve_number_vals

{1, 2}

In [169]:
match_ids = set(pts_1['match_id_simplified'])
point_server_vals = set(pts_1['PointServer'])
serve_number_vals = set(pts_1['ServeNumber'])

master_series = []#pd.Series()
for a in match_ids:
    print(a)
    for b in point_server_vals:
        print(b)
        for c in serve_number_vals:
            print(c)
            df_grouped = pts_1.groupby(['match_id_simplified','PointServer','ServeNumber']).get_group((a,b,c))
            
#             avg_speeds_series = (pd.Series([df_grouped['Speed_MPH'].iloc[:i].mean() 
#                                                    for i in range(len(df_grouped))], 
#                                                   index = df_grouped.index))
            
            master_series.append(avg_speeds_series)
#             print(avg_speeds_series)
            

    
    


1160
1
1
2
2
1
2
1407
1
1
2
2
1
2
2139
1
1
2
2
1
2
2222
1
1
2
2
1
2
1216
1
1
2
2
1
2
2307
1
1
2
2
1
2
1205
1
1
2
2
1
2
1314
1
1
2
2
1
2
2209
1
1
2
2
1
2
2208
1
1
2
2
1
2
2126
1
1
2
2
1
2
2602
1
1
2
2
1
2
1141
1
1
2
2
1
2
1303
1
1
2
2
1
2
1157
1
1
2
2
1
2
1113
1
1
2
2
1
2
2121
1
1
2
2
1
2
2217
1
1
2
2
1
2
2225
1
1
2
2
1
2
2140
1
1
2
2
1
2
2230
1
1
2
2
1
2
1304
1
1
2
2
1
2
1162
1
1
2
2
1
2
1406
1
1
2
2
1
2
2231
1
1
2
2
1
2
1308
1
1
2
2
1
2
2109
1
1
2
2
1
2
2407
1
1
2
2
1
2
2501
1
1
2
2
1
2
1164
1
1
2
2
1
2
1102
1
1
2
2
1
2
2303
1
1
2
2
1
2
1201
1
1
2
2
1
2
1315
1
1
2
2
1
2
2311
1
1
2
2
1
2
2141
1
1
2
2
1
2
1701
1
1
2
2
1
2
1231
1
1
2
2
1
2
2314
1
1
2
2
1
2
2125
1
1
2
2
1
2
1307
1
1
2
2
1
2
1305
1
1
2
2
1
2
1115
1
1
2
2
1
2
2120
1
1
2
2
1
2
1228
1
1
2
2
1
2
2211
1
1
2
2
1
2
2306
1
1
2
2
1
2
1602
1
1
2
2
1
2
1404
1
1
2
2
1
2
2305
1
1
2
2
1
2
2214
1
1
2
2
1
2
2224
1
1
2
2
1
2
1218
1
1
2
2
1
2
2148
1
1
2
2
1
2
2403
1
1
2
2
1
2
2137
1
1
2
2
1
2
1149
1
1
2
2
1
2
1222
1
1
2
2
1
2
2204
1
1
2
2
1

In [179]:
df_grouped = pts_1.groupby(['match_id_simplified','PointServer','ServeNumber']).get_group(('1101',1,1))[['PointNumber',
                                                                                            'PointServer',
                                                                                            'ServeNumber',
                                                                                           'Speed_MPH']]
df_grouped

Unnamed: 0_level_0,PointNumber,PointServer,ServeNumber,Speed_MPH
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1101-3,3,1,1,119
1101-4,4,1,1,117
1101-6,6,1,1,118
1101-23,23,1,1,119
1101-24,24,1,1,81
1101-25,25,1,1,110
1101-32,32,1,1,121
1101-33,33,1,1,113
1101-35,35,1,1,120
1101-36,36,1,1,122


In [178]:
pd.Series([df_grouped['Speed_MPH'].iloc[:i].mean() 
                                                   for i in range(len(df_grouped))], 
                                                  index = df_grouped.index)

unique_id
1101-3             NaN
1101-4      119.000000
1101-6      118.000000
1101-23     118.000000
1101-24     118.250000
1101-25     110.800000
1101-32     110.666667
1101-33     112.142857
1101-35     112.250000
1101-36     113.111111
1101-37     114.000000
1101-44     114.000000
1101-47     114.166667
1101-57     114.769231
1101-59     115.000000
1101-61     115.400000
1101-68     115.687500
1101-69     115.352941
1101-83     115.388889
1101-84     114.315789
1101-85     114.600000
1101-90     114.714286
1101-91     114.863636
1101-93     115.086957
1101-104    115.166667
1101-106    115.280000
1101-113    115.423077
1101-114    115.518519
1101-115    115.678571
1101-123    115.413793
1101-124    115.066667
1101-125    115.096774
1101-127    115.187500
1101-128    115.000000
1101-135    114.235294
1101-136    113.885714
1101-138    113.972222
1101-140    114.027027
1101-148    114.184211
1101-149    114.230769
1101-150    114.150000
1101-151    114.195122
1101-162    113.904762
1

In [170]:
master_series

[unique_id
 1206-7            NaN
 1206-8      91.000000
 1206-12     92.500000
 1206-14     61.666667
 1206-23     46.250000
 1206-33     56.000000
 1206-34     46.666667
 1206-37     53.142857
 1206-43     59.125000
 1206-45     52.555556
 1206-46     56.500000
 1206-47     61.090909
 1206-57     64.750000
 1206-58     67.384615
 1206-59     69.785714
 1206-61     65.133333
 1206-62     67.375000
 1206-64     68.882353
 1206-70     70.500000
 1206-71     72.052632
 1206-73     68.450000
 1206-84     69.571429
 1206-85     71.227273
 1206-94     72.173913
 1206-96     69.166667
 1206-97     70.320000
 1206-98     71.269231
 1206-107    72.444444
 1206-109    73.535714
 1206-122    74.620690
               ...    
 1206-149    75.800000
 1206-151    73.694444
 1206-166    74.243243
 1206-167    72.289474
 1206-168    72.923077
 1206-169    73.625000
 1206-184    74.121951
 1206-185    74.523810
 1206-188    72.790698
 1206-189    71.136364
 1206-199    71.688889
 1206-200    72.130435


In [152]:
pts_1.iloc[1101-4]['Speed_MPH']

90

In [153]:
pts_1.iloc[1101-3]['Speed_MPH']

87

In [154]:
pts_1.iloc[1101-2]['Speed_MPH']

98

In [155]:
pts_1.iloc[1101-1]['Speed_MPH']

96

In [159]:
x = [87, 98, 96]
np.mean(x)

93.66666666666667

In [91]:
# searching 
type(master_series)
len(master_series)
master_series[45]

47649          NaN
47652    81.000000
47656    40.500000
47658    55.333333
47659    61.750000
47661    65.800000
47665    54.833333
47671    58.285714
47674    61.125000
47686    63.666667
47695    66.300000
47698    60.272727
47706    61.833333
47715    63.230769
47716    64.142857
47718    59.866667
47722    60.937500
47727    61.882353
47729    62.944444
47741    63.947368
47742    64.600000
47744    65.380952
dtype: float64

In [94]:
#resetting index
pts_1 = pts.set_index('unique_id')

In [103]:
#rerunning grouping with new dataframe
match_ids = set(pts_1['match_id_simplified'])
point_server_vals = set(pts_1['PointServer'])
serve_number_vals = set(pts_1['ServeNumber'])

master_series = []#pd.Series()
for a in match_ids:
#     print(a)
    for b in point_server_vals:
#         print(b)
        for c in serve_number_vals:
            df_grouped = pts_1.groupby(['match_id_simplified','PointServer','ServeNumber']).get_group((a,b,c))
            avg_speeds_series = (pd.Series([df_grouped['Speed_MPH'].iloc[:i].mean() 
                                                   for i in range(len(df_grouped))], 
                                                  index = pts_1.index))
            master_series.append(avg_speeds_series)
#

ValueError: Length of passed values is 51, index implies 28677

In [96]:
master_series

[unique_id
 1160-7             NaN
 1160-8      112.000000
 1160-10     116.500000
 1160-17     114.666667
 1160-18     116.500000
 1160-19     114.800000
 1160-20     114.166667
 1160-21     113.857143
 1160-27     114.500000
 1160-32     113.111111
 1160-33     111.900000
 1160-39     112.090909
 1160-40     112.416667
 1160-43     112.230769
 1160-50     112.785714
 1160-51     113.400000
 1160-52     113.812500
 1160-65     114.117647
 1160-67     114.333333
 1160-68     113.789474
 1160-71     114.000000
 1160-78     113.476190
 1160-81     113.636364
 1160-90     113.173913
 1160-101    113.125000
 1160-103    113.280000
 1160-109    112.961538
 1160-111    112.851852
 1160-114    112.892857
 1160-115    113.137931
 1160-116    113.033333
 1160-123    112.806452
 1160-124    112.718750
 1160-127    112.545455
 1160-134    112.088235
 1160-135    111.628571
 1160-138    111.416667
 1160-145    111.567568
 1160-146    111.552632
 1160-147    111.410256
 1160-148    111.600000
 1160

In [102]:
#looking at index 1160-7 in pts
pts_1.iloc[1160-7]

match_id               2019-usopen-1109
ElapsedTime                     1:39:35
SetNo                                 3
P1GamesWon                            2
P2GamesWon                            3
SetWinner                             0
GameNo                                6
GameWinner                            0
PointNumber                         139
PointWinner                           2
PointServer                           2
Speed_KMH                           217
P1Score                               0
P2Score                              15
P1Momentum                           56
P2Momentum                          172
P1PointsWon                          59
P2PointsWon                          80
P1Ace                                 0
P2Ace                                 1
P1Winner                              0
P2Winner                              1
P1DoubleFault                         0
P2DoubleFault                         0
P1UnfErr                              0


In [104]:
pts_1.index

Index(['1101-1', '1101-2', '1101-3', '1101-4', '1101-5', '1101-6', '1101-7',
       '1101-8', '1101-9', '1101-10',
       ...
       '2701-136', '2701-137', '2701-138', '2701-139', '2701-140', '2701-141',
       '2701-142', '2701-143', '2701-144', '2701-145'],
      dtype='object', name='unique_id', length=28677)

In [36]:
master_series = pd.Series()
for a in match_ids:
    for b in point_server_vals:
        for c in serve_number_vals:
            df_grouped = pts_test.groupby(['match_id_simplified','PointServer','ServeNumber']).get_group((a,b,c))
            avg_speeds_series = (pd.Series([df_grouped['Speed_MPH'].iloc[:i].mean() 
                                                   for i in range(len(df_grouped))], 
                                                  index = df_grouped.index))
            master_series.append(avg_speeds_series)

In [29]:
master_series.append(avg_speeds_series)
master_series

Series([], dtype: float64)

In [None]:
def get_rolling_avg(df, cols):
#     unique_col_vals = [get_col_unique_vals(pts, col)for col in cols]
    unique_col_vals = [get_col_unique_vals(pts, col)for col in ['match_id_simplified','PointServer','ServeNumber']]
#     for i in unique_col_vals:
    for col in unique_col_vals
        for j in unique_col_vals[i]:
            df_grouped = df.groupby(cols).get_group
    df_grouped = df.groupby(cols).get_

In [None]:
cols=['match_id_simplified', 'PointServer', 'ServeNumber']
def get_rolling_avg(df, cols):
    unique_col_vals = [get_col_unique_vals(pts, col)for col in cols]
    
    for i in unique_col_vals:
        for j in unique_col_vals[i]:
            df_grouped = df.groupby(cols).get_group
    df_grouped = df.groupby(cols).get_

In [None]:
b = pts.groupby(['match_id_simplified', 'PointServer', 'ServeNumber']).get_group(('1101', 2, 1))

In [None]:
pd.Series([b['Speed_MPH'].iloc[:i].mean() for i in range(len(b))], index = b.index)

In [None]:
pd.Series([b['Speed_MPH'].iloc[:i].mean() for i in range(len(b))], index = b.index)

In [None]:
match_groups = pts.groupby(['match_id_simplified','PointServer','ServeNumber'])
match_groups = pts.groupby(['match_id_simplified',
                                    'PointServer','ServeNumber']).get_group(('1102',
                                                                             1,1))['Speed_MPH'].iloc[:10]

avg = np.mean(pts.groupby(['match_id_simplified',
                                    'PointServer','ServeNumber']).get_group(('1102',
                                                                             1,1))['Speed_MPH'].iloc[:10])
print(match_groups)
print(avg)

In [None]:
pts_reduced = pts.iloc[5:10]
pts_reduced

match_groups = pts_reduced.groupby(['match_id_simplified',
                                    'PointServer','ServeNumber']).get_group(('1101',
                                                                             1,1))['Speed_MPH'].iloc[:9]

avg = np.mean(pts_reduced.groupby(['match_id_simplified',
                                    'PointServer','ServeNumber']).get_group(('1101',
                                                                             1,1))['Speed_MPH'].iloc[:9])
print(match_groups)
print(avg)

In [None]:
pts_reduced[['PointServer','PointNumber','ServeNumber','Speed_MPH']]

In [None]:
for index in pts_reduced.index:
    avg = np.mean(pts_reduced.groupby(['match_id_simplified',
                                        'PointServer',
                                        'ServeNumber']).get_group(('1101',1,1))['Speed_MPH'].loc[:index])
    print(index, avg)

In [None]:
pts[0:2][(pts['PointServer']==1)&(pts['ServeNumber']==1)&(pts['match_id_simplified']=='1101')]['Speed_MPH']

In [None]:
np.mean(pts[0:2][(pts['PointServer']==1)&(pts['ServeNumber']==2)&(pts['match_id_simplified']=='1101')]['Speed_MPH'])