In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
metrics = pd.read_csv("Downloads/metrics.csv")

In [4]:
metrics.head()

Unnamed: 0,athlete_id,date,metric,value
0,1,2016-05-01,hip_mobility,36
1,1,2016-05-02,hip_mobility,36
2,1,2016-05-03,hip_mobility,56
3,1,2016-05-04,hip_mobility,24
4,1,2016-05-05,hip_mobility,35


In [5]:
metrics.describe(include='all')

Unnamed: 0,athlete_id,date,metric,value
count,43800.0,43800,43800,43800.0
unique,,730,2,
top,,2018-04-30,hip_mobility,
freq,,60,21900,
mean,15.5,,,124.974338
std,8.65554,,,107.841666
min,1.0,,,-108.0
25%,8.0,,,39.0
50%,15.5,,,56.0
75%,23.0,,,213.0


In [6]:
metrics['date'] = metrics['date'].astype('datetime64[ns]')
metrics.dtypes

athlete_id             int64
date          datetime64[ns]
metric                object
value                  int64
dtype: object

In [7]:
workload = pd.read_csv("Downloads/game_workload.csv")

In [8]:
workload.head()

Unnamed: 0,athlete_id,date,game_workload
0,1,2016-05-05,402
1,1,2016-05-08,365
2,1,2016-05-11,457
3,1,2016-05-16,405
4,1,2016-05-20,407


In [9]:
workload.describe(include='all')

Unnamed: 0,athlete_id,date,game_workload
count,2400.0,2400,2400.0
unique,,706,
top,,2017-03-26,
freq,,9,
mean,15.649167,,400.481667
std,8.735438,,47.991824
min,1.0,,225.0
25%,8.0,,368.0
50%,16.0,,400.0
75%,23.0,,433.25


In [10]:
workload['date'] = workload['date'].astype('datetime64[ns]')
workload.dtypes

athlete_id                int64
date             datetime64[ns]
game_workload             int64
dtype: object

In [11]:
injuries = pd.read_csv("Downloads/injuries.csv")

In [12]:
injuries.head()

Unnamed: 0,athlete_id,date
0,1,2016-05-11
1,1,2016-05-16
2,1,2016-07-28
3,1,2016-11-11
4,1,2016-12-16


In [13]:
injuries.describe(include = 'all')

Unnamed: 0,athlete_id,date
count,137.0,137
unique,,126
top,,2016-05-16
freq,,4
mean,15.605839,
std,9.653068,
min,1.0,
25%,6.0,
50%,18.0,
75%,24.0,


In [14]:
injuries['date'] = injuries['date'].astype('datetime64[ns]')
injuries.dtypes

athlete_id             int64
date          datetime64[ns]
dtype: object

In [15]:
"""Creating a new column named injury with all values as yes"""
injuries["injury"] = "Yes"

In [16]:
injuries.head()

Unnamed: 0,athlete_id,date,injury
0,1,2016-05-11,Yes
1,1,2016-05-16,Yes
2,1,2016-07-28,Yes
3,1,2016-11-11,Yes
4,1,2016-12-16,Yes


In [17]:
"""Merging the workload and injuries dataframes"""
games_data = pd.merge(workload,injuries,  how='left', left_on=['athlete_id','date'], right_on = ['athlete_id','date'])

In [18]:
games_data["injury"].fillna("No", inplace = True)
#playersData["game_workload"].fillna(0, inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  games_data["injury"].fillna("No", inplace = True)


In [19]:
games_data.head()

Unnamed: 0,athlete_id,date,game_workload,injury
0,1,2016-05-05,402,No
1,1,2016-05-08,365,No
2,1,2016-05-11,457,Yes
3,1,2016-05-16,405,Yes
4,1,2016-05-20,407,No


In [20]:
metrics.head()

Unnamed: 0,athlete_id,date,metric,value
0,1,2016-05-01,hip_mobility,36
1,1,2016-05-02,hip_mobility,36
2,1,2016-05-03,hip_mobility,56
3,1,2016-05-04,hip_mobility,24
4,1,2016-05-05,hip_mobility,35


In [21]:
"""Recording metric type as two seperate columns i.e transposing the groin_squeeze and hip_mobility rows into columns"""
new_metrics_df = metrics.pivot_table('value', ['athlete_id', 'date'], 'metric').reset_index()

In [22]:
new_metrics_df.head()

metric,athlete_id,date,groin_squeeze,hip_mobility
0,1,2016-05-01,297.0,36.0
1,1,2016-05-02,274.0,36.0
2,1,2016-05-03,291.0,56.0
3,1,2016-05-04,260.0,24.0
4,1,2016-05-05,284.0,35.0


In [23]:
new_metrics_df.shape

(21900, 4)

In [24]:
final_data = pd.merge(games_data,new_metrics_df,  how='left', left_on=['athlete_id','date'], right_on = ['athlete_id','date'])

In [25]:
final_data.shape

(2400, 6)

In [26]:
final_data.head()

Unnamed: 0,athlete_id,date,game_workload,injury,groin_squeeze,hip_mobility
0,1,2016-05-05,402,No,284.0,35.0
1,1,2016-05-08,365,No,250.0,41.0
2,1,2016-05-11,457,Yes,331.0,33.0
3,1,2016-05-16,405,Yes,260.0,38.0
4,1,2016-05-20,407,No,378.0,60.0


In [27]:
final_data['rest_period'] = final_data.groupby('athlete_id')['date'].diff()

In [28]:
final_data.head()

Unnamed: 0,athlete_id,date,game_workload,injury,groin_squeeze,hip_mobility,rest_period
0,1,2016-05-05,402,No,284.0,35.0,NaT
1,1,2016-05-08,365,No,250.0,41.0,3 days
2,1,2016-05-11,457,Yes,331.0,33.0,3 days
3,1,2016-05-16,405,Yes,260.0,38.0,5 days
4,1,2016-05-20,407,No,378.0,60.0,4 days


In [29]:

first_day = '2016-05-01'
date_object = pd.to_datetime(first_day)

In [30]:
final_data["rest_period"].fillna(final_data['date'] - date_object, inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  final_data["rest_period"].fillna(final_data['date'] - date_object, inplace = True)


In [31]:
final_data.head()

Unnamed: 0,athlete_id,date,game_workload,injury,groin_squeeze,hip_mobility,rest_period
0,1,2016-05-05,402,No,284.0,35.0,4 days
1,1,2016-05-08,365,No,250.0,41.0,3 days
2,1,2016-05-11,457,Yes,331.0,33.0,3 days
3,1,2016-05-16,405,Yes,260.0,38.0,5 days
4,1,2016-05-20,407,No,378.0,60.0,4 days


In [33]:
"""Converting the column into day format"""
final_data['rest_period'] = final_data['rest_period'].astype('timedelta64[ns]')

In [34]:
final_data.head()

Unnamed: 0,athlete_id,date,game_workload,injury,groin_squeeze,hip_mobility,rest_period
0,1,2016-05-05,402,No,284.0,35.0,4 days
1,1,2016-05-08,365,No,250.0,41.0,3 days
2,1,2016-05-11,457,Yes,331.0,33.0,3 days
3,1,2016-05-16,405,Yes,260.0,38.0,5 days
4,1,2016-05-20,407,No,378.0,60.0,4 days


## Statistical Analysis
Preparing the dataframe for statistical analysis/ machine learning by adding/removing the categorical variables

In [36]:
final_data.injury.replace(to_replace=['No', 'Yes'], value=[0, 1], inplace = True)
final_data = final_data[['injury','athlete_id','date','game_workload','groin_squeeze','hip_mobility','rest_period']]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  final_data.injury.replace(to_replace=['No', 'Yes'], value=[0, 1], inplace = True)
  final_data.injury.replace(to_replace=['No', 'Yes'], value=[0, 1], inplace = True)


In [37]:
final_data.head()

Unnamed: 0,injury,athlete_id,date,game_workload,groin_squeeze,hip_mobility,rest_period
0,0,1,2016-05-05,402,284.0,35.0,4 days
1,0,1,2016-05-08,365,250.0,41.0,3 days
2,1,1,2016-05-11,457,331.0,33.0,3 days
3,1,1,2016-05-16,405,260.0,38.0,5 days
4,0,1,2016-05-20,407,378.0,60.0,4 days


In [38]:
"""Creating dummy variables for categorical Athelete Ids"""
dummy_variables = pd.get_dummies(final_data['athlete_id'])
ready_data = pd.concat([final_data,dummy_variables], axis=1)

In [39]:
ready_data.head()

Unnamed: 0,injury,athlete_id,date,game_workload,groin_squeeze,hip_mobility,rest_period,1,2,3,...,21,22,23,24,25,26,27,28,29,30
0,0,1,2016-05-05,402,284.0,35.0,4 days,True,False,False,...,False,False,False,False,False,False,False,False,False,False
1,0,1,2016-05-08,365,250.0,41.0,3 days,True,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1,1,2016-05-11,457,331.0,33.0,3 days,True,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1,1,2016-05-16,405,260.0,38.0,5 days,True,False,False,...,False,False,False,False,False,False,False,False,False,False
4,0,1,2016-05-20,407,378.0,60.0,4 days,True,False,False,...,False,False,False,False,False,False,False,False,False,False


In [40]:
ready_data.drop('athlete_id', axis = 1, inplace= True)
ready_data.drop('date', axis = 1, inplace = True)

In [41]:
ready_data.head()

Unnamed: 0,injury,game_workload,groin_squeeze,hip_mobility,rest_period,1,2,3,4,5,...,21,22,23,24,25,26,27,28,29,30
0,0,402,284.0,35.0,4 days,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,0,365,250.0,41.0,3 days,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1,457,331.0,33.0,3 days,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1,405,260.0,38.0,5 days,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,0,407,378.0,60.0,4 days,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
