# Player Injury Data Analysis - Section 4
Take injury and in-game feature sets of players to predict probability of injury in an upcoming game

This notebook assesses and visualises the impact of features on injury probability

In [None]:
import pandas as pd
import category_encoders as ce
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from geopy.distance import distance as latlongdist
import os

In [None]:
# Every path in this notebook is written relative to the directory that contains
# `data/`. Only step up one level when that folder is not already visible from the
# current working directory, so re-running this cell (without restarting the
# kernel) does not climb a further directory and break the relative paths.
if not os.path.isdir('data/overview_data'):
    os.chdir("..")
player_df = pd.read_csv('data/overview_data/all_players.csv')
games_df = pd.read_csv('data/overview_data/games_data.csv')
weather_df = pd.read_csv('data/overview_data/weather_data.csv')
player_bio_df = pd.read_csv('data/overview_data/player_bios.csv')

In [None]:
def get_weather_data(date, team_id):
    """Return the weather/location features for the game `team_id` played on `date`.

    Parameters
    ----------
    date : value comparable with ``weather_df['game_date']``
        Date of the game to look up.
    team_id : value comparable with the ``home_team_id`` / ``away_team_id`` columns
        Team whose game is looked up (either as home or away side).

    Returns
    -------
    pandas.DataFrame
        The matching row(s) of the module-level ``weather_df`` restricted to the
        weather/location columns, plus two added columns:
        ``distance`` - geodesic distance in km between the latitude/longitude of
        this game and of the team's previous game (by ``game_date`` order), or 0
        when there is no previous game or the game is not found;
        ``date`` - the `date` argument.
    """
    involves_team = (weather_df['home_team_id'] == team_id) | (weather_df['away_team_id'] == team_id)
    feature_cols = ['attendance', 'latitude','longitude', 'elevation', 'temp', 'snow', 'windspeedMiles','winddirDegree', 'weatherCode', 'precipMM', 'humidity', 'visibility','pressure', 'cloudcover', 'HeatIndexC', 'WindChillC', 'WindGustMiles','FeelsLikeC']
    # .copy() gives an independent frame, so the column assignments below do not
    # write into a slice of weather_df (avoids SettingWithCopyWarning).
    weather = weather_df.loc[(weather_df['game_date'] == date) & involves_team, feature_cols].copy()

    # All games of this team in chronological order, re-indexed 0..n-1 so that
    # "position - 1" is the team's previous game.
    team_games = weather_df[involves_team].sort_values('game_date').reset_index(drop=True)
    positions = team_games.index[team_games['game_date'] == date]
    # Test the match count explicitly: the truth value of an Index comparison is
    # ambiguous when there are zero or several matches. If several rows match,
    # the first one is used.
    if len(positions) > 0 and positions[0] > 0:
        pos = positions[0]
        pointA = (team_games.loc[pos-1, 'latitude'], team_games.loc[pos-1, 'longitude'])
        pointB = (team_games.loc[pos, 'latitude'], team_games.loc[pos, 'longitude'])
        distance = latlongdist(pointA, pointB).km
    else:
        distance = 0
    weather['distance'] = distance
    weather['date'] = date
    return weather
    
def get_opponent_data(date, team_id):
    """Return the opponent's feature row(s) for the game `team_id` played on `date`.

    The opponent is the other side of the matching row in the module-level
    ``games_df``. Its feature file ``data/team_data/<opponent>_features.csv`` is
    read, every column is prefixed with ``opp_`` (``opp_date`` is renamed back to
    ``date``), and the in-game count columns are replaced by an exponentially
    weighted mean (``com=0.25``) shifted down by one row, so the value on a row
    is built from earlier rows only.

    Returns the row(s) of that frame whose ``date`` equals `date`.
    """
    involves_team = (games_df['home_team_id'] == team_id) | (games_df['away_team_id'] == team_id)
    game = games_df[(games_df['game_date'] == date) & involves_team]

    # The opponent is whichever side of the fixture is not `team_id`.
    team_is_home = game['home_team_id'].values[0] == team_id
    opponent = game['away_team_id'].values[0] if team_is_home else game['home_team_id'].values[0]

    opponent_df = (
        pd.read_csv('data/team_data/'+str(opponent)+'_features.csv')
        .add_prefix('opp_')
        .rename(columns={'opp_date': 'date'})
    )

    opp_rolling_cols = ['opp_num_tackles', 'opp_num_fouls',
       'opp_num_bad_touches', 'opp_num_touches', 'opp_num_dribbles',
       'opp_num_times_tackled', 'opp_num_times_fouled','opp_goal_diff']
    # Each smoothed column depends only on its own original values, so they can
    # all be computed first and written back in one go.
    smoothed = {
        col: opponent_df[col].ewm(com=0.25,min_periods=1,adjust=True).mean().shift(periods=1)
        for col in opp_rolling_cols
    }
    opponent_df = opponent_df.assign(**smoothed)

    return opponent_df[opponent_df['date'] == date]

##Change csv file name for different data sources
def get_injured_data(date, team_id, player_id):
    """Return the injury outcome of `player_id` in the game `team_id` played on `date`.

    The game is looked up in the module-level ``games_df``; its ``game_id``
    locates ``data/game_data/<game_id>/<game_id>_injured.csv``. The result is the
    row(s) of that file for `player_id`, restricted to the ``injured`` and
    ``injury_type`` columns.
    """
    involves_team = (games_df['home_team_id'] == team_id) | (games_df['away_team_id'] == team_id)
    game = games_df[(games_df['game_date'] == date) & involves_team]

    # The game id is both the folder name and the file-name prefix.
    game_id = str(game['game_id'].values[0])
    injured_df = pd.read_csv('data/game_data/'+game_id+'/'+game_id+'_injured.csv')

    return injured_df.loc[injured_df['player_id'] == player_id, ['injured','injury_type']]

#Collates all the different data aspects and put them together to form one feature set
def get_full_player_feature_set(player_id):
    """Build the per-game feature table for one player.

    Steps:
      1. Read the player's game-feature and injury-feature CSVs from
         ``data/player_data/<player_id>/`` and inner-merge them on
         ``date``, ``player_id``, ``player_name`` and ``team_id``.
      2. For every game row, fetch the opponent features (``get_opponent_data``)
         and the weather/location features (``get_weather_data``) and merge both
         onto the table by ``date``.
      3. For every game row, fetch the injury outcome (``get_injured_data``) and
         look up the player's height and date of birth in the module-level
         ``player_bio_df`` to derive the age in whole years on the game date.

    Returns the merged frame with ``height``, ``age``, ``injured`` and
    ``injured_type`` appended as the last four columns (``injured_type`` holds
    the ``injury_type`` value of the injury file).
    """
    player_prefix = 'data/player_data/'+str(player_id)+'/'+str(player_id)
    game_features = pd.read_csv(player_prefix+'_gamefeatures_with_physical_updated.csv')
    injury_features = pd.read_csv(player_prefix+'_injuryfeatures.csv')
    merged_df = pd.merge(game_features, injury_features, on=['date','player_id','player_name','team_id'])

    # One opponent record and one weather record per game of the player.
    opponent_rows = []
    weather_rows = []
    for _, game_row in merged_df.iterrows():
        opponent_rows.append(get_opponent_data(game_row['date'], game_row['team_id']).squeeze())
        weather_rows.append(get_weather_data(game_row['date'], game_row['team_id']).squeeze())

    # The squeezed records are placed side by side as columns, then transposed
    # so that each game becomes one row again.
    opp_df = pd.concat(opponent_rows, axis=1).T
    game_weather_df = pd.concat(weather_rows, axis=1).T
    merged_df = pd.merge(merged_df, opp_df, on=['date'])
    merged_df = pd.merge(merged_df, game_weather_df, on=['date'])

    injury_rows = []
    ages = []
    heights = []
    for _, game_row in merged_df.iterrows():
        injury_rows.append(get_injured_data(game_row['date'], game_row['team_id'], game_row['player_id']).squeeze())

        bio = player_bio_df[player_bio_df['player_id'] == game_row['player_id']]
        heights.append(bio['height'].values[0])

        game_day = datetime.strptime(game_row['date'], '%Y-%m-%d %H:%M:%S')
        birth_day = datetime.strptime(bio['dob'].values[0], '%Y-%m-%d')
        # Subtract one year when this year's birthday has not been reached yet
        # (the boolean counts as 0 or 1 in the subtraction).
        birthday_pending = (game_day.month, game_day.day) < (birth_day.month, birth_day.day)
        ages.append(game_day.year - birth_day.year - birthday_pending)

    # Positional index so the injury columns line up with merged_df's rows.
    injury_game_df = pd.concat(injury_rows, axis=1).T.reset_index(drop=True)
    merged_df['height'] = heights
    merged_df['age'] = ages
    merged_df['injured'] = injury_game_df['injured'].copy()
    merged_df['injured_type'] = injury_game_df['injury_type'].copy()
    return merged_df

In [None]:
# In-game columns that are replaced by a summary of the player's *earlier* games.
cols_rolling = ['goal_diff','mins_played','num_tackles', 'num_fouls', 'num_bad_touches', 'num_ball_touches',
       'num_dribbles', 'num_tackleds', 'num_fouleds']

# Collect one frame per player and concatenate once at the end. Growing a frame
# with pd.concat inside the loop re-copies all accumulated rows on every
# iteration, and seeding it with an empty DataFrame can influence result dtypes.
player_frames = []
for p in player_df['player_id']:
    print(player_df[player_df['player_id']==p])
    curr_df = get_full_player_feature_set(p)
    curr_df['opp_team_home'] = curr_df['opp_team_home'].astype(bool)
    curr_df = curr_df.drop(['result','opp_result'], axis=1)
    print(curr_df[['date','num_tackles','mins_played','rolling_mins_played_exp','days_diff','rolling_days_diff_exp','num_injuries']])
    for col in cols_rolling:
        # Exponentially weighted mean (com=0.25) shifted down by one row: each
        # game's value is built from the preceding games only, never from the
        # game itself. The first game of a player therefore becomes NaN.
        curr_df[col] = curr_df[col].ewm(com=0.25,min_periods=1,adjust=True).mean().shift(periods=1)
    print(curr_df)
    player_frames.append(curr_df)

# pd.concat raises on an empty list, so fall back to an empty frame when there
# are no players.
if player_frames:
    player_feature_df = pd.concat(player_frames, ignore_index=True)
else:
    player_feature_df = pd.DataFrame([])
player_feature_df = player_feature_df.rename(columns={'pred_dist':'dist_covered'})

In [None]:
# Opponent columns that must be float for the plots and statistics below.
# NOTE(review): presumably these arrive as object dtype because the opponent rows
# are assembled via concat(...).T - confirm.
opp_numeric_cols = [
    'opp_num_tackles', 'opp_num_fouls', 'opp_num_bad_touches', 'opp_num_touches',
    'opp_num_dribbles', 'opp_num_times_tackled', 'opp_num_times_fouled',
    'opp_goal_diff', 'opp_matches_played', 'opp_days_since_last_game',
    'opp_games_in_last_month',
]
player_feature_df[opp_numeric_cols] = player_feature_df[opp_numeric_cols].astype(float)

## Data Analysis
Data has games from the 2017/18 and 2018/19 Premier League seasons

## Step 1: All features

In [None]:
# The last two columns are treated as the targets; everything before them as features.
feature_columns = player_feature_df.columns[:-2]
target_columns = player_feature_df.columns[-2:]
print("Features: ", feature_columns)
print("Target Variables: ", target_columns)

## Step 2: Exploring the target variable

In [None]:
# Summary statistics of the 'injured' target column.
player_feature_df['injured'].describe()

In [None]:
# Bar chart of the absolute class counts of the target ...
injured_counts = player_feature_df['injured'].value_counts()
injured_counts.plot(kind='bar')

# ... and the same split as percentages (3 decimals) printed underneath.
injured_share_pct = player_feature_df['injured'].value_counts(normalize=True).mul(100).round(3)
print(injured_share_pct.astype(str)+"%")

## Step 3: Explore relationship between variables

In [None]:
# Three players are excluded from the rest of the analysis (the reason is not
# recorded in this notebook). .copy() makes the filtered frame independent, so
# the column assignments in later cells do not operate on a slice of the
# unfiltered frame (SettingWithCopyWarning).
excluded_player_ids = [3599, 3702, 3601]
player_feature_df = player_feature_df[~player_feature_df['player_id'].isin(excluded_player_ids)].copy()

# One box plot per feature, split by the 'injured' flag. The list is laid out
# row by row, four features per grid row.
# NOTE(review): this cell uses 'days_diff' / 'rolling_days_diff_exp' while other
# cells of the notebook use 'days_since_last_game' /
# 'rolling_days_since_last_game_exp' - confirm that both sets of columns exist.
boxplot_features = [
    'mins_played', 'matches_played', 'num_tackles', 'num_fouls',
    'num_bad_touches', 'num_ball_touches', 'num_dribbles', 'num_tackleds',
    'num_fouleds', 'days_diff', 'rolling_mins_played_exp', 'num_injuries',
    'total_days_out', 'total_games_missed', 'days_since_last_injury', 'days_out_last_injury',
    'games_missed_last_injury', 'frequency_most_prominent_injury', 'days_out_most_prominent_injury', 'games_missed_most_prominent_injury',
    'days_since_most_prominent_injury', 'days_out_most_serious_injury', 'games_missed_most_serious_injury', 'days_since_most_serious_injury',
    'injuries_past_three_months', 'injuries_past_six_months', 'injuries_past_twelve_months', 'rolling_days_diff_exp',
    'opp_num_tackles', 'opp_num_fouls', 'opp_num_bad_touches', 'opp_num_touches',
    'distance', 'temp', 'windspeedMiles', 'humidity',
    'dist_covered', 'num_hirs', 'num_sprints', 'accels',
    'acute_workload', 'chronic_workload', 'ACWR', 'decels',
]

n_grid_cols = 4
# Ceiling division: 44 features -> 11 rows.
n_grid_rows = -(-len(boxplot_features) // n_grid_cols)
fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(20,40))
plt.rc('font', size=12)
# axes.flat walks the grid row by row, matching the order of the list above.
for ax, feature in zip(axes.flat, boxplot_features):
    sns.boxplot(x="injured", y=feature, data=player_feature_df, orient='v', ax=ax)
fig.tight_layout()

In [None]:
# Store the injured flag as an integer; the following cells pass it to
# regression pair plots as a numeric variable.
player_feature_df['injured'] = player_feature_df['injured'].astype(int)

In [None]:
# Regression pair plot: physical-load features against each other and the target.
physical_load_cols = [
    'dist_covered', 'metres_per_min', 'hir_dist',
    'sprint_dist', 'num_hirs', 'num_sprints', 'accels', 'decels',
    'LI_accels', 'LI_decels', 'acute_workload', 'chronic_workload', 'ACWR', 'injured',
]
red_fit_line = {'line_kws': {'color': 'red'}}
pp = sns.pairplot(player_feature_df[physical_load_cols], kind='reg', plot_kws=red_fit_line)

# Limit the y-range of every panel whose y-axis is labelled 'injured' to [0, 0.2].
for ax in pp.axes.flat:
    if ax.get_ylabel() != 'injured':
        continue
    ax.set_ylim([0, 0.2])

In [None]:
# Regression pair plot: in-game action counts against each other and the target.
in_game_action_cols = [
    'num_tackles',
    'num_fouls', 'num_bad_touches', 'num_ball_touches', 'num_dribbles',
    'num_tackleds', 'num_fouleds', 'injured',
]
red_fit_line = {'line_kws': {'color': 'red'}}
pp = sns.pairplot(player_feature_df[in_game_action_cols], kind='reg', plot_kws=red_fit_line)

# Limit the y-range of every panel whose y-axis is labelled 'injured' to [0, 0.2].
for ax in pp.axes.flat:
    if ax.get_ylabel() != 'injured':
        continue
    ax.set_ylim([0, 0.2])

In [None]:
# Regression pair plot: injury-history and recent-load features against the target.
injury_history_cols = [
    'num_injuries', 'total_days_out',
    'total_games_missed', 'days_since_last_injury', 'days_out_last_injury',
    'rolling_days_since_last_game_exp', 'rolling_mins_played_exp', 'injured',
]
red_fit_line = {'line_kws': {'color': 'red'}}
pp = sns.pairplot(player_feature_df[injury_history_cols], kind='reg', plot_kws=red_fit_line)

# Limit the y-range of every panel whose y-axis is labelled 'injured' to [0, 0.2].
for ax in pp.axes.flat:
    if ax.get_ylabel() != 'injured':
        continue
    ax.set_ylim([0, 0.2])

In [None]:
# Travel/weather/crowd columns are converted to float before being plotted.
match_context_cols = ['distance', 'temp', 'humidity', 'attendance', 'visibility']
player_feature_df[match_context_cols] = player_feature_df[match_context_cols].astype(float)

In [None]:
# Regression pair plot: travel/weather/crowd features plus height and age against the target.
context_and_bio_cols = ['distance', 'temp', 'humidity', 'attendance', 'visibility', 'height', 'age', 'injured']
red_fit_line = {'line_kws': {'color': 'red'}}
pp = sns.pairplot(player_feature_df[context_and_bio_cols], kind='reg', plot_kws=red_fit_line)

# Limit the y-range of every panel whose y-axis is labelled 'injured' to [0, 0.2].
for ax in pp.axes.flat:
    if ax.get_ylabel() != 'injured':
        continue
    ax.set_ylim([0, 0.2])

In [None]:
# Regression pair plot: most-prominent / most-serious injury features against the target.
prominent_serious_cols = [
    'frequency_most_prominent_injury', 'days_out_most_prominent_injury',
    'games_missed_most_prominent_injury',
    'days_since_most_prominent_injury', 'days_out_most_serious_injury',
    'games_missed_most_serious_injury',
    'days_since_most_serious_injury', 'injured',
]
red_fit_line = {'line_kws': {'color': 'red'}}
pp = sns.pairplot(player_feature_df[prominent_serious_cols], kind='reg', plot_kws=red_fit_line)

# Limit the y-range of every panel whose y-axis is labelled 'injured' to [0, 0.2].
for ax in pp.axes.flat:
    if ax.get_ylabel() != 'injured':
        continue
    ax.set_ylim([0, 0.2])

In [None]:
# Regression pair plot: recent injury counts and rest-day features against the target.
recent_injury_rest_cols = [
    'injuries_past_three_months',
    'injuries_past_six_months', 'injuries_past_twelve_months', 'days_since_last_game',
    'rolling_days_since_last_game_exp', 'injured',
]
red_fit_line = {'line_kws': {'color': 'red'}}
pp = sns.pairplot(player_feature_df[recent_injury_rest_cols], kind='reg', plot_kws=red_fit_line)

# Limit the y-range of every panel whose y-axis is labelled 'injured' to [0, 0.2].
for ax in pp.axes.flat:
    if ax.get_ylabel() != 'injured':
        continue
    ax.set_ylim([0, 0.2])

In [None]:
# Regression pair plot: opponent action-count features against the target.
opponent_action_cols = [
    'opp_num_tackles', 'opp_num_fouls',
    'opp_num_bad_touches', 'opp_num_touches', 'opp_num_dribbles',
    'opp_num_times_tackled', 'opp_num_times_fouled', 'injured',
]
red_fit_line = {'line_kws': {'color': 'red'}}
pp = sns.pairplot(player_feature_df[opponent_action_cols], kind='reg', plot_kws=red_fit_line)

# Limit the y-range of every panel whose y-axis is labelled 'injured' to [0, 0.2].
for ax in pp.axes.flat:
    if ax.get_ylabel() != 'injured':
        continue
    ax.set_ylim([0, 0.2])

In [None]:
# Regression pair plot: opponent form/schedule features against the target.
opponent_schedule_cols = ['opp_goal_diff', 'opp_matches_played', 'opp_days_since_last_game', 'opp_games_in_last_month', 'injured']
red_fit_line = {'line_kws': {'color': 'red'}}
pp = sns.pairplot(player_feature_df[opponent_schedule_cols], kind='reg', plot_kws=red_fit_line)

# Limit the y-range of every panel whose y-axis is labelled 'injured' to [0, 0.2].
for ax in pp.axes.flat:
    if ax.get_ylabel() != 'injured':
        continue
    ax.set_ylim([0, 0.2])

In [None]:
# Pairwise correlation of the numeric and boolean columns only. The frame also
# holds text columns (e.g. 'date', 'player_name', 'injured_type'); since
# pandas 2.0 DataFrame.corr() no longer drops such columns silently and raises
# instead, so they are filtered out explicitly here. select_dtypes is used
# rather than corr(numeric_only=True) so the cell also runs on pandas versions
# that predate that keyword.
player_feature_df.select_dtypes(include=['number', 'bool']).corr()

## PCA

In [None]:
from sklearn.preprocessing import StandardScaler

# Drop the last column of the frame, then discard every row that has a missing
# value in any of the remaining columns.
columns_without_last = player_feature_df.columns[:-1]
player_feature_df_wonan = player_feature_df[columns_without_last].dropna()

# Features that enter the PCA.
pca_feature_cols = [
    'goal_diff',
    'mins_played', 'matches_played', 'num_tackles',
    'num_fouls', 'num_bad_touches', 'num_ball_touches', 'num_dribbles',
    'num_tackleds', 'num_fouleds', 'days_since_last_game', 'num_injuries', 'total_days_out',
    'total_games_missed', 'days_since_last_injury', 'days_out_last_injury',
    'games_missed_last_injury',
    'frequency_most_prominent_injury', 'days_out_most_prominent_injury',
    'games_missed_most_prominent_injury',
    'days_since_most_prominent_injury', 'days_out_most_serious_injury',
    'games_missed_most_serious_injury',
    'days_since_most_serious_injury', 'injuries_past_three_months',
    'injuries_past_six_months', 'injuries_past_twelve_months',
    'opp_num_tackles', 'opp_num_fouls',
    'opp_num_bad_touches', 'opp_num_touches', 'opp_num_dribbles',
    'opp_num_times_tackled', 'opp_num_times_fouled', 'opp_goal_diff',
    'opp_matches_played', 'opp_days_since_last_game', 'dist_covered', 'sprint_dist', 'accels', 'decels', 'hir_dist',
]
X = player_feature_df_wonan[pca_feature_cols]

# The target is taken by position: the last column that is left after the drop above.
last_remaining_col = player_feature_df_wonan.columns[-1]
y = player_feature_df_wonan[last_remaining_col]

# Standardise the features before the PCA.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(X)

In [None]:
from sklearn.decomposition import PCA

# Fit a 5-component PCA on the standardised features and keep the projected data.
pca = PCA(n_components=5)
pca_features = pca.fit_transform(x_scaled)

# Report the dimensionality before and after the projection.
shape_before = x_scaled.shape
shape_after = pca_features.shape
print('Shape before PCA: ', shape_before)
print('Shape after PCA: ', shape_after)

pca_df = pd.DataFrame(data=pca_features)

In [None]:
# Amount of variance explained by each of the fitted principal components.
pca.explained_variance_

In [None]:
# Component weights: pca.components_ has one row per principal component and one
# column per input feature.
loadings = pca.components_

# Read both sizes from the array itself. `pca.n_features_` is no longer available
# in current scikit-learn releases, and the PC names must be generated from the
# number of components, not the number of features (the original only worked
# because zip() stops at the shorter sequence).
n_components, n_features = loadings.shape

# Feature names before PCA
feature_names = X.columns

# PC names: PC1 .. PC<n_components>
pc_list = [f'PC{i}' for i in range(1, n_components + 1)]

# Match PC names to their weight vectors
pc_loadings = dict(zip(pc_list, loadings))

# Table with one row per feature and one column per principal component.
# (Evaluate `loadings_df` as the last line of its own cell to display it; a bare
# expression in the middle of a cell produces no output.)
loadings_df = pd.DataFrame.from_dict(pc_loadings)
loadings_df['feature_names'] = feature_names
loadings_df = loadings_df.set_index('feature_names')

# Rank features by the magnitude of their weight, ignoring the sign.
abs_loadings = loadings_df.abs()
for pc in pc_loadings:
    print("Top 5 most influential variables for " + pc + ": ")
    print(abs_loadings.sort_values(pc, ascending=False).index[:5])