# Project 3 - Data Preprocessing

In [214]:
import numpy as np
import pandas as pd

## NFL Injury Data

In [217]:
# Location of the raw NFL injury report CSV.
# NOTE(review): absolute, machine-specific path -- this cell only runs where this exact file exists.
injury_csv_path = '/Users/omar/Desktop/DSCI 372M/Projects/project3/NFL_injuries_data.csv'

# Read the raw injury reports; the bare name on the last line displays the frame.
injury_data = pd.read_csv(injury_csv_path)
injury_data

Unnamed: 0,season,game_type,team,week,gsis_id,position,full_name,first_name,last_name,report_primary_injury,report_secondary_injury,report_status,practice_primary_injury,practice_secondary_injury,practice_status,date_modified
0,2009.0,REG,ARI,1.0,00-0022084,WR,Anquan Boldin,Anquan,Boldin,Hamstring,,Questionable,Hamstring,,Limited Participation in Practice,
1,2009.0,REG,ARI,1.0,00-0026221,WR,Early Doucet,Early,Doucet,Ribs,,Questionable,Ribs,,Limited Participation in Practice,
2,2009.0,REG,ARI,1.0,00-0022101,QB,Brian St. Pierre,Brian,St. Pierre,Back,,Questionable,Back,,Limited Participation in Practice,
3,2009.0,REG,ARI,1.0,00-0025529,WR,Steve Breaston,Steve,Breaston,Knee,,Probable,Knee,,Full Participation in Practice,
4,2009.0,REG,ARI,1.0,00-0022786,S,Matt Ware,Matt,Ware,Shoulder,,Probable,Shoulder,,Full Participation in Practice,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72865,2022.0,SB,PHI,22.0,00-0028129,C,Jason Kelce,Jason,Kelce,,,,Not injury related - resting player,,Limited Participation in Practice,2023-02-10 00:26:50+00:00
72866,2022.0,SB,PHI,22.0,00-0032954,G,Isaac Seumalo,Isaac,Seumalo,,,,Not injury related - resting player,,Limited Participation in Practice,2023-02-10 00:27:07+00:00
72867,2022.0,SB,PHI,22.0,00-0030062,CB,Darius Slay,Darius,Slay,,,,Not injury related - resting player,,Limited Participation in Practice,2023-02-10 00:27:38+00:00
72868,2022.0,SB,PHI,22.0,00-0034381,DE,Josh Sweat,Josh,Sweat,,,,Not injury related - resting player,,Limited Participation in Practice,2023-02-10 00:28:12+00:00


## Cleaning:

In [220]:
# Give three columns shorter names, then discard the columns not carried forward.
renamed_columns = {
    'season': 'year',
    'full_name': 'player_name',
    'report_primary_injury': 'report_injury',
}
unused_columns = [
    'first_name',
    'last_name',
    'report_secondary_injury',
    'practice_secondary_injury',
    'practice_primary_injury',
]

# rename() and drop() both return new frames, so the raw `injury_data` is left untouched.
injury_data_clean = injury_data.rename(columns=renamed_columns).drop(columns=unused_columns)
injury_data_clean

Unnamed: 0,year,game_type,team,week,gsis_id,position,player_name,report_injury,report_status,practice_status,date_modified
0,2009.0,REG,ARI,1.0,00-0022084,WR,Anquan Boldin,Hamstring,Questionable,Limited Participation in Practice,
1,2009.0,REG,ARI,1.0,00-0026221,WR,Early Doucet,Ribs,Questionable,Limited Participation in Practice,
2,2009.0,REG,ARI,1.0,00-0022101,QB,Brian St. Pierre,Back,Questionable,Limited Participation in Practice,
3,2009.0,REG,ARI,1.0,00-0025529,WR,Steve Breaston,Knee,Probable,Full Participation in Practice,
4,2009.0,REG,ARI,1.0,00-0022786,S,Matt Ware,Shoulder,Probable,Full Participation in Practice,
...,...,...,...,...,...,...,...,...,...,...,...
72865,2022.0,SB,PHI,22.0,00-0028129,C,Jason Kelce,,,Limited Participation in Practice,2023-02-10 00:26:50+00:00
72866,2022.0,SB,PHI,22.0,00-0032954,G,Isaac Seumalo,,,Limited Participation in Practice,2023-02-10 00:27:07+00:00
72867,2022.0,SB,PHI,22.0,00-0030062,CB,Darius Slay,,,Limited Participation in Practice,2023-02-10 00:27:38+00:00
72868,2022.0,SB,PHI,22.0,00-0034381,DE,Josh Sweat,,,Limited Participation in Practice,2023-02-10 00:28:12+00:00


In [222]:
# Cast the float-typed 'year' and 'week' columns to integers
integer_columns = ['year', 'week']
injury_data_clean[integer_columns] = injury_data_clean[integer_columns].astype(int)

# Parse the 'date_modified' values into datetimes (missing entries become NaT)
parsed_timestamps = pd.to_datetime(injury_data_clean['date_modified'])
injury_data_clean['date_modified'] = parsed_timestamps

In [224]:
# Injury Format Cleaning --------------------------------------------------------------
# Normalises the free-text 'report_injury' labels into a smaller set of canonical
# categories and shortens the 'practice_status' labels.

# Convert all strings to lowercase and strip whitespace
injury_data_clean['report_injury'] = injury_data_clean['report_injury'].str.lower().str.strip()

# Replace injuries with correct injury titles.
# Series.replace applies a dict in a single pass: a value produced by one entry is
# NOT looked up again against the other keys. Every entry therefore has to map
# straight to its final canonical label (e.g. 'low back' -> 'back', not 'lower back').
injury_corrections = {'quadriceps': 'quadricep',
                      'right quadricep': 'quadricep',
                      'quad': 'quadricep',
                      'right thigh': 'thigh',
                      'left thigh': 'thigh',
                      'left quadricep': 'quadricep',   # was 'quadriceps', which is never collapsed afterwards
                      'qblique': 'oblique',
                      'migraines': 'migraine',
                      'tricep': 'triceps',
                      'lf. calf': 'calf',              # was 'left calf', which is itself a key -> 'calf'
                      'low back': 'back',              # was 'lower back', which is itself a key -> 'back'
                      'rt. thumb': 'thumb',            # was 'finger'; the other thumb variants map to 'thumb'
                      'right thumb': 'thumb',
                      'left thumb': 'thumb',
                      'right finger': 'finger',
                      'left ginger': 'finger',         # presumably a typo present in the raw data -- TODO confirm
                      'knees': 'knee',
                      'right knee': 'knee',
                      'left knee': 'knee',
                      'knee illness': 'knee',
                      'hips': 'hip',
                      'right hip': 'hip',
                      'left hip': 'hip',
                      'teeth': 'tooth',
                      'ankles': 'ankle',
                      'leg': 'lower leg',
                      'legs': 'lower leg',
                      'fibula': 'lower leg',
                      'tibia': 'lower leg',
                      'left calf': 'calf',
                      'right calf': 'calf',
                      'right hamstring': 'hamstring',
                      'left hamstring': 'hamstring',
                      'left groin': 'groin',
                      'right groin': 'groin',
                      'hands': 'hand',
                      'shoulders': 'shoulder',
                      'right shoulder': 'shoulder',
                      'r shoulder': 'shoulder',
                      'left shoulder': 'shoulder',
                      'left arm': 'arm',
                      'right arm': 'arm',
                      'l. arm': 'arm',
                      'forearm': 'arm',
                      'upper arm': 'arm',
                      'right upper arm': 'arm',
                      'left forearm': 'arm',
                      'right forearm': 'arm',
                      'right hand': 'hand',
                      'left hand': 'hand',
                      'left finger': 'finger',
                      'toes': 'toe',
                      'left toe': 'toe',
                      'right toe': 'toe',
                      'rib cage': 'ribs',
                      'rib': 'ribs',
                      'right shin': 'shin',
                      'left shin': 'shin',
                      'appendix': 'appendicitis',
                      # The former 'kidney': 'kidneys' entry was removed: combined with
                      # 'kidneys': 'kidney' below it swapped the two labels instead of merging them.
                      'bicep': 'biceps',
                      'right biceps': 'biceps',
                      'back spasms': 'back',
                      'lower back': 'back',
                      'right elbow': 'elbow',
                      'left elbow': 'elbow',
                      'left wrist': 'wrist',
                      'right wrist': 'wrist',
                      'left foot': 'foot',
                      'right foot': 'foot',
                      'left ankle': 'ankle',
                      'right ankle': 'ankle',
                      'core muscle injury': 'core',
                      'core muscle': 'core',
                      'medical illness': 'illness',
                      'flu': 'illness',
                      'illness (non-covid)': 'illness',
                      '(migraines)': 'migraine',
                      'abdominal': 'abdomen',
                      'eye lid': 'eye',
                      'lacerated kidney': 'kidney',
                      'kidneys': 'kidney',
                      'pectoral': 'chest',
                      'arrhythmia': 'cardiac',
                      'lung contusion': 'lung',
                      'lumbar': 'back',
                      'trapezius': 'back',
                      'hip flexor': 'hip',
                      'abdomin': 'abdomen',
                      'both knees': 'knee',
                      'other-stinger': 'stinger',
                      'right collarbone': 'collarbone'
                     }

# Guard against re-introducing chained entries: no canonical label may also be a key,
# otherwise some rows would keep an intermediate label after the single-pass replace.
chained_labels = set(injury_corrections) & set(injury_corrections.values())
assert not chained_labels, f"injury_corrections maps to labels that are also keys: {sorted(chained_labels)}"

injury_data_clean['report_injury'] = injury_data_clean['report_injury'].replace(injury_corrections)

# Replace non-injury values with 'not injury related'
non_injury_values = {'not injury related - discipline',
                     'not injury related - resting player',
                     'not injury related - did not travel',
                     'not injury related - other',
                     'non-football illness',
                     'not injury related - personal matter',
                     'non football injury'
                    }
# A set is not a documented `to_replace` type, so a (sorted, deterministic) list is passed instead.
injury_data_clean['report_injury'] = injury_data_clean['report_injury'].replace(
    sorted(non_injury_values), 'not injury related')

# Replace multiple injury values with 'multiple injuries'
multiple_values = {'chest/finger',
                   'chest/ankle/illness',
                   'chest/rib/finger',
                   'foot/toe/back',
                   'ribs/ankle/toe',
                   'shoulder, finger',
                   'knee, hamstring',
                   'left shoulder, left elbow',
                   'groin, knee, ankle',
                   'ribs / shoulder / illness',
                   'knee/shoulder/toe',
                   'ankle, knee, elbow',
                   'toe, pec, knee, hip'}
injury_data_clean['report_injury'] = injury_data_clean['report_injury'].replace(
    sorted(multiple_values), 'multiple injuries')

# Replace covid related values to 'covid'
covid_values = {'covid protocol',
                'covid protocols',
                'reserve/covid activation',
                'covid/reserve',
                'covid ramp up'}
injury_data_clean['report_injury'] = injury_data_clean['report_injury'].replace(
    sorted(covid_values), 'covid')

# Replace practice status with more precise titles
# ('Out (Definitely Will Not Play)' is folded into 'Did Not Participate').
practice_status_corrections = {'Did Not Participate In Practice': 'Did Not Participate',
                               'Limited Participation in Practice': 'Limited Participation',
                               'Full Participation in Practice': 'Full Participation',
                               'Out (Definitely Will Not Play)': 'Did Not Participate'}
injury_data_clean['practice_status'] = injury_data_clean['practice_status'].replace(practice_status_corrections)

injury_data_clean

Unnamed: 0,year,game_type,team,week,gsis_id,position,player_name,report_injury,report_status,practice_status,date_modified
0,2009,REG,ARI,1,00-0022084,WR,Anquan Boldin,hamstring,Questionable,Limited Participation,NaT
1,2009,REG,ARI,1,00-0026221,WR,Early Doucet,ribs,Questionable,Limited Participation,NaT
2,2009,REG,ARI,1,00-0022101,QB,Brian St. Pierre,back,Questionable,Limited Participation,NaT
3,2009,REG,ARI,1,00-0025529,WR,Steve Breaston,knee,Probable,Full Participation,NaT
4,2009,REG,ARI,1,00-0022786,S,Matt Ware,shoulder,Probable,Full Participation,NaT
...,...,...,...,...,...,...,...,...,...,...,...
72865,2022,SB,PHI,22,00-0028129,C,Jason Kelce,,,Limited Participation,2023-02-10 00:26:50+00:00
72866,2022,SB,PHI,22,00-0032954,G,Isaac Seumalo,,,Limited Participation,2023-02-10 00:27:07+00:00
72867,2022,SB,PHI,22,00-0030062,CB,Darius Slay,,,Limited Participation,2023-02-10 00:27:38+00:00
72868,2022,SB,PHI,22,00-0034381,DE,Josh Sweat,,,Limited Participation,2023-02-10 00:28:12+00:00


In [226]:
# Keep only the rows where the injury, the game status and the timestamp are all present
required_columns = ['report_injury', 'report_status', 'date_modified']
injury_data_clean = injury_data_clean.dropna(subset=required_columns)

# Put the columns in the desired order and renumber the rows from 0
column_order = [
    'player_name', 'gsis_id', 'position', 'team', 'year', 'week', 'game_type',
    'report_injury', 'report_status', 'practice_status', 'date_modified',
]
injury_data_clean = injury_data_clean[column_order].reset_index(drop=True)
injury_data_clean

Unnamed: 0,player_name,gsis_id,position,team,year,week,game_type,report_injury,report_status,practice_status,date_modified
0,Lance Moore,00-0023310,WR,NO,2009,17,REG,ankle,Out,Did Not Participate,2010-01-01 09:23:14+00:00
1,Pierre Thomas,00-0024813,RB,NO,2009,17,REG,ribs,Out,Did Not Participate,2010-01-01 09:23:14+00:00
2,Usama Young,00-0025453,S,NO,2009,17,REG,abdomen,Out,Did Not Participate,2010-01-01 09:23:14+00:00
3,David Thomas,00-0024301,TE,NO,2009,17,REG,calf,Doubtful,Did Not Participate,2010-01-01 09:23:14+00:00
4,Scott Shanle,00-0022040,LB,NO,2009,17,REG,concussion,Doubtful,Limited Participation,2010-01-01 09:23:14+00:00
...,...,...,...,...,...,...,...,...,...,...,...
47580,Mecole Hardman,00-0035140,WR,KC,2022,21,CON,pelvis,Questionable,Limited Participation,2023-01-27 20:48:44+00:00
47581,Jimmy Garoppolo,00-0031345,QB,SF,2022,21,CON,foot,Out,Did Not Participate,2023-01-27 21:03:25+00:00
47582,Elijah Mitchell,00-0036567,RB,SF,2022,21,CON,groin,Questionable,Did Not Participate,2023-01-27 21:03:17+00:00
47583,Ambry Thomas,00-0037008,CB,SF,2022,21,CON,ankle,Questionable,Limited Participation,2023-01-27 21:03:17+00:00


## Train, Validation, Test Split:

In [239]:
# Order the rows chronologically by 'date_modified'
injury_data_clean = injury_data_clean.sort_values(by='date_modified')

# Timestamp cut-offs at the 0.70 and 0.85 quantiles: 70% Train / 15% Validation / 15% Test
modified_at = injury_data_clean['date_modified']
train_end = modified_at.quantile(0.70)
val_end = modified_at.quantile(0.85)

# Row masks for each split; rows sharing a cut-off timestamp go to the earlier split (<=)
train_mask = modified_at <= train_end
val_mask = (modified_at > train_end) & (modified_at <= val_end)
test_mask = modified_at > val_end

# Train, Validation, Test Dataframes
injury_train = injury_data_clean[train_mask]
injury_val = injury_data_clean[val_mask]
injury_test = injury_data_clean[test_mask]

print(f"Training set size: {len(injury_train)}")
print(f"Validation set size: {len(injury_val)}")
print(f"Test set size: {len(injury_test)}")

Training set size: 33309
Validation set size: 7138
Test set size: 7138


In [241]:
# Write each split to its own CSV file in the working directory, omitting the row index
for split_frame, csv_name in (
    (injury_train, 'injury_train.csv'),
    (injury_val, 'injury_val.csv'),
    (injury_test, 'injury_test.csv'),
):
    split_frame.to_csv(csv_name, index=False)