In [1]:
# imports 
import pandas as pd
from typing import Dict

In [2]:
# data ingestion 
df_players = pd.read_csv("../data/players.csv")
df_players.head()

Unnamed: 0,nflId,height,weight,birthDate,collegeName,position,displayName
0,25511,6-4,225,1977-08-03,Michigan,QB,Tom Brady
1,29550,6-4,328,1982-01-22,Arkansas,T,Jason Peters
2,29851,6-2,225,1983-12-02,California,QB,Aaron Rodgers
3,30842,6-6,267,1984-05-19,UCLA,TE,Marcedes Lewis
4,33084,6-4,217,1985-05-17,Boston College,QB,Matt Ryan


In [3]:
# basic info
df_players.info()
# missing values
df_players.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1697 entries, 0 to 1696
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   nflId        1697 non-null   int64 
 1   height       1697 non-null   object
 2   weight       1697 non-null   int64 
 3   birthDate    1210 non-null   object
 4   collegeName  1697 non-null   object
 5   position     1697 non-null   object
 6   displayName  1697 non-null   object
dtypes: int64(2), object(5)
memory usage: 92.9+ KB


nflId            0
height           0
weight           0
birthDate      487
collegeName      0
position         0
displayName      0
dtype: int64

In [4]:
# dataset dimensions
df_players.shape

(1697, 7)

In [5]:
# unique values
df_players.nunique()

nflId          1697
height           16
weight          179
birthDate       989
collegeName     225
position         19
displayName    1687
dtype: int64

In [6]:
# unique heights
df_players['height'].unique()

array(['6-4', '6-2', '6-6', '5-10', '6-8', '6-3', '6-0', '6-5', '6-1',
       '5-9', '5-11', '5-8', '6-7', '6-9', '5-6', '5-7'], dtype=object)

In [27]:
# Define the height mapping dictionary
height_map: Dict[str, float] = {
    "4-10": 147.32,
    "4-11": 149.86,
    "5-0": 152.40,
    "5-1": 154.94,
    "5-2": 157.48,
    "5-3": 160.02,
    "5-4": 162.56,
    "5-5": 165.10,
    "5-6": 167.74,
    "5-7": 170.18,
    "5-8": 172.72,
    "5-9": 175.26,
    "5-10": 177.80,
    "5-11": 180.34,
    "6-0": 182.88,
    "6-1": 185.45,
    "6-2": 187.96,
    "6-3": 190.50,
    "6-4": 193.04,
    "6-5": 195.58,
    "6-6": 198.12,
    "6-7": 200.66,
    "6-8": 203.20
}

# Ensure the 'height' column is in string format, strip spaces, and map the conversions
df_players['height'] = df_players['height'].astype(str)
df_players['height_meters'] = df_players['height'].map(height_map)
df_players["height_meters"] = df_players["height_meters"].round(2).astype(float)

df_players["weight_kg"] = (df_players["weight"] * 0.453592).round(2).astype(float)

# Display the updated DataFrame
df_players.head()

Unnamed: 0,nflId,height,weight,birthDate,collegeName,position,displayName,height_meters,weight_kg
0,25511,6-4,225,1977-08-03,Michigan,QB,Tom Brady,193.04,102.06
1,29550,6-4,328,1982-01-22,Arkansas,T,Jason Peters,193.04,148.78
2,29851,6-2,225,1983-12-02,California,QB,Aaron Rodgers,187.96,102.06
3,30842,6-6,267,1984-05-19,UCLA,TE,Marcedes Lewis,198.12,121.11
4,33084,6-4,217,1985-05-17,Boston College,QB,Matt Ryan,193.04,98.43


In [28]:
# group by position
count_players_by_position = df_players.groupby('position')['nflId'].count()
# find the average weight of each position
grouped_position = df_players.groupby('position')
mean_weight = grouped_position['weight_kg'].mean()
# find the average heiught of each position
mean_height = grouped_position['height_meters'].mean().round(2)
# join the two dataframes
df_mean = pd.concat([mean_weight, mean_height], axis=1)
df_mean.head(len(df_mean))




Unnamed: 0_level_0,weight_kg,height_meters
position,Unnamed: 1_level_1,Unnamed: 2_level_1
C,138.420328,192.87
CB,87.564392,182.58
DB,97.07,190.5
DE,123.640611,192.79
DT,138.870826,191.22
FB,112.556429,185.06
FS,92.022152,183.31
G,142.46243,194.27
ILB,106.59914,187.07
LB,114.76,195.58
