In [2]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv('nba.csv')
df.head(10)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0,PG,25,2-Jun,180,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99,SF,25,6-Jun,235,Marquette,6796117.0
2,John Holland,Boston Celtics,30,SG,27,5-Jun,205,Boston University,
3,R.J. Hunter,Boston Celtics,28,SG,22,5-Jun,185,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8,PF,29,10-Jun,231,,5000000.0
5,Amir Johnson,Boston Celtics,90,PF,29,9-Jun,240,,12000000.0
6,Jordan Mickey,Boston Celtics,55,PF,21,8-Jun,235,LSU,1170960.0
7,Kelly Olynyk,Boston Celtics,41,C,25,Jul-00,238,Gonzaga,2165160.0
8,Terry Rozier,Boston Celtics,12,PG,22,2-Jun,190,Louisville,1824360.0
9,Marcus Smart,Boston Celtics,36,PG,22,4-Jun,220,Oklahoma State,3431040.0


In [4]:
df.isnull().sum()

Name         0
Team         0
Number       0
Position     0
Age          0
Height       0
Weight       0
College     84
Salary      11
dtype: int64

In [5]:
# Numerical nan values
num_nan = [feat for feat in df.columns if df[feat].dtypes != 'O' and df[feat].isnull().sum() > 0]

for feat in num_nan:
    print('{} has {} % nan values'.format(feat,np.round(df[feat].isnull().mean(),4))) 

Salary has 0.0241 % nan values


In [6]:
# Categorical nan values
cat_nan = [feat for feat in df.columns if df[feat].dtypes == 'O' and df[feat].isnull().sum() > 0]

for feat in cat_nan:
    print('{} has {} % nan values'.format(feat,np.round(df[feat].isnull().mean(),4))) 

College has 0.1838 % nan values


In [7]:
# Handling missing values for categorical values

for feat in cat_nan:
    df[feat] = df[feat].fillna('Missing')

df

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0,PG,25,2-Jun,180,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99,SF,25,6-Jun,235,Marquette,6796117.0
2,John Holland,Boston Celtics,30,SG,27,5-Jun,205,Boston University,
3,R.J. Hunter,Boston Celtics,28,SG,22,5-Jun,185,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8,PF,29,10-Jun,231,Missing,5000000.0
...,...,...,...,...,...,...,...,...,...
452,Trey Lyles,Utah Jazz,41,PF,20,10-Jun,234,Kentucky,2239800.0
453,Shelvin Mack,Utah Jazz,8,PG,26,3-Jun,203,Butler,2433333.0
454,Raul Neto,Utah Jazz,25,PG,24,1-Jun,179,Missing,900000.0
455,Tibor Pleiss,Utah Jazz,21,C,26,3-Jul,256,Missing,2900000.0


In [8]:
# Handling missing values for numerical values

for feat in num_nan:
    avg_val = np.round(df[feat].mean(),2)
    df[feat] = df[feat].fillna(avg_val)
    
df

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0,PG,25,2-Jun,180,Texas,7730337.00
1,Jae Crowder,Boston Celtics,99,SF,25,6-Jun,235,Marquette,6796117.00
2,John Holland,Boston Celtics,30,SG,27,5-Jun,205,Boston University,4842684.11
3,R.J. Hunter,Boston Celtics,28,SG,22,5-Jun,185,Georgia State,1148640.00
4,Jonas Jerebko,Boston Celtics,8,PF,29,10-Jun,231,Missing,5000000.00
...,...,...,...,...,...,...,...,...,...
452,Trey Lyles,Utah Jazz,41,PF,20,10-Jun,234,Kentucky,2239800.00
453,Shelvin Mack,Utah Jazz,8,PG,26,3-Jun,203,Butler,2433333.00
454,Raul Neto,Utah Jazz,25,PG,24,1-Jun,179,Missing,900000.00
455,Tibor Pleiss,Utah Jazz,21,C,26,3-Jul,256,Missing,2900000.00


In [9]:
df['Age'] = np.where((df['Age'] >= 18) & (df['Age'] < 25),1,0)
df

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0,PG,0,2-Jun,180,Texas,7730337.00
1,Jae Crowder,Boston Celtics,99,SF,0,6-Jun,235,Marquette,6796117.00
2,John Holland,Boston Celtics,30,SG,0,5-Jun,205,Boston University,4842684.11
3,R.J. Hunter,Boston Celtics,28,SG,1,5-Jun,185,Georgia State,1148640.00
4,Jonas Jerebko,Boston Celtics,8,PF,0,10-Jun,231,Missing,5000000.00
...,...,...,...,...,...,...,...,...,...
452,Trey Lyles,Utah Jazz,41,PF,1,10-Jun,234,Kentucky,2239800.00
453,Shelvin Mack,Utah Jazz,8,PG,0,3-Jun,203,Butler,2433333.00
454,Raul Neto,Utah Jazz,25,PG,1,1-Jun,179,Missing,900000.00
455,Tibor Pleiss,Utah Jazz,21,C,0,3-Jul,256,Missing,2900000.00


In [10]:
df.dtypes

Name         object
Team         object
Number        int64
Position     object
Age           int32
Height       object
Weight        int64
College      object
Salary      float64
dtype: object