In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import mstats

In [4]:
# Load the athlete-events CSV (path is relative to the notebook's working directory).
data = pd.read_csv('athlete_events.csv')

In [5]:
# List the column labels of the loaded frame.
print(data.columns)

#4. print out the first 5 rows of the data to take a look at the data
print(data.head())

Index(['ID', 'Name', 'Sex', 'Age', 'Height', 'Weight', 'Team', 'NOC', 'Games',
       'Year', 'Season', 'City', 'Sport', 'Event', 'Medal'],
      dtype='object')
   ID                      Name Sex   Age  Height  Weight            Team  \
0   1                 A Dijiang   M  24.0   180.0    80.0           China   
1   2                  A Lamusi   M  23.0   170.0    60.0           China   
2   3       Gunnar Nielsen Aaby   M  24.0     NaN     NaN         Denmark   
3   4      Edgar Lindenau Aabye   M  34.0     NaN     NaN  Denmark/Sweden   
4   5  Christine Jacoba Aaftink   F  21.0   185.0    82.0     Netherlands   

   NOC        Games  Year  Season       City          Sport  \
0  CHN  1992 Summer  1992  Summer  Barcelona     Basketball   
1  CHN  2012 Summer  2012  Summer     London           Judo   
2  DEN  1920 Summer  1920  Summer  Antwerpen       Football   
3  DEN  1900 Summer  1900  Summer      Paris     Tug-Of-War   
4  NED  1988 Winter  1988  Winter    Calgary  Speed Skating 

In [6]:
# Same first five rows, shown through the notebook's table display instead of print().
data.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


In [7]:
def percent_null(df):
    """Return the percentage of missing values in each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        One entry per column of ``df``: the number of null cells in that
        column, multiplied by 100 and divided by the number of rows.
    """
    row_count = len(df)
    missing_per_column = df.isna().sum()
    return missing_per_column * 100 / row_count

In [8]:
# Percentage of missing values in every column of the full dataset.
percent_null(data)

ID         0.000000
Name       0.000000
Sex        0.000000
Age        3.494445
Height    22.193821
Weight    23.191180
Team       0.000000
NOC        0.000000
Games      0.000000
Year       0.000000
Season     0.000000
City       0.000000
Sport      0.000000
Event      0.000000
Medal     85.326207
dtype: float64

In [15]:
# Split the frame by dtype: numeric columns vs. object-dtype (text) columns.
# `np.object` was only an alias for the builtin `object`; it was deprecated in
# NumPy 1.20 and removed in 1.24 (AttributeError), so the builtin is used directly.
numeric_cols = data.select_dtypes(include=[np.number])
cat_cols = data.select_dtypes(include=[object])

In [16]:
# Missing-value percentages restricted to the numeric columns.
percent_null(numeric_cols)

ID         0.000000
Age        3.494445
Height    22.193821
Weight    23.191180
Year       0.000000
dtype: float64

In [17]:
# Missing-value percentages restricted to the object-dtype columns.
percent_null(cat_cols)

Name       0.000000
Sex        0.000000
Team       0.000000
NOC        0.000000
Games      0.000000
Season     0.000000
City       0.000000
Sport      0.000000
Event      0.000000
Medal     85.326207
dtype: float64

In [18]:
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [19]:
# Imputer that fills each missing value with the mean of its column.
imp_mean = SimpleImputer(strategy='mean')

In [20]:
# Learn the column means from numeric_cols and fill its missing values in one step.
# The result carries no column labels; it is re-wrapped in a DataFrame in the next cell.
data_imp_mean = imp_mean.fit_transform(numeric_cols)

In [21]:
# Re-wrap the imputed values as a DataFrame. Both the column labels and the row
# index of the source frame are restored so rows stay aligned with `data`
# (the same way data_imp_iter is built); without `index=` the rows would be
# silently relabelled 0..n-1.
data_imp_mean = pd.DataFrame(
    data_imp_mean,
    columns=numeric_cols.columns,
    index=numeric_cols.index,
)

In [29]:
# NOTE(review): this inspects the original, un-imputed numeric_cols, so the
# percentages are the same as before imputation. To confirm the mean imputation
# removed the gaps, data_imp_mean is presumably the frame to check — confirm intent.
percent_null(numeric_cols)

ID         0.000000
Age        3.494445
Height    22.193821
Weight    23.191180
Year       0.000000
dtype: float64

In [27]:
# Summary statistics before imputation; `count` differs per column because NaNs are skipped.
numeric_cols.describe()

Unnamed: 0,ID,Age,Height,Weight,Year
count,271116.0,261642.0,210945.0,208241.0,271116.0
mean,68248.954396,25.556898,175.33897,70.702393,1978.37848
std,39022.286345,6.393561,10.518462,14.34802,29.877632
min,1.0,10.0,127.0,25.0,1896.0
25%,34643.0,21.0,168.0,60.0,1960.0
50%,68205.0,24.0,175.0,70.0,1988.0
75%,102097.25,28.0,183.0,79.0,2002.0
max,135571.0,97.0,226.0,214.0,2016.0


In [28]:
# Summary statistics after mean imputation, for comparison with the cell above:
# counts are now equal across columns and the means are unchanged, while the
# standard deviations of Age/Height/Weight shrink because every gap was filled
# with the same central value.
data_imp_mean.describe()

Unnamed: 0,ID,Age,Height,Weight,Year
count,271116.0,271116.0,271116.0,271116.0,271116.0
mean,68248.954396,25.556898,175.33897,70.702393,1978.37848
std,39022.286345,6.280857,9.2781,12.57469,29.877632
min,1.0,10.0,127.0,25.0,1896.0
25%,34643.0,22.0,170.0,63.0,1960.0
50%,68205.0,25.0,175.33897,70.702393,1988.0
75%,102097.25,28.0,180.0,75.0,2002.0
max,135571.0,97.0,226.0,214.0,2016.0


In [30]:
# Model-based imputer: each column with gaps is predicted from the other columns.
# max_iter=5 caps the number of imputation rounds; sample_posterior=True draws each
# filled value from the estimator's predictive distribution rather than using the
# point prediction; random_state fixes those draws so reruns give the same values.
imp_iter = IterativeImputer(max_iter=5,sample_posterior=True,random_state=122)

In [31]:
# Fit the iterative imputer on the numeric columns and fill their missing values.
data_imp_iter = imp_iter.fit_transform(numeric_cols)

In [32]:
# Re-wrap the imputed values as a DataFrame, restoring the column labels and the
# row index of the original data so rows stay aligned with `data`.
data_imp_iter = pd.DataFrame(data_imp_iter,columns = numeric_cols.columns,index=data.index)

In [33]:
# Preview the result: rows 2 and 3, whose Height/Weight were NaN in the raw data,
# now carry imputed values.
data_imp_iter.head()

Unnamed: 0,ID,Age,Height,Weight,Year
0,1.0,24.0,180.0,80.0,1992.0
1,2.0,23.0,170.0,60.0,2012.0
2,3.0,24.0,184.153455,96.065758,1920.0
3,4.0,34.0,176.608324,74.595056,1900.0
4,5.0,21.0,185.0,82.0,1988.0


#### Data outliers and scaling

In [45]:
# Boolean row mask: True where every column's absolute z-score is below 3, i.e. the
# row has no value more than three standard deviations from its column mean.
# All columns of data_imp_iter take part, including ID and Year.
# NOTE(review): idx is not applied to any frame in the cells visible here.
idx = (np.abs(stats.zscore(data_imp_iter)) < 3).all(axis=1)

In [56]:
# Winsorize Age: the lowest 5% and highest 5% of values are clipped to the nearest
# remaining value instead of being dropped.
# NOTE(review): the result is only displayed, not assigned, so data_imp_iter['Age']
# itself is left unchanged — store it if the clipped values are needed later.
mstats.winsorize(data_imp_iter['Age'],limits=[0.05,0.05])

masked_array(data=[24., 23., 24., ..., 27., 30., 34.],
             mask=False,
       fill_value=1e+20)

In [57]:
from sklearn.preprocessing import StandardScaler