In [1]:
%matplotlib inline

# Load the bears dataset

In [4]:
df = pd.read_csv("bears.csv",sep=";")

In [5]:
df.head()

Unnamed: 0,AGE,MONTH,SEX,HEADLEN,HEADWTH,NECK,LENGTH,CHEST,WEIGHT
0,19,7,1,11.0,5.5,16.0,53.0,26.0,80
1,55,7,1,16.5,9.0,28.0,67.5,45.0,344
2,81,9,1,15.5,8.0,31.0,72.0,54.0,416
3,115,7,1,17.0,10.0,31.5,72.0,49.0,348
4,104,8,2,15.5,6.5,22.0,62.0,35.0,166


+ Let us print some descriptive statistics for each column

In [6]:
df.describe()

Unnamed: 0,AGE,MONTH,SEX,HEADLEN,HEADWTH,NECK,LENGTH,CHEST,WEIGHT
count,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0
mean,43.518519,8.407407,1.351852,12.953704,6.194444,20.555556,58.616667,35.662963,182.888889
std,33.720684,2.004886,0.482032,2.144032,1.512266,5.64071,10.700868,9.351729,121.801209
min,8.0,4.0,1.0,9.0,4.0,10.0,36.0,19.0,26.0
25%,17.0,8.0,1.0,11.5,5.0,16.625,50.5,29.0,87.0
50%,34.0,9.0,1.0,13.0,6.0,20.0,60.75,34.0,150.0
75%,57.75,10.0,2.0,14.5,7.0,24.0,66.125,43.5,232.0
max,177.0,11.0,2.0,17.0,10.0,31.5,76.5,55.0,514.0


You can notice that the scales of the different attributes are quite different, this could be problematic for algorithms relying on distance measures (e.g. K-means, K-NN).

Let us standardize the attributes scales, using the well-known Z-score transformation.
(3 ways explained here)

+ __Method 1: Using Pandas only__

In [11]:
df_zscore1 = (df - df.mean()) / df.std()

In [12]:
df_zscore1.describe()

Unnamed: 0,AGE,MONTH,SEX,HEADLEN,HEADWTH,NECK,LENGTH,CHEST,WEIGHT
count,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0
mean,-2.2615650000000002e-17,4.60537e-16,-5.345518e-17,-2.60594e-16,-1.768133e-16,-3.083953e-16,7.735582e-17,-9.560254000000001e-17,1.9531700000000002e-17
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.053315,-2.198333,-0.7299344,-1.844051,-1.451096,-1.871317,-2.113536,-1.781806,-1.288073
25%,-0.786417,-0.2032073,-0.7299344,-0.6780234,-0.7898373,-0.6968193,-0.7585054,-0.7124846,-0.7872573
50%,-0.2822754,0.2955743,-0.7299344,0.0215931,-0.1285782,-0.09849036,0.1993608,-0.1778241,-0.270021
75%,0.4220401,0.7943558,1.344616,0.7212096,0.532681,0.6106402,0.7016564,0.8380308,0.4032071
max,3.958445,1.293137,1.344616,1.887237,2.516458,1.94026,1.671204,2.06775,2.718455


+ __Method 2: Using Scipy__

In [14]:
from scipy.stats import zscore

In [18]:
df_zscore2 = df.apply(zscore,axis=0)

In [19]:
df_zscore2.describe()

Unnamed: 0,AGE,MONTH,SEX,HEADLEN,HEADWTH,NECK,LENGTH,CHEST,WEIGHT
count,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0
mean,-1.2335810000000002e-17,3.577385e-16,-1.2335810000000002e-17,-2.117648e-16,-1.027984e-16,-2.446603e-16,3.340949e-17,-2.158767e-17,8.223874e-18
std,1.00939,1.00939,1.00939,1.00939,1.00939,1.00939,1.00939,1.00939,1.00939
min,-1.063206,-2.218976,-0.7367884,-1.861366,-1.464722,-1.888888,-2.133381,-1.798537,-1.300168
25%,-0.7938013,-0.2051154,-0.7367884,-0.6843899,-0.7972538,-0.7033623,-0.7656277,-0.7191748,-0.7946495
50%,-0.2849259,0.2983497,-0.7367884,0.02179586,-0.1297855,-0.09941517,0.2012327,-0.1794939,-0.2725565
75%,0.426003,0.8018147,1.357242,0.7279816,0.5376828,0.6163741,0.7082449,0.8458998,0.4069932
max,3.995615,1.30528,1.357242,1.904958,2.540088,1.958479,1.686896,2.087166,2.743981


+ __Method 3: Using sklearn__

In [22]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
ss = StandardScaler(with_mean=True,with_std=True)
ss.fit(df)
df_zscore3 = ss.transform(df)
df_zscore3 = pd.DataFrame(df_zscore3, index=df.index, columns=df.columns)

In [23]:
df_zscore3.describe()

Unnamed: 0,AGE,MONTH,SEX,HEADLEN,HEADWTH,NECK,LENGTH,CHEST,WEIGHT
count,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0
mean,-1.2335810000000002e-17,3.577385e-16,-1.2335810000000002e-17,-2.117648e-16,-1.027984e-16,-2.446603e-16,3.340949e-17,-2.158767e-17,8.223874e-18
std,1.00939,1.00939,1.00939,1.00939,1.00939,1.00939,1.00939,1.00939,1.00939
min,-1.063206,-2.218976,-0.7367884,-1.861366,-1.464722,-1.888888,-2.133381,-1.798537,-1.300168
25%,-0.7938013,-0.2051154,-0.7367884,-0.6843899,-0.7972538,-0.7033623,-0.7656277,-0.7191748,-0.7946495
50%,-0.2849259,0.2983497,-0.7367884,0.02179586,-0.1297855,-0.09941517,0.2012327,-0.1794939,-0.2725565
75%,0.426003,0.8018147,1.357242,0.7279816,0.5376828,0.6163741,0.7082449,0.8458998,0.4069932
max,3.995615,1.30528,1.357242,1.904958,2.540088,1.958479,1.686896,2.087166,2.743981
