In [1]:
# import essential libraries
import numpy as np
import pandas as pd
from sklearn import preprocessing

In [4]:
# load dataset
data = pd.read_csv("yeast.csv")
data.head()

Unnamed: 0,mcg,gvh,alm,mit,erl,pox,vac,nuc,class_protein_localization
0,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22,MIT
1,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,MIT
2,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,MIT
3,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22,NUC
4,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22,MIT


In [6]:
# data transform in case of wrong format
data = data.ix[:, 0:]
data.head()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


Unnamed: 0,mcg,gvh,alm,mit,erl,pox,vac,nuc,class_protein_localization
0,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22,MIT
1,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,MIT
2,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,MIT
3,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22,NUC
4,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22,MIT


In [7]:
# check dataset information
data.shape

(1484, 9)

In [17]:
# features and labels extraction
features = data.ix[:, :-1]
labels = data.ix[:, -1]
print(features.head())
print(labels.head())

    mcg   gvh   alm   mit  erl  pox   vac   nuc
0  0.58  0.61  0.47  0.13  0.5  0.0  0.48  0.22
1  0.43  0.67  0.48  0.27  0.5  0.0  0.53  0.22
2  0.64  0.62  0.49  0.15  0.5  0.0  0.53  0.22
3  0.58  0.44  0.57  0.13  0.5  0.0  0.54  0.22
4  0.42  0.44  0.48  0.54  0.5  0.0  0.48  0.22
0    MIT
1    MIT
2    MIT
3    NUC
4    MIT
Name: class_protein_localization, dtype: object


In [18]:
# feature preprocessing using scale
features_scaled_via_scale = preprocessing.scale(features)
features_scaled_via_scale = pd.DataFrame(features_scaled_via_scale)
print(features_scaled_via_scale.head())
print(features_scaled_via_scale.mean(axis=0))
print(features_scaled_via_scale.std(axis=0))

          0         1         2         3        4         5         6  \
0  0.581981  0.888481 -0.346645 -0.957203 -0.09759 -0.099131 -0.344175   
1 -0.510891  1.372811 -0.231226  0.064312 -0.09759 -0.099131  0.521219   
2  1.019130  0.969203 -0.115808 -0.811272 -0.09759 -0.099131  0.521219   
3  0.581981 -0.483786  0.807542 -0.957203 -0.09759 -0.099131  0.694298   
4 -0.583749 -0.483786 -0.231226  2.034375 -0.09759 -0.099131 -0.344175   

          7  
0 -0.527919  
1 -0.527919  
2 -0.527919  
3 -0.527919  
4 -0.527919  
0    2.681293e-16
1    5.910217e-17
2    1.043640e-17
3    3.418948e-17
4   -1.415366e-15
5    4.263399e-16
6    1.263141e-15
7   -2.269823e-16
dtype: float64
0    1.000337
1    1.000337
2    1.000337
3    1.000337
4    1.000337
5    1.000337
6    1.000337
7    1.000337
dtype: float64


In [19]:
# feature preprocessing using minmaxscaler
min_max_scaler =  preprocessing.MinMaxScaler()
features_scaled_via_minmaxscaler = min_max_scaler.fit_transform(features)
features_scaled_via_minmaxscaler = pd.DataFrame(features_scaled_via_minmaxscaler)
print(features_scaled_via_minmaxscaler.head())
print(features_scaled_via_minmaxscaler.mean(axis=0))
print(features_scaled_via_minmaxscaler.std(axis=0))

          0         1         2     3    4    5         6     7
0  0.528090  0.551724  0.329114  0.13  0.0  0.0  0.657534  0.22
1  0.359551  0.620690  0.341772  0.27  0.0  0.0  0.726027  0.22
2  0.595506  0.563218  0.354430  0.15  0.0  0.0  0.726027  0.22
3  0.528090  0.356322  0.455696  0.13  0.0  0.0  0.739726  0.22
4  0.348315  0.356322  0.341772  0.54  0.0  0.0  0.657534  0.22
0    0.438339
1    0.425210
2    0.367131
3    0.261186
4    0.009434
5    0.009036
6    0.684775
7    0.276199
dtype: float64
0    0.154269
1    0.142442
2    0.109709
3    0.137098
4    0.096702
5    0.091184
6    0.079173
7    0.106491
dtype: float64


In [20]:
# feature preprocessing using maxabsscaler
max_abs_scaler =  preprocessing.MaxAbsScaler()
features_scaled_via_maxabsscaler = max_abs_scaler.fit_transform(features)
features_scaled_via_maxabsscaler = pd.DataFrame(features_scaled_via_maxabsscaler)
print(features_scaled_via_maxabsscaler.head())
print(features_scaled_via_maxabsscaler.mean(axis=0))
print(features_scaled_via_maxabsscaler.std(axis=0))

      0     1     2     3    4    5         6     7
0  0.58  0.61  0.47  0.13  0.5  0.0  0.657534  0.22
1  0.43  0.67  0.48  0.27  0.5  0.0  0.726027  0.22
2  0.64  0.62  0.49  0.15  0.5  0.0  0.726027  0.22
3  0.58  0.44  0.57  0.13  0.5  0.0  0.739726  0.22
4  0.42  0.44  0.48  0.54  0.5  0.0  0.657534  0.22
0    0.500121
1    0.499933
2    0.500034
3    0.261186
4    0.504717
5    0.009036
6    0.684775
7    0.276199
dtype: float64
0    0.137299
1    0.123924
2    0.086670
3    0.137098
4    0.048351
5    0.091184
6    0.079173
7    0.106491
dtype: float64


In [26]:
# labels preprocessing using numerical labels
unique_labels = labels.unique()
n_unique_labels = len(unique_labels)
labels.replace(unique_labels, np.arange(n_unique_labels), inplace=True)
print(labels.head())

0    0
1    0
2    0
3    1
4    0
Name: class_protein_localization, dtype: int64


In [30]:
# data concat
features['labels'] = labels
features.head()

Unnamed: 0,mcg,gvh,alm,mit,erl,pox,vac,nuc,labels
0,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22,0
1,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,0
2,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,0
3,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22,1
4,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22,0


In [31]:
# save as csv file
features.to_csv("yeast_scaled.csv")