In [1]:
#Data Preparation for Machine Learning

from numpy import isnan
from pandas import read_csv
from sklearn.impute import SimpleImputer

In [2]:
# Location of the raw horse-colic CSV that the following cell reads with read_csv.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/horse-colic.csv'

In [3]:
# Read the CSV with no header row; '?' entries are parsed as missing values (NaN).
df = read_csv(
    url,
    header=None,
    na_values='?',
)
# Preview the first rows to confirm the file parsed as expected.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,2.0,1,530101,38.5,66.0,28.0,3.0,3.0,,2.0,...,45.0,8.4,,,2.0,2,11300,0,0,2
1,1.0,1,534817,39.2,88.0,20.0,,,4.0,1.0,...,50.0,85.0,2.0,2.0,3.0,2,2208,0,0,2
2,2.0,1,530334,38.3,40.0,24.0,1.0,1.0,3.0,1.0,...,33.0,6.7,,,1.0,2,0,0,0,1
3,1.0,9,5290409,39.1,164.0,84.0,4.0,1.0,6.0,2.0,...,48.0,7.2,3.0,5.3,2.0,1,2208,0,0,1
4,2.0,1,530255,37.3,104.0,35.0,,,6.0,2.0,...,74.0,7.4,,,2.0,2,4300,0,0,2


In [4]:
#Split into input and output elements
data = df.values
# Inputs are every column except the last; the output must be a single column,
# not a second copy of the input matrix.
# NOTE(review): the last column is assumed to be the target, matching the split
# used for the breast-cancer data in this notebook -- confirm against the
# dataset description.
X, y = data[:, :-1], data[:, -1]

#Print missing values
# Count NaN entries across the whole input matrix.
print('Missing :%d' % isnan(X).sum())

Missing :1605


In [5]:
# Imputer configured to replace missing values using the 'mean' strategy.
imputer=SimpleImputer(strategy='mean')
# Bare expression so the notebook displays the estimator's repr.
imputer

SimpleImputer()

In [6]:
# Fit the imputer on the input matrix; the fitted estimator is echoed as the cell output.
imputer.fit(X)

SimpleImputer()

In [7]:
# Apply the fitted imputer to X, producing a version with the missing entries filled in.
Xtrans = imputer.transform(X)

In [8]:
# Re-count NaN entries after imputation.
print('Missing: %d' % isnan(Xtrans).sum())

Missing: 0


In [9]:
#Select features with RFE

from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

In [10]:
#define dataset
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    n_redundant=5,
    random_state=1,
)

#define RFE
# Seed the tree as well as the dataset: DecisionTreeClassifier randomly permutes
# features at each split, so without a fixed random_state the selected columns
# and their ranks can differ from one run to the next.
rfe = RFE(estimator=DecisionTreeClassifier(random_state=1), n_features_to_select=5)
rfe.fit(X, y)

# Report, for every input column, whether RFE kept it and the rank it was given.
for i, (selected, rank) in enumerate(zip(rfe.support_, rfe.ranking_)):
    print('Column: %d, Selected=%s, Rank: %d' % (i, selected, rank))

Column: 0, Selected=False, Rank: 4
Column: 1, Selected=False, Rank: 6
Column: 2, Selected=True, Rank: 1
Column: 3, Selected=True, Rank: 1
Column: 4, Selected=True, Rank: 1
Column: 5, Selected=False, Rank: 5
Column: 6, Selected=True, Rank: 1
Column: 7, Selected=False, Rank: 3
Column: 8, Selected=True, Rank: 1
Column: 9, Selected=False, Rank: 2


In [11]:
#Scale Data with Normalization 

from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler

# Synthetic classification data: 1000 samples, 10 features (5 informative,
# 5 redundant), generated with a fixed random_state.
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    n_redundant=5,
    random_state=1,
)
# Show the first three rows before scaling.
print(X[:3])



[[ 2.56999479 -0.13019997  3.16075093 -4.35936352 -1.61271951 -1.39352057
  -2.48924933 -1.93094078  3.26130366  2.05692145]
 [ 0.34129317  2.51321418 -0.80416572  1.29196568  2.05773105 -3.11098284
   1.46582984  6.24734437 -1.92769365  2.9503149 ]
 [ 2.27539972  3.36561455  0.17164362  1.24862039  0.30249838 -1.1378142
  -1.60819862  2.74693781  0.13492444  2.00339547]]


In [12]:
# Min-max scaler with default settings; fitted and applied in the next cell.
trans = MinMaxScaler()

In [13]:
# Fit the scaler on X and apply it in a single step.
X_norm = trans.fit_transform(X)
# Show the first three scaled rows.
print(X_norm[:3])

[[0.67947109 0.58831825 0.80491637 0.28325228 0.41271351 0.39305349
  0.24687884 0.3554241  0.88935692 0.60189212]
 [0.50839186 0.85250861 0.34224488 0.7690801  0.78427257 0.20168622
  0.63991456 0.83102897 0.21224905 0.6922211 ]
 [0.65685743 0.93769993 0.45611339 0.76535384 0.60659068 0.42154543
  0.33443319 0.62746424 0.48139834 0.59648023]]


In [16]:
#Transform Categories with one-hot encoding
#Many machine learning algorithms require all input and output variables to be numeric

from pandas import read_csv
from sklearn.preprocessing import OneHotEncoder

# Location of the raw breast-cancer CSV; it is read without a header row.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/breast-cancer.csv"
dataset = read_csv(
    url,
    header=None,
)

In [19]:
#retrieve the array of data
data = dataset.values
# Every column except the last goes into X; the last column goes into Y.
# Both are cast to strings.
X = data[:, :-1].astype(str)
Y = data[:, -1].astype(str)
# Show the first three input rows.
print(X[:3])

[["'40-49'" "'premeno'" "'15-19'" "'0-2'" "'yes'" "'3'" "'right'"
  "'left_up'" "'no'"]
 ["'50-59'" "'ge40'" "'15-19'" "'0-2'" "'no'" "'1'" "'right'" "'central'"
  "'no'"]
 ["'50-59'" "'ge40'" "'35-39'" "'0-2'" "'no'" "'2'" "'left'" "'left_low'"
  "'no'"]]


In [24]:
#Apply OneHotEncoder Transformation

# OneHotEncoder's `sparse` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and later removed, so passing sparse=False fails on current
# releases. Keeping the default (sparse) output and densifying it explicitly
# yields the same dense array on old and new versions alike.
encoder = OneHotEncoder()
X_oe = encoder.fit_transform(X).toarray()

#Summarize the transformed data
print(X_oe[:3,:])

[[0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0.]]


In [26]:
#Transform Numbers to Categories With kBins

from sklearn.datasets import make_classification
from sklearn.preprocessing import KBinsDiscretizer



# Synthetic classification data: 1000 samples, 10 features (5 informative,
# 5 redundant), generated with a fixed random_state.
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    n_redundant=5,
    random_state=1,
)
# Show the first three rows before discretization.
print(X[:3])

[[ 2.56999479 -0.13019997  3.16075093 -4.35936352 -1.61271951 -1.39352057
  -2.48924933 -1.93094078  3.26130366  2.05692145]
 [ 0.34129317  2.51321418 -0.80416572  1.29196568  2.05773105 -3.11098284
   1.46582984  6.24734437 -1.92769365  2.9503149 ]
 [ 2.27539972  3.36561455  0.17164362  1.24862039  0.30249838 -1.1378142
  -1.60819862  2.74693781  0.13492444  2.00339547]]


In [33]:
# Discretize every feature into 10 bins using the 'uniform' strategy, encoding
# each value as an ordinal bin index.
trans = KBinsDiscretizer(
    n_bins=10,
    encode='ordinal',
    strategy='uniform',
)
X_discrete = trans.fit_transform(X)
# Show the first three discretized rows.
print(X_discrete[:3])

[[6. 5. 8. 2. 4. 3. 2. 3. 8. 6.]
 [5. 8. 3. 7. 7. 2. 6. 8. 2. 6.]
 [6. 9. 4. 7. 6. 4. 3. 6. 4. 5.]]


In [36]:
#Dimensionality Reduction with PCA(Principal Component Analysis)
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA


# Synthetic classification data: 1000 samples, 10 features (5 informative,
# 5 redundant), generated with a fixed random_state.
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    n_redundant=5,
    random_state=1,
)
# Show the first three rows before dimensionality reduction.
print(X[:3])

[[ 2.56999479 -0.13019997  3.16075093 -4.35936352 -1.61271951 -1.39352057
  -2.48924933 -1.93094078  3.26130366  2.05692145]
 [ 0.34129317  2.51321418 -0.80416572  1.29196568  2.05773105 -3.11098284
   1.46582984  6.24734437 -1.92769365  2.9503149 ]
 [ 2.27539972  3.36561455  0.17164362  1.24862039  0.30249838 -1.1378142
  -1.60819862  2.74693781  0.13492444  2.00339547]]


In [44]:
# Reduce the 10 input features to 3 principal components.
trans = PCA(n_components=3)
X_dim = trans.fit_transform(X)
# Show the first three projected rows.
print(X_dim[:3])

[[ 4.77765652  2.99691037  4.62336298]
 [-5.14824625 -4.93141748  0.28896085]
 [-1.60657696 -3.07170359  1.9735171 ]]
