## MinMaxScaler

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.preprocessing import MinMaxScaler 

In [3]:
# Column labels for the diabetes data; read_csv assigns them through names=.
names = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]
dataframe = pd.read_csv("pima-indians-diabetes.csv", names=names)
# Quick look at the first rows to confirm the columns lined up.
dataframe.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Size of the loaded frame as a (rows, columns) tuple.
dataframe.shape

(768, 9)

In [5]:
# Pull the frame's contents out as a plain NumPy array and display it.
array = dataframe.to_numpy()
array

array([[  6.   , 148.   ,  72.   , ...,   0.627,  50.   ,   1.   ],
       [  1.   ,  85.   ,  66.   , ...,   0.351,  31.   ,   0.   ],
       [  8.   , 183.   ,  64.   , ...,   0.672,  32.   ,   1.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,   0.245,  30.   ,   0.   ],
       [  1.   , 126.   ,  60.   , ...,   0.349,  47.   ,   1.   ],
       [  1.   ,  93.   ,  70.   , ...,   0.315,  23.   ,   0.   ]])

In [6]:
# Split by column: the first eight columns are the inputs (X), the ninth
# column is the output (Y).
X, Y = array[:, :8], array[:, 8]
X

array([[  6.   , 148.   ,  72.   , ...,  33.6  ,   0.627,  50.   ],
       [  1.   ,  85.   ,  66.   , ...,  26.6  ,   0.351,  31.   ],
       [  8.   , 183.   ,  64.   , ...,  23.3  ,   0.672,  32.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,  26.2  ,   0.245,  30.   ],
       [  1.   , 126.   ,  60.   , ...,  30.1  ,   0.349,  47.   ],
       [  1.   ,  93.   ,  70.   , ...,  30.4  ,   0.315,  23.   ]])

In [7]:
# Fit the scaler on X first, then map every feature into the range [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1)).fit(X)
rescaledX = scaler.transform(X)

In [8]:
# Show the min-max scaled inputs.
rescaledX

array([[0.35294118, 0.74371859, 0.59016393, ..., 0.50074516, 0.23441503,
        0.48333333],
       [0.05882353, 0.42713568, 0.54098361, ..., 0.39642325, 0.11656704,
        0.16666667],
       [0.47058824, 0.91959799, 0.52459016, ..., 0.34724292, 0.25362938,
        0.18333333],
       ...,
       [0.29411765, 0.6080402 , 0.59016393, ..., 0.390462  , 0.07130658,
        0.15      ],
       [0.05882353, 0.63316583, 0.49180328, ..., 0.4485842 , 0.11571307,
        0.43333333],
       [0.05882353, 0.46733668, 0.57377049, ..., 0.45305514, 0.10119556,
        0.03333333]])

In [9]:
# Summarize the transformed data by previewing its first five rows.
# set_printoptions controls how NumPy displays floats, arrays and other
# NumPy objects; here floats are shown with a precision of 6 digits.
np.set_printoptions(precision=6)
rescaledX[:5]

array([[0.352941, 0.743719, 0.590164, 0.353535, 0.      , 0.500745,
        0.234415, 0.483333],
       [0.058824, 0.427136, 0.540984, 0.292929, 0.      , 0.396423,
        0.116567, 0.166667],
       [0.470588, 0.919598, 0.52459 , 0.      , 0.      , 0.347243,
        0.253629, 0.183333],
       [0.058824, 0.447236, 0.540984, 0.232323, 0.111111, 0.418778,
        0.038002, 0.      ],
       [0.      , 0.688442, 0.327869, 0.353535, 0.198582, 0.642325,
        0.943638, 0.2     ]])

## Binarizer

In [10]:
# Binarization section: import the transformer and reload the diabetes data.
from sklearn.preprocessing import Binarizer
names = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]
dataframe = pd.read_csv("pima-indians-diabetes.csv", names=names)
# Keep a plain NumPy copy of the values for slicing below.
array = dataframe.to_numpy()

In [11]:
# First rows of the reloaded frame.
dataframe.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [12]:
# The frame's values as a NumPy array (inputs and 'class' column together).
array

array([[  6.   , 148.   ,  72.   , ...,   0.627,  50.   ,   1.   ],
       [  1.   ,  85.   ,  66.   , ...,   0.351,  31.   ,   0.   ],
       [  8.   , 183.   ,  64.   , ...,   0.672,  32.   ,   1.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,   0.245,  30.   ,   0.   ],
       [  1.   , 126.   ,  60.   , ...,   0.349,  47.   ,   1.   ],
       [  1.   ,  93.   ,  70.   , ...,   0.315,  23.   ,   0.   ]])

In [13]:
# Separate the array into input (X) and output (Y) components.
X = array[:, 0:8]
Y = array[:, 8]

# Binarize the inputs (set feature values to 0 or 1) according to a threshold:
# values greater than the threshold become 1, all other values become 0.
# NOTE(review): the same threshold (7.0) is applied to every column even
# though the features are on very different scales -- confirm this is intended.
binarizer = Binarizer(threshold=7.0).fit(X)
binaryX = binarizer.transform(X)

In [14]:
# Show the binarized inputs.
binaryX

array([[0., 1., 1., ..., 1., 0., 1.],
       [0., 1., 1., ..., 1., 0., 1.],
       [1., 1., 1., ..., 1., 0., 1.],
       ...,
       [0., 1., 1., ..., 1., 0., 1.],
       [0., 1., 1., ..., 1., 0., 1.],
       [0., 1., 1., ..., 1., 0., 1.]])

In [15]:
# Summarize the transformed data: first five binarized rows, with floats
# displayed at a precision of 3 digits.
np.set_printoptions(precision=3)
binaryX[:5]

array([[0., 1., 1., 1., 0., 1., 0., 1.],
       [0., 1., 1., 1., 0., 1., 0., 1.],
       [1., 1., 1., 0., 0., 1., 0., 1.],
       [0., 1., 1., 1., 1., 1., 0., 1.],
       [0., 1., 1., 1., 1., 1., 0., 1.]])

## StandardScaler

In [16]:
# Standardization section (0 mean, 1 stdev): import the transformer and
# reload the diabetes data.
from sklearn.preprocessing import StandardScaler
names = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]
dataframe = pd.read_csv("pima-indians-diabetes.csv", names=names)
# Keep a plain NumPy copy of the values for slicing below.
array = dataframe.to_numpy()

In [17]:
# Split by column: the first eight columns are the inputs (X), the ninth
# column is the output (Y).
X, Y = array[:, :8], array[:, 8]

# Standardize the features by removing the mean and scaling to unit variance.
scaler = StandardScaler()
rescaledX = scaler.fit_transform(X)

In [18]:
# Show the standardized inputs.
rescaledX

array([[ 0.64 ,  0.848,  0.15 , ...,  0.204,  0.468,  1.426],
       [-0.845, -1.123, -0.161, ..., -0.684, -0.365, -0.191],
       [ 1.234,  1.944, -0.264, ..., -1.103,  0.604, -0.106],
       ...,
       [ 0.343,  0.003,  0.15 , ..., -0.735, -0.685, -0.276],
       [-0.845,  0.16 , -0.471, ..., -0.24 , -0.371,  1.171],
       [-0.845, -0.873,  0.046, ..., -0.202, -0.474, -0.871]])

In [19]:
# Summarize the transformed data: first five standardized rows, with floats
# displayed at a precision of 3 digits.
np.set_printoptions(precision=3)
rescaledX[:5]

array([[ 0.64 ,  0.848,  0.15 ,  0.907, -0.693,  0.204,  0.468,  1.426],
       [-0.845, -1.123, -0.161,  0.531, -0.693, -0.684, -0.365, -0.191],
       [ 1.234,  1.944, -0.264, -1.288, -0.693, -1.103,  0.604, -0.106],
       [-0.845, -0.998, -0.161,  0.155,  0.123, -0.494, -0.921, -1.042],
       [-1.142,  0.504, -1.505,  0.907,  0.766,  1.41 ,  5.485, -0.02 ]])

# Libraries

In [20]:
import numpy as np # used for handling numbers
import pandas as pd # used for handling the dataset
from sklearn.impute import SimpleImputer # used for handling missing data

# If this raises an error, delete the `from sklearn.impute import SimpleImputer` line above and run the commented-out import below instead.

#from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder # used for encoding categorical data

In [22]:
# Import the dataset into a variable, then split its attributes column-wise:
# every column except the last is an independent attribute, and the last
# column is the dependent variable / class.
dataset = pd.read_csv('DataPreprocessing.csv')
X = dataset.iloc[:, :-1].to_numpy()  # attributes used to determine the class
Y = dataset.iloc[:, -1].to_numpy()   # dependent variable / class

In [23]:
# First rows of the loaded dataset.
dataset.head()

Unnamed: 0,Region,Age,Income,Online Shopper
0,India,49.0,86400.0,No
1,Brazil,32.0,57600.0,Yes
2,USA,35.0,64800.0,No
3,Brazil,43.0,73200.0,No
4,USA,45.0,,Yes


In [24]:
# Independent attributes before any missing-value handling.
X

array([['India', 49.0, 86400.0],
       ['Brazil', 32.0, 57600.0],
       ['USA', 35.0, 64800.0],
       ['Brazil', 43.0, 73200.0],
       ['USA', 45.0, nan],
       ['India', 40.0, 69600.0],
       ['Brazil', nan, 62400.0],
       ['India', 53.0, 94800.0],
       ['USA', 55.0, 99600.0],
       ['India', 42.0, 80400.0]], dtype=object)

## Simple Imputer

In [25]:
# Handle the missing data: entries equal to NumPy's nan are replaced with the
# mean of the other values in the same column.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# If the line above raises an error, delete it and run the commented-out
# code below instead.
#imputer = Imputer(missing_values=np.nan, strategy='mean')

# Fit on every column after the first and write the imputed values back
# into X in place.
X[:, 1:] = imputer.fit_transform(X[:, 1:])

In [26]:
# Independent attributes after imputation.
X

array([['India', 49.0, 86400.0],
       ['Brazil', 32.0, 57600.0],
       ['USA', 35.0, 64800.0],
       ['Brazil', 43.0, 73200.0],
       ['USA', 45.0, 76533.33333333333],
       ['India', 40.0, 69600.0],
       ['Brazil', 43.77777777777778, 62400.0],
       ['India', 53.0, 94800.0],
       ['USA', 55.0, 99600.0],
       ['India', 42.0, 80400.0]], dtype=object)

## Label encoder

In [27]:
# Replace the values in column 0 of X with integer codes between 0 and
# n_classes-1.
# NOTE(review): scikit-learn describes LabelEncoder as an encoder for target
# labels; for an input column OrdinalEncoder/OneHotEncoder may be the better
# fit -- confirm against the library documentation.
labelencoder_X = LabelEncoder().fit(X[:, 0])
X[:, 0] = labelencoder_X.transform(X[:, 0])
X

array([[1, 49.0, 86400.0],
       [0, 32.0, 57600.0],
       [2, 35.0, 64800.0],
       [0, 43.0, 73200.0],
       [2, 45.0, 76533.33333333333],
       [1, 40.0, 69600.0],
       [0, 43.77777777777778, 62400.0],
       [1, 53.0, 94800.0],
       [2, 55.0, 99600.0],
       [1, 42.0, 80400.0]], dtype=object)