# Attribute Normalization, Standardization and Dimensionality Reduction of Data

In [26]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing as prp
from sklearn.decomposition import PCA

Using the Pima Indians Diabetes Database, loaded from a CSV file.

In [3]:
# Read the raw Pima Indians Diabetes table from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer="pima-indians-diabetes.csv")
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,pregs,plas,pres,skin,test,BMI,pedi,Age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# Keep only the feature columns: everything except the 'class' label.
data = df.drop(columns=['class'])
# Preview the first five rows of the feature table.
data.head(n=5)

Unnamed: 0,pregs,plas,pres,skin,test,BMI,pedi,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


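Replacing the outliers of each attribute with that attribute's median. A value is treated as an outlier when it lies below Q1 − 1.5·IQR or above Q3 + 1.5·IQR, where IQR = Q3 − Q1: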

In [21]:
# Snapshot of the feature column names; the replacement loop at the end of this cell iterates over it.
attributesfr = list(data.columns)
def outliers(x, frame=None):
    """Return the values of column ``x`` lying outside the 1.5 * IQR fences.

    Parameters
    ----------
    x : column label
        Column to inspect.
    frame : pandas.DataFrame, optional
        Table to read the column from. When omitted, the notebook-level
        ``data`` frame is used, so existing one-argument calls behave as
        before.

    Returns
    -------
    pandas.Series
        The outlying values with their original index labels; values below
        the lower fence come first, followed by values above the upper fence.
    """
    column = (data if frame is None else frame)[x]
    # Compute each quartile once rather than re-deriving it for every fence.
    q1, q3 = np.percentile(column, [25, 75])
    iqr = q3 - q1
    minimum = q1 - 1.5 * iqr  # lower fence
    maximum = q3 + 1.5 * iqr  # upper fence
    return pd.concat((column[column < minimum], column[column > maximum]))


# Replace every outlier with the column median.
# The column is assigned back onto `data` explicitly: a chained
# `data[i].replace(..., inplace=True)` acts on an intermediate Series and is
# not guaranteed to write through to `data` on recent pandas versions.
for i in attributesfr:
    is_outlier = data.index.isin(outliers(i).index)
    # The median is taken from the column as it stands before replacement.
    # `mask` upcasts integer columns when the median is fractional.
    data[i] = data[i].mask(is_outlier, data[i].median())

Min-max normalization of the outlier-corrected data, scaling each attribute's values to the range 5 to 12:

In [23]:
# Linearly rescale every feature into the interval [5, 12].
min_max_scaler = prp.MinMaxScaler(feature_range=(5, 12))
x_scaled = min_max_scaler.fit_transform(data)
# Take the column labels (and row index) straight from `data` instead of
# renaming positions 0..7, which silently assumed exactly eight features.
minMaxNormalized = pd.DataFrame(x_scaled, columns=data.columns, index=data.index)
print(minMaxNormalized.head())

      pregs       plas     pres      skin      test        BMI      pedi  \
0  8.230769   9.696774  8.50000  8.888889  5.000000   8.522876  9.070975   
1  5.538462   6.851613  7.84375  8.222222  5.000000   6.921569  7.024364   
2  9.307692  11.277419  7.62500  5.000000  5.000000   6.166667  9.404661   
3  5.538462   7.032258  7.84375  7.555556  7.550388   7.264706  5.659958   
4  5.000000   9.200000  5.00000  8.888889  9.558140  10.696078  7.183792   

      Age  
0  10.075  
1   6.750  
2   6.925  
3   5.000  
4   7.100  


Standardizing each selected attribute using the relation $\hat{x}_n = (x_n - \mu)/\sigma$, where $\mu$
is the mean and $\sigma$ is the standard deviation of that attribute:

In [29]:
# Z-score standardization: each column becomes (x - mean) / std.
# Uses the `prp` alias imported at the top of the notebook, matching the
# min-max cell, rather than a second mid-notebook import.
# Note: StandardScaler divides by the population standard deviation (ddof=0);
# a hand-rolled pandas `(x - x.mean()) / x.std()` would default to the sample
# standard deviation (ddof=1) and give slightly different numbers.
X_scaled = prp.StandardScaler().fit_transform(data)
print(X_scaled)

[[ 0.67842689  0.86604475 -0.00994497 ...  0.23770786  0.99892887
   1.78318807]
 [-0.85132017 -1.20506583 -0.56207036 ... -0.87681983 -0.26574929
  -0.10247613]
 [ 1.29032571  2.01666174 -0.74611216 ... -1.40224003  1.2051264
  -0.00323065]
 ...
 [ 0.37247747 -0.02157407 -0.00994497 ... -0.94050713 -0.75145903
  -0.20172162]
 [-0.85132017  0.14279979 -1.11419575 ... -0.31955599 -0.27491363
   1.48545161]
 [-0.85132017 -0.94206766 -0.19398676 ... -0.27179051 -0.43070732
  -0.89644   ]]


# Data Reduction using PCA

Performing PCA manually, via an eigendecomposition of the covariance matrix, without using the library implementation.

In [30]:
# np.cov expects variables in rows, hence the transpose.
features = X_scaled.T
cov_matrix = np.cov(features)

# The covariance matrix is symmetric, so use `eigh`: it guarantees real
# eigenvalues/eigenvectors. Neither `eig` nor `eigh` returns the components
# ordered by decreasing variance, so sort explicitly before picking the
# leading ones.
values, vectors = np.linalg.eigh(cov_matrix)
order = np.argsort(values)[::-1]
values = values[order]
vectors = vectors[:, order]  # column k is the eigenvector for values[k]

# Fraction of the total variance carried by each component (descending).
explained_variances = list(values / np.sum(values))

# Project the standardized data onto the two leading principal axes.
# The sign of an eigenvector is arbitrary, so a component may come out
# negated relative to another PCA implementation.
projected_1 = X_scaled.dot(vectors[:, 0])
projected_2 = X_scaled.dot(vectors[:, 1])
res = pd.DataFrame({"PC1": projected_1, "PC2": projected_2})
print(res)

          PC1       PC2
0    1.864413  0.069728
1   -1.469234  0.294411
2    0.460198  1.490386
3   -1.919455 -0.321461
4   -0.753379 -2.546653
..        ...       ...
763  0.878117 -1.173296
764 -0.377143 -0.269287
765 -0.272631  0.050026
766 -0.354947  1.399579
767 -1.434987 -0.131366

[768 rows x 2 columns]


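PCA using scikit-learn's `PCA` class. The sign of each principal component is arbitrary, so a component may appear negated relative to the manual computation (as PC2 does in the outputs shown here).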

In [32]:
# Reduce the standardized data to its first two principal components.
pca = PCA(n_components=2)
# `fit_transform` fits the model and projects in one step, so a separate
# `pca.fit(X_scaled)` beforehand would only repeat the same fit.
prComp = pca.fit_transform(X_scaled)
prDf = pd.DataFrame(data=prComp, columns=['PC1', 'PC2'])
print(prDf)

          PC1       PC2
0    1.864413 -0.069728
1   -1.469234 -0.294411
2    0.460198 -1.490386
3   -1.919455  0.321461
4   -0.753379  2.546653
..        ...       ...
763  0.878117  1.173296
764 -0.377143  0.269287
765 -0.272631 -0.050026
766 -0.354947 -1.399579
767 -1.434987  0.131366

[768 rows x 2 columns]
