# Data Standardization and Encoding

## Import Packages

In [1]:
import numpy as np
import pandas as pd
import sklearn.datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


## Import datasets

In [2]:
# Load the breast-cancer dataset bundled with scikit-learn. Later cells read
# its `data`, `target` and `feature_names` attributes.
dataset = sklearn.datasets.load_breast_cancer()
dataset

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
         1.189e-01],
        [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
         8.902e-02],
        [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
         8.758e-02],
        ...,
        [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
         7.820e-02],
        [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
         1.240e-01],
        [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
         7.039e-02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0

## Load data into a pandas DataFrame

In [3]:
# Wrap the raw feature array in a DataFrame, labelling each column with its feature name.
df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)

In [4]:
# Preview the first five rows of the feature frame.
df.head(n=5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [5]:
# (number of rows, number of columns) of the feature frame.
df.shape

(569, 30)

## We store the features and the target in separate variables (X and Y) so that we have features vs. target

In [6]:
# X holds the feature columns; Y holds the class labels taken from the dataset object.
X, Y = df, dataset.target

### We split the data into training and test sets before standardization

In [7]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# identical from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [8]:
# Shapes of the full feature set, the training split and the test split, in that order.
print(*(frame.shape for frame in (X, X_train, X_test)))

(569, 30) (455, 30) (114, 30)


### We can carry out standardization

In [9]:
# Standard deviation taken over every value of the raw (unscaled) feature array.
print(np.std(dataset.data))

228.29740508276657


In [10]:
# Fit the scaler on the training split only, so the scaling parameters are
# learned without looking at the test split.
scaler = StandardScaler()
scaler.fit(X_train)

In [11]:
# Apply the fitted scaling to the training features.
X_train_standardized = scaler.transform(X_train)

In [12]:
# Standard deviation over every value of the standardized training array.
np.std(X_train_standardized)

1.0

In [13]:
# Reuse the scaler that was fitted on the training split; it is not refitted on the test data.
X_test_standardized = scaler.transform(X_test)

# Data Encoding 

In [19]:
from sklearn.preprocessing import LabelEncoder

In [16]:
# Use forward slashes for every separator: inside a normal string literal a
# backslash followed by a letter (such as "\H") is parsed as an escape sequence,
# and an unrecognised one triggers a warning on current Python versions.
# NOTE: this is an absolute, machine-specific location; point it at wherever
# the CSV file lives on your system.
data = pd.read_csv("C:/Users/HP/Desktop/DataScience/Data/Historical Presidents Physical Data (More).csv")
data.head()

Unnamed: 0,order,name,height_cm,height_in,weight_kg,weight_lb,body_mass_index,body_mass_index_range,birth_day,birth_month,...,term_begin_year,term_begin_date,term_end_day,term_end_month,term_end_year,term_end_date,presidency_begin_age,presidency_end_age,political_party,corrected_iq
0,1,George Washington,188,74.0,79.4,175,22.5,Normal,22,2,...,1789,30-04-1789,4.0,3.0,1797.0,04-03-1797,57,65.0,Unaffiliated,140.0
1,2,John Adams,170,67.0,83.9,185,29.0,Overweight,30,10,...,1797,04-03-1797,4.0,3.0,1801.0,04-03-1801,61,65.0,Federalist,155.0
2,3,Thomas Jefferson,189,74.5,82.1,181,23.0,Normal,13,4,...,1801,04-03-1801,4.0,3.0,1809.0,04-03-1809,57,65.0,Democratic-Republican,160.0
3,4,James Madison,163,64.0,55.3,122,20.8,Normal,16,3,...,1809,04-03-1809,4.0,3.0,1817.0,04-03-1817,57,65.0,Democratic-Republican,160.0
4,5,James Monroe,183,72.0,85.7,189,25.6,Overweight,28,4,...,1817,04-03-1817,4.0,3.0,1825.0,04-03-1825,58,66.0,Democratic-Republican,139.0


### Finding the count of different labels in the categories

In [17]:
# Number of rows that fall into each body-mass-index category.
data["body_mass_index_range"].value_counts()

body_mass_index_range
Normal            22
Overweight        17
Obese              4
Severely Obese     1
Morbidly Obese     1
Name: count, dtype: int64

### Label Encoder Function

In [20]:
# Turn each category string into an integer code. LabelEncoder numbers the
# classes in sorted order of their names, so the codes do not follow the
# severity of the BMI categories.
label_encode = LabelEncoder()

labels = label_encode.fit_transform(data["body_mass_index_range"])


### Append the labels to the data frame 

In [21]:
# Store the integer codes next to the original category column, then preview the result.
data["new_target"] = labels
data.head()

Unnamed: 0,order,name,height_cm,height_in,weight_kg,weight_lb,body_mass_index,body_mass_index_range,birth_day,birth_month,...,term_begin_date,term_end_day,term_end_month,term_end_year,term_end_date,presidency_begin_age,presidency_end_age,political_party,corrected_iq,new_target
0,1,George Washington,188,74.0,79.4,175,22.5,Normal,22,2,...,30-04-1789,4.0,3.0,1797.0,04-03-1797,57,65.0,Unaffiliated,140.0,1
1,2,John Adams,170,67.0,83.9,185,29.0,Overweight,30,10,...,04-03-1797,4.0,3.0,1801.0,04-03-1801,61,65.0,Federalist,155.0,3
2,3,Thomas Jefferson,189,74.5,82.1,181,23.0,Normal,13,4,...,04-03-1801,4.0,3.0,1809.0,04-03-1809,57,65.0,Democratic-Republican,160.0,1
3,4,James Madison,163,64.0,55.3,122,20.8,Normal,16,3,...,04-03-1809,4.0,3.0,1817.0,04-03-1817,57,65.0,Democratic-Republican,160.0,1
4,5,James Monroe,183,72.0,85.7,189,25.6,Overweight,28,4,...,04-03-1817,4.0,3.0,1825.0,04-03-1825,58,66.0,Democratic-Republican,139.0,3


In [23]:
# Number of rows per encoded label; these should mirror the category counts.
data["new_target"].value_counts()

new_target
1    22
3    17
2     4
4     1
0     1
Name: count, dtype: int64

That's the clear-cut way of carrying out data standardization and encoding in Python.