In [60]:
import pandas as pd

In [61]:
# Parallel lists: ids[i] belongs to countries[i].
# Both lists must have the same length, because zip() silently truncates to
# the shorter one and any surplus ids would be dropped without warning.
ids = [11, 22, 33, 44, 55]
countries = ['Spain', 'France', 'Spain', 'Germany', 'France']

In [62]:
# Pair each id with its country row by row, then label the two columns.
df = pd.DataFrame(
    data=[*zip(ids, countries)],
    columns=['Ids', 'Countries'],
)

In [63]:
# Show the Ids/Countries frame as plain text.
print(df)

   Ids Countries
0   11     Spain
1   22    France
2   33     Spain
3   44   Germany
4   55    France


In [64]:
# One-hot encode the country labels: one indicator column per distinct
# country, named "Country_<label>".
# dtype=int pins the indicators to 0/1 integers; pandas >= 2.0 would otherwise
# return booleans (True/False) by default.
dfOneHotEncoding = pd.get_dummies(df['Countries'], prefix='Country', dtype=int)
print(dfOneHotEncoding.head())

   Country_France  Country_Germany  Country_Spain
0               0                0              1
1               1                0              0
2               0                0              1
3               0                1              0
4               1                0              0


In [65]:
# Append the indicator columns to the frame, side by side (axis=1).
# Indicator columns that are already present are dropped first, so re-running
# this cell does not append a second copy of them.
df = pd.concat(
    [df.drop(columns=dfOneHotEncoding.columns, errors='ignore'), dfOneHotEncoding],
    axis=1,
)
df

Unnamed: 0,Ids,Countries,Country_France,Country_Germany,Country_Spain
0,11,Spain,0,0,1
1,22,France,1,0,0
2,33,Spain,0,0,1
3,44,Germany,0,1,0
4,55,France,1,0,0


In [66]:
# Turn the country labels into indicator vectors with scikit-learn.
# With the three distinct countries here, each row becomes a 3-element 0/1 vector.
from sklearn.preprocessing import LabelBinarizer
y = LabelBinarizer().fit(df['Countries']).transform(df['Countries'])
y

array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [1, 0, 0]])

In [67]:
# One-hot encode the *entire* frame.
# Every column is treated as categorical, so Ids (5 distinct values),
# Countries (3) and each of the three 0/1 indicator columns (2 each) all get
# expanded: 5 + 3 + 2 + 2 + 2 = 14 output columns.
from sklearn.preprocessing import OneHotEncoder
encoded_all = OneHotEncoder().fit_transform(df)
y = encoded_all.toarray()
print(y)

[[1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 1.]
 [0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 1. 0.]
 [0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 1. 1. 0.]
 [0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 1. 0. 1. 0.]]


In [68]:
# One-hot encode only the Countries column.
# The double brackets keep a 2-D, single-column frame for the encoder.
from sklearn.preprocessing import OneHotEncoder
country_column = df[['Countries']]
y = OneHotEncoder().fit_transform(country_column).toarray()
print(y)

[[0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]]


In [69]:
# Small numeric sample used by the scaling examples below.
# This rebinds `df`, replacing the Ids/Countries frame from the encoding section.
# NOTE(review): heights of 130-205 look like centimetres rather than inches;
# confirm the units and the column name.
df = pd.DataFrame(
    {
        'Age': [10, 35, 34, 23, 70, 55, 89],
        'Height_inch': [130, 178, 155, 133, 195, 150, 205],
        'Weight_lb': [80, 200, 220, 150, 140, 95, 180],
    }
)

In [70]:
# Show the raw Age/Height/Weight values before any scaling is applied.
print(df)

   Age  Height_inch  Weight_lb
0   10          130         80
1   35          178        200
2   34          155        220
3   23          133        150
4   70          195        140
5   55          150         95
6   89          205        180


In [71]:
# Maximum absolute scaling
def absolute_maximum_scale(series):
    """Scale values by the largest absolute value in ``series``.

    Each value is divided by ``max(|series|)``, so the result lies in
    [-1, 1] and the entry with the largest magnitude becomes +/-1.

    Args:
        series: numeric pandas Series (one column of data).

    Returns:
        A new Series with the scaled values; the input is not modified.
        If every value is zero the division is 0 / 0 and the result is NaN.
    """
    peak = series.abs().max()
    return series.div(peak)

In [72]:
# Overwrite every column of df with its max-abs-scaled version (in place).
for column_name in df.columns:
    scaled_column = absolute_maximum_scale(df[column_name])
    df[column_name] = scaled_column

In [73]:
# df was overwritten column by column above, so this prints the max-abs-scaled values.
print(df)

        Age  Height_inch  Weight_lb
0  0.112360     0.634146   0.363636
1  0.393258     0.868293   0.909091
2  0.382022     0.756098   1.000000
3  0.258427     0.648780   0.681818
4  0.786517     0.951220   0.636364
5  0.617978     0.731707   0.431818
6  1.000000     1.000000   0.818182


In [74]:
# Maximum absolute scaling with scikit-learn's MaxAbsScaler.
# NOTE: df was already max-abs scaled in place by the manual loop earlier in
# the notebook, so every column's largest absolute value is already 1 and this
# transform returns the values unchanged. Run it on the raw data for a real
# comparison.
from sklearn.preprocessing import MaxAbsScaler
scaler = MaxAbsScaler()
scaled = scaler.fit_transform(df)
scaled_df = pd.DataFrame(data=scaled, columns=df.columns)

In [75]:
# Show the MaxAbsScaler result so it can be compared with the manual version.
print(scaled_df)

        Age  Height_inch  Weight_lb
0  0.112360     0.634146   0.363636
1  0.393258     0.868293   0.909091
2  0.382022     0.756098   1.000000
3  0.258427     0.648780   0.681818
4  0.786517     0.951220   0.636364
5  0.617978     0.731707   0.431818
6  1.000000     1.000000   0.818182


In [76]:
# Min-max feature scaling of a pandas column, implemented with pandas only
def min_max_scaling(series):
    """Rescale ``series`` linearly so its minimum maps to 0 and its maximum to 1.

    Args:
        series: numeric pandas Series (one column of data).

    Returns:
        A new Series ``(x - min) / (max - min)``; the input is not modified.
        If all values are equal the range is 0 and the result is NaN.
    """
    lowest = series.min()
    value_range = series.max() - lowest
    return (series - lowest) / value_range

In [77]:
# Overwrite every column of df with its min-max-scaled version (in place).
for column_name in df.columns:
    rescaled_column = min_max_scaling(df[column_name])
    df[column_name] = rescaled_column

In [78]:
# Preview the first rows of df, which now holds the min-max-scaled values.
print(df.head())

        Age  Height_inch  Weight_lb
0  0.000000     0.000000   0.000000
1  0.316456     0.640000   0.857143
2  0.303797     0.333333   1.000000
3  0.164557     0.040000   0.500000
4  0.759494     0.866667   0.428571


In [79]:
# Normalize a Pandas Column with Min-Max Feature Scaling using scikit-learn
from sklearn.preprocessing import MinMaxScaler

In [80]:
# Min-max scaling with scikit-learn.
# fit_transform() learns the per-column min/max and applies the scaling in one
# step, so a separate fit() call beforehand is redundant.
# NOTE: df was already min-max scaled in place by the manual loop earlier in
# the notebook (each column spans 0..1), so this transform returns the values
# unchanged. Run it on the raw data for a real comparison.
scaler = MinMaxScaler()
scaled = scaler.fit_transform(df)
scaled_df = pd.DataFrame(scaled, columns=df.columns)

In [81]:
# Show the MinMaxScaler result so it can be compared with the manual version.
print(scaled_df)

        Age  Height_inch  Weight_lb
0  0.000000     0.000000   0.000000
1  0.316456     0.640000   0.857143
2  0.303797     0.333333   1.000000
3  0.164557     0.040000   0.500000
4  0.759494     0.866667   0.428571
5  0.569620     0.266667   0.107143
6  1.000000     1.000000   0.714286


In [82]:
# Z-score formula
# x_std = (x - μ) / σ
# x_130 = (130 - 163.71) / 29.53 = -1.14
# x_178 = (178 - 163.71) / 29.53 = 0.484

In [83]:
def z_score_standardization(series, ddof=1):
    """Standardize ``series`` to zero mean and unit standard deviation.

    Computes ``(x - mean) / std``.

    Args:
        series: numeric pandas Series (one column of data).
        ddof: delta degrees of freedom passed to ``Series.std``. The default
            of 1 (sample standard deviation, pandas' own default) keeps the
            original behaviour of this function. Pass ``ddof=0`` to divide by
            the population standard deviation, which is what scikit-learn's
            ``StandardScaler`` uses; the two results differ by a factor of
            ``sqrt(n / (n - 1))``.

    Returns:
        A new Series of z-scores; the input is not modified. If the standard
        deviation is 0 or undefined (constant column, or a single value with
        ``ddof=1``) the result is NaN.
    """
    return (series - series.mean()) / series.std(ddof=ddof)

In [84]:
# Overwrite every column of df with its z-scores (in place).
for column_name in df.columns:
    standardized_column = z_score_standardization(df[column_name])
    df[column_name] = standardized_column

In [85]:
# df now holds the z-scores computed column by column above.
print(df)

        Age  Height_inch  Weight_lb
0 -1.270474    -1.141772  -1.384428
1 -0.366682     0.483802   0.918383
2 -0.402833    -0.295119   1.302185
3 -0.800502    -1.040174  -0.041122
4  0.898628     1.059526  -0.233023
5  0.356352    -0.464450  -1.096577
6  1.585510     1.398187   0.534581


In [86]:
# Library-based approach:
# standardize the columns with z-score scaling using scikit-learn.
from sklearn.preprocessing import StandardScaler

# fit_transform() learns the per-column mean/std and applies the scaling in
# one step, so a separate fit() call beforehand is redundant.
# NOTE: these numbers differ from the manual z-scores printed earlier because
# StandardScaler divides by the population standard deviation (ddof=0), while
# pandas' Series.std() defaults to the sample standard deviation (ddof=1).
# With 7 rows the ratio between the two results is sqrt(7 / 6), about 1.08.
scaler = StandardScaler()
scaled = scaler.fit_transform(df)

scaled_df = pd.DataFrame(scaled, columns=df.columns)

print(scaled_df)

        Age  Height_inch  Weight_lb
0 -1.372269    -1.233255  -1.495353
1 -0.396061     0.522566   0.991967
2 -0.435110    -0.318765   1.406520
3 -0.864641    -1.123516  -0.044416
4  0.970629     1.144419  -0.251693
5  0.384905    -0.501663  -1.184438
6  1.712547     1.510215   0.577414
