In [71]:

import pandas as pd

# Toy score table for the binning examples: twelve rows with one Math and
# one Eng score each.
math_scores = [4, 4, 7, 8, 12, 13, 15, 18, 22, 23, 23, 25]
eng_scores = [2, 5, 4, 7, 7, 8, 5, 4, 5, 11, 13, 8]
df = pd.DataFrame(data={'Math': math_scores, 'Eng': eng_scores})


In [72]:
# Only the last expression of a cell is echoed, so a bare `df.shape` above it
# is evaluated and thrown away. Print the shape explicitly, then let the
# frame itself be the cell's displayed result.
print(df.shape)
df


Unnamed: 0,Math,Eng
0,4,2
1,4,5
2,7,4
3,8,7
4,12,7
5,13,8
6,15,5
7,18,4
8,22,5
9,23,11


In [73]:

# Quantile-based binning: split Math into 5 bins whose edges are computed
# from the data so that each bin holds roughly the same number of rows.
math_quintiles = pd.qcut(df['Math'], q=5)
df['bin1'] = math_quintiles
df


Unnamed: 0,Math,Eng,bin1
0,4,2,"(3.999, 7.2]"
1,4,5,"(3.999, 7.2]"
2,7,4,"(3.999, 7.2]"
3,8,7,"(7.2, 12.4]"
4,12,7,"(7.2, 12.4]"
5,13,8,"(12.4, 16.8]"
6,15,5,"(12.4, 16.8]"
7,18,4,"(16.8, 22.8]"
8,22,5,"(16.8, 22.8]"
9,23,11,"(22.8, 25.0]"


In [74]:

# Number of rows that landed in each quantile bin (most frequent first).
df.bin1.value_counts()


bin1
(3.999, 7.2]    3
(22.8, 25.0]    3
(7.2, 12.4]     2
(12.4, 16.8]    2
(16.8, 22.8]    2
Name: count, dtype: int64

In [75]:

# Same quantile binning, but with the quantile edges spelled out explicitly
# instead of passing the number of bins.
quintile_edges = [0, .2, .4, .6, .8, 1]
df['bin2'] = pd.qcut(df['Math'], q=quintile_edges)
df


Unnamed: 0,Math,Eng,bin1,bin2
0,4,2,"(3.999, 7.2]","(3.999, 7.2]"
1,4,5,"(3.999, 7.2]","(3.999, 7.2]"
2,7,4,"(3.999, 7.2]","(3.999, 7.2]"
3,8,7,"(7.2, 12.4]","(7.2, 12.4]"
4,12,7,"(7.2, 12.4]","(7.2, 12.4]"
5,13,8,"(12.4, 16.8]","(12.4, 16.8]"
6,15,5,"(12.4, 16.8]","(12.4, 16.8]"
7,18,4,"(16.8, 22.8]","(16.8, 22.8]"
8,22,5,"(16.8, 22.8]","(16.8, 22.8]"
9,23,11,"(22.8, 25.0]","(22.8, 25.0]"


In [76]:

# Explicit quantile edges again, this time replacing the interval labels
# with single-letter labels (one per bin, lowest bin first).
df['bin3'] = pd.qcut(
    df['Math'],
    q=[0, .2, .4, .6, .8, 1],
    labels=list('ABCDE'),
)
df


Unnamed: 0,Math,Eng,bin1,bin2,bin3
0,4,2,"(3.999, 7.2]","(3.999, 7.2]",A
1,4,5,"(3.999, 7.2]","(3.999, 7.2]",A
2,7,4,"(3.999, 7.2]","(3.999, 7.2]",A
3,8,7,"(7.2, 12.4]","(7.2, 12.4]",B
4,12,7,"(7.2, 12.4]","(7.2, 12.4]",B
5,13,8,"(12.4, 16.8]","(12.4, 16.8]",C
6,15,5,"(12.4, 16.8]","(12.4, 16.8]",C
7,18,4,"(16.8, 22.8]","(16.8, 22.8]",D
8,22,5,"(16.8, 22.8]","(16.8, 22.8]",D
9,23,11,"(22.8, 25.0]","(22.8, 25.0]",E


In [77]:

# Binning with hand-picked edges rather than quantiles:
# (0, 5] -> Bad, (5, 8] -> OK, (8, inf) -> Good.
# pd.cut intervals are closed on the right by default.
eng_edges = [0, 5, 8, float('Inf')]
eng_labels = ['Bad', 'OK', 'Good']
df['bin4'] = pd.cut(df['Eng'], bins=eng_edges, labels=eng_labels)
df


Unnamed: 0,Math,Eng,bin1,bin2,bin3,bin4
0,4,2,"(3.999, 7.2]","(3.999, 7.2]",A,Bad
1,4,5,"(3.999, 7.2]","(3.999, 7.2]",A,Bad
2,7,4,"(3.999, 7.2]","(3.999, 7.2]",A,Bad
3,8,7,"(7.2, 12.4]","(7.2, 12.4]",B,OK
4,12,7,"(7.2, 12.4]","(7.2, 12.4]",B,OK
5,13,8,"(12.4, 16.8]","(12.4, 16.8]",C,OK
6,15,5,"(12.4, 16.8]","(12.4, 16.8]",C,Bad
7,18,4,"(16.8, 22.8]","(16.8, 22.8]",D,Bad
8,22,5,"(16.8, 22.8]","(16.8, 22.8]",D,Bad
9,23,11,"(22.8, 25.0]","(22.8, 25.0]",E,Good


In [78]:

# Number of rows per Eng category (most frequent first).
df.bin4.value_counts()


bin4
Bad     6
OK      4
Good    2
Name: count, dtype: int64

In [79]:


import pandas as pd

# Every GDP value needs exactly one matching country. zip() silently
# truncates to its shortest input, which hid two GDP values that had no
# country; keep both lists the same length and build the frame from a dict,
# so any future length mismatch raises ValueError instead of dropping data.
gdp = [11, 22, 33, 44, 55]
countries = ['Spain', 'France', 'Spain', 'Germany', 'France']
df = pd.DataFrame({'GDP': gdp, 'Countries': countries})
df


Unnamed: 0,GDP,Countries
0,11,Spain
1,22,France
2,33,Spain
3,44,Germany
4,55,France


In [80]:

from sklearn import preprocessing
# Label encoding: map each distinct country name to an integer code.
# Classes are sorted, so here France=0, Germany=1, Spain=2.
label_encoder = preprocessing.LabelEncoder()
country_codes = label_encoder.fit_transform(df['Countries'])
df['labelCountries'] = country_codes
df


Unnamed: 0,GDP,Countries,labelCountries
0,11,Spain,2
1,22,France,0
2,33,Spain,2
3,44,Germany,1
4,55,France,0


In [81]:

# One-hot encoding with pandas: one indicator column per country, named
# Country_<name>.
dfOneHotEncoding = pd.get_dummies(df['Countries'], prefix='Country')
print(dfOneHotEncoding.head())


   Country_France  Country_Germany  Country_Spain
0           False            False           True
1            True            False          False
2           False            False           True
3           False             True          False
4            True            False          False


In [82]:

# Append the one-hot columns to the right of the frame, aligned on the row
# index.
# NOTE(review): this rebinds `df`, so re-running the cell appends the
# indicator columns a second time.
df = pd.concat(objs=[df, dfOneHotEncoding], axis='columns')
df


Unnamed: 0,GDP,Countries,labelCountries,Country_France,Country_Germany,Country_Spain
0,11,Spain,2,False,False,True
1,22,France,0,True,False,False
2,33,Spain,2,False,False,True
3,44,Germany,1,False,True,False
4,55,France,0,True,False,False


In [83]:

from sklearn.preprocessing import LabelBinarizer
# scikit-learn alternative: LabelBinarizer produces the one-hot layout as a
# 0/1 integer NumPy array.
country_binarizer = LabelBinarizer()
y = country_binarizer.fit_transform(df['Countries'])
y


array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [1, 0, 0]])

In [84]:

from sklearn.preprocessing import OneHotEncoder
# OneHotEncoder expects 2-D input (hence the double brackets) and returns a
# sparse matrix, so convert it to a dense array before printing.
country_encoder = OneHotEncoder()
encoded = country_encoder.fit_transform(df[['Countries']])
y = encoded.toarray()
print(y)


[[0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]]


In [85]:


import pandas as pd

# Yearly salary figures for two series (HCM and LA), used by the scaling
# examples.
year = [2010, 2011, 2012, 2013, 2014, 2015]
salaryHCM = [8.5, 9.0, 10.5, 12.0, 11.0, 10.0]
salaryLA = [5.5, 6.0, 6.0, 5.5, 6.0, 5.0]

df = pd.DataFrame({'Year': year, 'HCM': salaryHCM, 'LA': salaryLA})
df


Unnamed: 0,Year,HCM,LA
0,2010,8.5,5.5
1,2011,9.0,6.0
2,2012,10.5,6.0
3,2013,12.0,5.5
4,2014,11.0,6.0
5,2015,10.0,5.0


In [86]:

# Max-abs scaling by hand: divide LA by its largest absolute value so the
# scaled column lies within [-1, 1].
max_la = df['LA'].abs().max()
df['macLA'] = df['LA'].div(max_la)

df


Unnamed: 0,Year,HCM,LA,macLA
0,2010,8.5,5.5,0.916667
1,2011,9.0,6.0,1.0
2,2012,10.5,6.0,1.0
3,2013,12.0,5.5,0.916667
4,2014,11.0,6.0,1.0
5,2015,10.0,5.0,0.833333


In [87]:

# Max-abs scaling by hand for HCM: divide by its largest absolute value.
max_hcm = df['HCM'].abs().max()
df['macHCM'] = df['HCM'].div(max_hcm)

df

Unnamed: 0,Year,HCM,LA,macLA,macHCM
0,2010,8.5,5.5,0.916667,0.708333
1,2011,9.0,6.0,1.0,0.75
2,2012,10.5,6.0,1.0,0.875
3,2013,12.0,5.5,0.916667,1.0
4,2014,11.0,6.0,1.0,0.916667
5,2015,10.0,5.0,0.833333,0.833333


In [88]:

from sklearn.preprocessing import MaxAbsScaler
# The same max-abs scaling via scikit-learn, fitted on both salary columns
# at once. fit_transform returns a NumPy array, so wrap it back into a
# DataFrame with the original column names.
salary_cols = ['HCM', 'LA']
scaler = MaxAbsScaler()
scaled = scaler.fit_transform(df[salary_cols])
scaled_df = pd.DataFrame(scaled, columns=salary_cols)

print(scaled_df)


        HCM        LA
0  0.708333  0.916667
1  0.750000  1.000000
2  0.875000  1.000000
3  1.000000  0.916667
4  0.916667  1.000000
5  0.833333  0.833333


In [89]:

# Scale a new (HCM, LA) observation with the already-fitted scaler.
# The scaler was fitted on a DataFrame with named columns, so pass a
# DataFrame with the same column names: a bare nested list makes
# scikit-learn warn that X has no valid feature names, and it relies on the
# reader remembering the column order.
new_point = pd.DataFrame([[4.5, 5.75]], columns=['HCM', 'LA'])
vSc = scaler.transform(new_point)
vSc



array([[0.375     , 0.95833333]])

In [90]:

# Scale another (HCM, LA) observation with the already-fitted scaler.
# Pass a DataFrame carrying the column names the scaler was fitted with;
# a bare nested list triggers scikit-learn's "X does not have valid feature
# names" warning and hides which value belongs to which column.
new_point = pd.DataFrame([[11.0, 4.5]], columns=['HCM', 'LA'])
vSc = scaler.transform(new_point)
vSc




array([[0.91666667, 0.75      ]])

In [91]:

# Map scaled (HCM, LA) values back to the original units -- the reverse of
# transform().
scaled_point = [[0.5, 0.8]]
vR = scaler.inverse_transform(scaled_point)
vR


array([[6. , 4.8]])

In [97]:

# NOTE(review): this cell's execution count (97) is out of sequence, and
# `scaler` is rebound further down the notebook (MinMaxScaler, then
# StandardScaler), so the result depends on which fit ran last. The recorded
# output appears to come from the StandardScaler fit rather than the
# MaxAbsScaler fit above -- confirm, and move the cell next to the scaler it
# is meant to demonstrate.
vR = scaler.inverse_transform([[0.65,1.9]])
vR

array([[10.93269901,  6.37475486]])

In [92]:

# Min-max scaling by hand: shift LA by its minimum and divide by its range so
# the scaled column spans [0, 1].
min_la, max_la = df['LA'].min(), df['LA'].max()
la_range = max_la - min_la
df['mmcLA'] = (df['LA'] - min_la) / la_range

df


Unnamed: 0,Year,HCM,LA,macLA,macHCM,mmcLA
0,2010,8.5,5.5,0.916667,0.708333,0.5
1,2011,9.0,6.0,1.0,0.75,1.0
2,2012,10.5,6.0,1.0,0.875,1.0
3,2013,12.0,5.5,0.916667,1.0,0.5
4,2014,11.0,6.0,1.0,0.916667,1.0
5,2015,10.0,5.0,0.833333,0.833333,0.0


In [98]:
# Summary statistics (count, mean, std, min, quartiles, max) for the two
# salary columns.
df.loc[:, ['HCM', 'LA']].describe()

Unnamed: 0,HCM,LA
count,6.0,6.0
mean,10.166667,5.666667
std,1.290994,0.408248
min,8.5,5.0
25%,9.25,5.5
50%,10.25,5.75
75%,10.875,6.0
max,12.0,6.0


In [93]:

from sklearn.preprocessing import MinMaxScaler

# Min-max scaling via scikit-learn on both salary columns. Note that this
# rebinds `scaler`, replacing the previously fitted scaler object.
salary_cols = ['HCM', 'LA']
scaler = MinMaxScaler()
scaled = scaler.fit_transform(df[salary_cols])
scaled_df = pd.DataFrame(scaled, columns=salary_cols)

print(scaled_df)



        HCM   LA
0  0.000000  0.5
1  0.142857  1.0
2  0.571429  1.0
3  1.000000  0.5
4  0.714286  1.0
5  0.428571  0.0


In [94]:

# Z-score standardisation by hand: subtract the mean, divide by the standard
# deviation. Series.std() defaults to the sample standard deviation (ddof=1),
# whereas scikit-learn's StandardScaler divides by the population standard
# deviation (ddof=0), so these values differ slightly from StandardScaler's.
mean_la, std_la = df['LA'].mean(), df['LA'].std()
df['zLA'] = df['LA'].sub(mean_la).div(std_la)

df


Unnamed: 0,Year,HCM,LA,macLA,macHCM,mmcLA,zLA
0,2010,8.5,5.5,0.916667,0.708333,0.5,-0.408248
1,2011,9.0,6.0,1.0,0.75,1.0,0.816497
2,2012,10.5,6.0,1.0,0.875,1.0,0.816497
3,2013,12.0,5.5,0.916667,1.0,0.5,-0.408248
4,2014,11.0,6.0,1.0,0.916667,1.0,0.816497
5,2015,10.0,5.0,0.833333,0.833333,0.0,-1.632993


In [99]:

from sklearn.preprocessing import StandardScaler

# Z-score standardisation via scikit-learn on both salary columns. Note that
# this rebinds `scaler`, replacing the previously fitted scaler object.
salary_cols = ['HCM', 'LA']
scaler = StandardScaler()
scaled = scaler.fit_transform(df[salary_cols])

scaled_df = pd.DataFrame(scaled, columns=salary_cols)

print(scaled_df)


        HCM        LA
0 -1.414214 -0.447214
1 -0.989949  0.894427
2  0.282843  0.894427
3  1.555635 -0.447214
4  0.707107  0.894427
5 -0.141421 -1.788854


In [96]:

def funcX(x):
    """Return ``x * x + 1``.

    Works on any operand that supports ``*`` and ``+`` with an int, so a
    scalar gives a scalar and a pandas object is transformed element-wise.
    """
    squared = x * x
    return squared + 1

# Apply the custom transform to the LA column. Select the column as a Series
# (single brackets): double brackets yield a one-column DataFrame, and
# assigning a DataFrame to a single column only works by special-casing.
df['TransformX'] = funcX(df['LA'])

df


Unnamed: 0,Year,HCM,LA,macLA,macHCM,mmcLA,zLA,TransformX
0,2010,8.5,5.5,0.916667,0.708333,0.5,-0.408248,31.25
1,2011,9.0,6.0,1.0,0.75,1.0,0.816497,37.0
2,2012,10.5,6.0,1.0,0.875,1.0,0.816497,37.0
3,2013,12.0,5.5,0.916667,1.0,0.5,-0.408248,31.25
4,2014,11.0,6.0,1.0,0.916667,1.0,0.816497,37.0
5,2015,10.0,5.0,0.833333,0.833333,0.0,-1.632993,26.0


Bài tập
Sinh viên vẽ biểu đồ phân phối lương trung bình của HCM
Tính skewness và kurtosis của lương trung bình HCM
Sử dụng hàm log để transform dữ liệu
Vẽ biểu đồ phân phối của dữ liệu lương trung bình của HCM sau khi transform
Tính skewness và kurtosis của dữ liệu lương trung bình HCM sau khi transform
