In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

### 데이터 불러오기

In [2]:
NBA_FILE_PATH = 'NBA_player_of_the_week.csv'
nba_player_of_the_week_df = pd.read_csv(NBA_FILE_PATH)

In [3]:
nba_player_of_the_week_df.head()

Unnamed: 0,Player,Team,Conference,Date,Position,Height,Weight,Age,Draft Year,Seasons in league,Season,Season short,Pre-draft Team,Real_value,Height CM,Weight KG,Last Season
0,Jayson Tatum,Boston Celtics,East,"Feb 10, 2020",SF,6'8,208,21,2017,2,2019-2020,2020,Duke,0.5,203,94,1
1,Nikola Jokic,Denver Nuggets,West,"Feb 10, 2020",C,7'0,250,25,2014,4,2019-2020,2020,KK Mega Bemax (Serbia),0.5,213,113,1
2,Jaylen Brown,Boston Celtics,East,"Feb 3, 2020",SF,6'7,220,23,2016,3,2019-2020,2020,California,0.5,201,99,1
3,Damian Lillard,Portland Trail Blazers,West,"Feb 3, 2020",G,6'3,195,29,2012,7,2019-2020,2020,Weber State,0.5,190,88,1
4,Pascal Siakam,Toronto Raptors,East,"Jan 27, 2020",F,6'9,230,25,2016,3,2019-2020,2020,New Mexico State,0.5,206,104,1


#### 각 열에 대한 통계 확인

In [4]:
nba_player_of_the_week_df.describe()

Unnamed: 0,Weight,Age,Draft Year,Seasons in league,Season short,Real_value,Height CM,Weight KG,Last Season
count,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0
mean,224.567164,26.73806,1996.287313,5.740299,2003.156716,0.68694,201.071642,101.384328,0.023881
std,30.798885,3.400683,11.253558,3.293421,11.470164,0.242007,9.36797,14.011226,0.152734
min,150.0,19.0,1965.0,0.0,1980.0,0.5,175.0,68.0,0.0
25%,205.0,24.0,1987.0,3.0,1994.0,0.5,193.0,93.0,0.0
50%,220.0,26.0,1998.0,5.0,2005.0,0.5,201.0,99.0,0.0
75%,250.0,29.0,2005.0,8.0,2013.0,1.0,208.0,113.0,0.0
max,325.0,40.0,2018.0,17.0,2020.0,1.0,229.0,147.0,1.0


In [5]:
height_weight_age_df = nba_player_of_the_week_df[['Height CM', 'Weight KG','Age']]
height_weight_age_df.head()

Unnamed: 0,Height CM,Weight KG,Age
0,203,94,21
1,213,113,25
2,201,99,23
3,190,88,29
4,206,104,25


## Feature Scaling
입력 변수들의 크기를 조정해서 일정 범위 내에 떨어지도록 바꾸는 것<br>
선형 회귀뿐만 아니라 경사 하강법을 사용하는 모든 알고리즘의 연산속도를 빠르게 해준다<br>
횟수 자체는 동일하지만 경사 하강시 정규화로인해 하강 거리가 가까워지기 때문

### MinMaxScaler
최댓값, 최솟값을 이용해서 변수의 크기를 0과 1사이로 조정
* 경사하강법을 좀 더 빨리 할 수 있도록 도와준다

In [7]:
scaler = preprocessing.MinMaxScaler()
normalized_data = scaler.fit_transform(height_weight_age_df)
normalized_data

array([[0.51851852, 0.32911392, 0.0952381 ],
       [0.7037037 , 0.56962025, 0.28571429],
       [0.48148148, 0.39240506, 0.19047619],
       ...,
       [0.48148148, 0.37974684, 0.23809524],
       [0.38888889, 0.21518987, 0.23809524],
       [0.42592593, 0.27848101, 0.52380952]])

In [9]:
normalized_df = pd.DataFrame(normalized_data,columns=['Height','Weight','Age'])
normalized_df

Unnamed: 0,Height,Weight,Age
0,0.518519,0.329114,0.095238
1,0.703704,0.569620,0.285714
2,0.481481,0.392405,0.190476
3,0.277778,0.253165,0.476190
4,0.574074,0.455696,0.285714
...,...,...,...
1335,0.240741,0.139241,0.238095
1336,0.574074,0.594937,0.047619
1337,0.481481,0.379747,0.238095
1338,0.388889,0.215190,0.238095


#### 최댓값은 1, 최솟값은 0
데이터가 0과 1사이로 normalize되었음을 확인

In [10]:
normalized_df.describe()

Unnamed: 0,Height,Weight,Age
count,1340.0,1340.0,1340.0
mean,0.482808,0.422586,0.368479
std,0.173481,0.177357,0.161937
min,0.0,0.0,0.0
25%,0.333333,0.316456,0.238095
50%,0.481481,0.392405,0.333333
75%,0.611111,0.56962,0.47619
max,1.0,1.0,1.0


### Standardization

표준화를 하면 항상 새로운 데이터의 평균은 0, 표준 편차는 1이 된다.<br>
표준화를 해준 데이터를 z-score라고 하며, <br>
z-score는 데이터가 평균 값에서 몇 표준 편차만큼 떨어져 있는지를 뜻한다.

In [23]:
# 소수점 5번째 자리까지만 출력되도록 설정
pd.set_option('display.float_format', lambda x: '%.5f' % x)
    
nba_player_of_the_week_df = pd.read_csv(NBA_FILE_PATH)
    
# 데이터를 standardize 함
scaler = preprocessing.StandardScaler()
standardized_data = scaler.fit_transform(height_weight_age_df)
    
standardized_df = pd.DataFrame(standardized_data, columns=['Height', 'Weight', 'Age'])

In [24]:
standardized_df.describe()

Unnamed: 0,Height,Weight,Age
count,1340.0,1340.0,1340.0
mean,-0.0,-0.0,-0.0
std,1.00037,1.00037,1.00037
min,-2.7841,-2.38357,-2.27629
25%,-0.86194,-0.59862,-0.80545
50%,-0.00765,-0.17024,-0.21711
75%,0.73986,0.82934,0.66539
max,2.98237,3.25687,3.90124


### 간 질환 환자 데이터를 이용한 Normalization

In [26]:
liver_patients_df = pd.read_csv('liver_patient_data.csv')

# Normalization할 열 이름들
features_to_normalize = ['Total_Bilirubin','Direct_Bilirubin', 'Alkaline_Phosphotase', 'Alamine_Aminotransferase']

In [31]:
liver_patients_df = liver_patients_df.iloc[:,2:]

In [33]:
liver_patients_df.columns

Index(['Age', 'Gender', 'Total_Bilirubin', 'Direct_Bilirubin',
       'Alkaline_Phosphotase', 'Alamine_Aminotransferase',
       'Aspartate_Aminotransferase', 'Total_Protiens', 'Albumin',
       'Albumin_and_Globulin_Ratio', 'Dataset'],
      dtype='object')

* 'Total_Bilirubin'
* 'Direct_Bilirubin'
* 'Alkaline_Phosphotase'
* 'Alamine_Aminotransferase’
열들 데이터를 Normalize

In [37]:
liver_patients_df4 = liver_patients_df[['Total_Bilirubin', 'Direct_Bilirubin','Alkaline_Phosphotase','Alkaline_Phosphotase']]
liver_patients_df4.head()

Unnamed: 0,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alkaline_Phosphotase.1
0,0.7,0.1,187,187
1,10.9,5.5,699,699
2,7.3,4.1,490,490
3,1.0,0.4,182,182
4,3.9,2.0,195,195


In [38]:
scaler = preprocessing.MinMaxScaler()
normalized_data_liver = scaler.fit_transform(liver_patients_df4)
normalized_data_liver

array([[0.00402145, 0.        , 0.06057645, 0.06057645],
       [0.14075067, 0.2755102 , 0.31069858, 0.31069858],
       [0.0924933 , 0.20408163, 0.20859795, 0.20859795],
       ...,
       [0.00536193, 0.00510204, 0.0889106 , 0.0889106 ],
       [0.01206434, 0.02040816, 0.05911089, 0.05911089],
       [0.0080429 , 0.01020408, 0.07474353, 0.07474353]])

In [39]:
normalized_data_liver_df = pd.DataFrame(normalized_data_liver, columns= ['Total_Bilirubin', 'Direct_Bilirubin','Alkaline_Phosphotase','Alkaline_Phosphotase'])

In [40]:
normalized_data_liver_df

Unnamed: 0,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alkaline_Phosphotase.1
0,0.00402,0.00000,0.06058,0.06058
1,0.14075,0.27551,0.31070,0.31070
2,0.09249,0.20408,0.20860,0.20860
3,0.00804,0.01531,0.05813,0.05813
4,0.04692,0.09694,0.06448,0.06448
...,...,...,...,...
574,0.00134,0.00000,0.21348,0.21348
575,0.00268,0.00000,0.01710,0.01710
576,0.00536,0.00510,0.08891,0.08891
577,0.01206,0.02041,0.05911,0.05911


In [41]:
normalized_data_liver_df.describe()

Unnamed: 0,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alkaline_Phosphotase.1
count,579.0,579.0,579.0,579.0
mean,0.03908,0.07113,0.11156,0.11156
std,0.08348,0.1437,0.11898,0.11898
min,0.0,0.0,0.0,0.0
25%,0.00536,0.0051,0.05496,0.05496
50%,0.00804,0.0102,0.07084,0.07084
75%,0.02949,0.06122,0.1148,0.1148
max,1.0,1.0,1.0,1.0
