In [1]:
import pandas as pd
import numpy as np

## 데이터 변환

In [2]:
n_samples = 10
height = 3*np.random.randn(n_samples).round() + 170
nationality = np.random.randint(0,3,n_samples)

In [4]:
height, nationality

(array([170., 167., 173., 164., 167., 170., 164., 173., 170., 173.]),
 array([0, 0, 0, 0, 1, 2, 2, 0, 1, 1]))

In [6]:
list(zip(height, nationality))

[(170.0, 0),
 (167.0, 0),
 (173.0, 0),
 (164.0, 0),
 (167.0, 1),
 (170.0, 2),
 (164.0, 2),
 (173.0, 0),
 (170.0, 1),
 (173.0, 1)]

In [8]:
df = pd.DataFrame(list(zip(height, nationality)), 
                  columns=["height","nationality"])
df.head()

Unnamed: 0,height,nationality
0,170.0,0
1,167.0,0
2,173.0,0
3,164.0,0
4,167.0,1


In [9]:
nat = pd.get_dummies(df['nationality'], prefix='nat_')

In [10]:
nat

Unnamed: 0,nat__0,nat__1,nat__2
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,0,1,0
5,0,0,1
6,0,0,1
7,1,0,0
8,0,1,0
9,0,1,0


In [11]:
new_df = pd.concat([df, nat], axis=1); new_df.head()

Unnamed: 0,height,nationality,nat__0,nat__1,nat__2
0,170.0,0,1,0,0
1,167.0,0,1,0,0
2,173.0,0,1,0,0
3,164.0,0,1,0,0
4,167.0,1,0,1,0


In [12]:
new_df.drop('nationality', axis=1, inplace=True)

In [13]:
new_df

Unnamed: 0,height,nat__0,nat__1,nat__2
0,170.0,1,0,0
1,167.0,1,0,0
2,173.0,1,0,0
3,164.0,1,0,0
4,167.0,0,1,0
5,170.0,0,0,1
6,164.0,0,0,1
7,173.0,1,0,0
8,170.0,0,1,0
9,173.0,0,1,0


In [14]:
nationality

array([0, 0, 0, 0, 1, 2, 2, 0, 1, 1])

In [15]:
nat_categ = pd.Categorical(nationality)
nat_categ

[0, 0, 0, 0, 1, 2, 2, 0, 1, 1]
Categories (3, int64): [0, 1, 2]

In [16]:
# 주의: Series로 변경된다
df['categ'] = nat_categ

In [17]:
df

Unnamed: 0,height,nationality,categ
0,170.0,0,0
1,167.0,0,0
2,173.0,0,0
3,164.0,0,0
4,167.0,1,1
5,170.0,2,2
6,164.0,2,2
7,173.0,0,0
8,170.0,1,1
9,173.0,1,1


In [18]:
type(df.categ)

pandas.core.series.Series

In [19]:
type(nat_categ)

pandas.core.arrays.categorical.Categorical

## 표준 스케일링

In [42]:
height = 3*np.random.randn(n_samples).round() + 170
weight = 4*np.random.randn(n_samples).round() + 70

X = pd.DataFrame(list(zip(height, weight)));X.head()

Unnamed: 0,0,1
0,170.0,66.0
1,170.0,70.0
2,170.0,74.0
3,173.0,78.0
4,173.0,74.0


In [39]:
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X);X_std

array([[-0.46852129,  0.73730873],
       [-0.46852129, -0.08192319],
       [-0.46852129,  1.55654065],
       [ 1.09321633, -0.08192319],
       [-2.0302589 ,  1.55654065],
       [-0.46852129, -0.08192319],
       [ 1.09321633, -0.08192319],
       [ 1.09321633, -0.90115511],
       [-0.46852129, -1.72038703],
       [ 1.09321633, -0.90115511]])

In [40]:
x=X.values; x

array([[167.,  74.],
       [167.,  70.],
       [167.,  78.],
       [170.,  70.],
       [164.,  78.],
       [167.,  70.],
       [170.,  70.],
       [170.,  66.],
       [167.,  62.],
       [170.,  66.]])

In [41]:
x_std = StandardScaler().fit_transform(x);x_std

array([[-0.46852129,  0.73730873],
       [-0.46852129, -0.08192319],
       [-0.46852129,  1.55654065],
       [ 1.09321633, -0.08192319],
       [-2.0302589 ,  1.55654065],
       [-0.46852129, -0.08192319],
       [ 1.09321633, -0.08192319],
       [ 1.09321633, -0.90115511],
       [-0.46852129, -1.72038703],
       [ 1.09321633, -0.90115511]])