In [11]:
import pandas as pd
from io import StringIO

csv_data = \
'''A,B,C,D
 1.0,2.0,3.0,4.0
 5.0,6.0,,.0
10.0,11.0,12.0,'''
df = pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,0.0
2,10.0,11.0,12.0,


In [12]:
df.isnull().sum()

A    0
B    0
C    1
D    1
dtype: int64

In [13]:
df.dropna(axis=0)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [14]:
df.dropna(axis=1)

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [15]:
df.dropna(how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,0.0
2,10.0,11.0,12.0,


In [16]:
df.dropna(thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [17]:
df.dropna(subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


In [18]:
from sklearn.impute import SimpleImputer
import numpy as np
imr = SimpleImputer(missing_values=np.nan, strategy='mean')
imr = imr.fit(df.values)
imputed_data = imr.transform(df.values)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  0. ],
       [10. , 11. , 12. ,  2. ]])

In [20]:
from sklearn.preprocessing import FunctionTransformer
ftr_imr = FunctionTransformer(lambda X: imr.fit_transform(X.T).T, validate=False)
imputed_data = ftr_imr.fit_transform(df.values)
imputed_data

array([[ 1.        ,  2.        ,  3.        ,  4.        ],
       [ 5.        ,  6.        ,  3.66666667,  0.        ],
       [10.        , 11.        , 12.        , 11.        ]])

In [23]:
imr = SimpleImputer(add_indicator=True)
imputed_data = imr.fit_transform(df.values)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ,  0. ,  0. ],
       [ 5. ,  6. ,  7.5,  0. ,  1. ,  0. ],
       [10. , 11. , 12. ,  2. ,  0. ,  1. ]])

In [24]:
imr.indicator_.features_

array([2, 3], dtype=int64)

In [26]:
imr.indicator_.fit_transform(df.values)

array([[False, False],
       [ True, False],
       [False,  True]])

In [27]:
imr.inverse_transform(imputed_data)

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  0.],
       [10., 11., 12., nan]])

In [28]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
iimr = IterativeImputer()
iimr.fit_transform(df.values)

array([[ 1.        ,  2.        ,  3.        ,  4.        ],
       [ 5.        ,  6.        ,  7.00047063,  0.        ],
       [10.        , 11.        , 12.        , -4.99964527]])

In [29]:
from sklearn.impute import KNNImputer
kimr = KNNImputer()
kimr.fit_transform(df.values)

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  0. ],
       [10. , 11. , 12. ,  2. ]])

In [30]:
df.fillna(df.mean())

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,0.0
2,10.0,11.0,12.0,2.0


In [33]:
import pandas as pd
df = pd.DataFrame([
    ['green', 'M', 10.1, 'class.'],
    ['red', 'L', 12.5,'class.'],
    ['blue', 'XL', 15.3, 'class.']
])
df.columns = ['color', 'size','price','classlabel']
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class.
1,red,L,12.5,class.
2,blue,XL,15.3,class.


In [34]:
size_mapping = {
    'XL':3,
    'L':2,
    'M':1
}
df['size'] = df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class.
1,red,2,12.5,class.
2,blue,3,15.3,class.


In [35]:
inv_size_mapping = {v: k for k, v in size_mapping.items()}
df['size'].map(inv_size_mapping)

0     M
1     L
2    XL
Name: size, dtype: object

In [37]:
import numpy as np
class_mapping = {label:idx for idx,label in
                enumerate(np.unique(df['classlabel']))}
class_mapping

{'class.': 0}

In [38]:
df['classlabel'] = df['classlabel'].map(class_mapping)

In [39]:
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,0
1,red,2,12.5,0
2,blue,3,15.3,0


In [40]:
inv_class_mapping = {v: k for k, v in class_mapping.items()}
df['classlabel'] = df['classlabel'].map(inv_class_mapping)

In [41]:
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class.
1,red,2,12.5,class.
2,blue,3,15.3,class.


In [42]:
from sklearn.preprocessing import LabelEncoder
class_le = LabelEncoder()
y = class_le.fit_transform(df['classlabel'].values)
y

array([0, 0, 0])

In [43]:
class_le.inverse_transform(y)

array(['class.', 'class.', 'class.'], dtype=object)

In [44]:
X = df[['color', 'size', 'price']].values
color_le = LabelEncoder()
X[:, 0] = color_le.fit_transform(X[:, 0])
X

array([[1, 1, 10.1],
       [2, 2, 12.5],
       [0, 3, 15.3]], dtype=object)

In [46]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
ord_enc = OrdinalEncoder(dtype=np.int)
col_trans = ColumnTransformer([('ord_enc', ord_enc, ['color'])])
X_trans = col_trans.fit_transform(df)
X_trans

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  ord_enc = OrdinalEncoder(dtype=np.int)


array([[1],
       [2],
       [0]])

In [47]:
col_trans.named_transformers_['ord_enc'].inverse_transform(X_trans)

array([['green'],
       ['red'],
       ['blue']], dtype=object)

In [48]:
from sklearn.preprocessing import OneHotEncoder
X = df[['color', 'size', 'price']].values
color_ohe = OneHotEncoder()

In [49]:
color_ohe.fit_transform(X[:, 0].reshape(-1, 1)).toarray()

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [51]:
from sklearn.compose import ColumnTransformer
X = df[['color', 'size','price']].values
c_transf = ColumnTransformer([
    ('onehot', OneHotEncoder(), [0]),
    ('nothing', 'passthrough', [1, 2])
])
c_transf.fit_transform(X)

array([[0.0, 1.0, 0.0, 1, 10.1],
       [0.0, 0.0, 1.0, 2, 12.5],
       [1.0, 0.0, 0.0, 3, 15.3]], dtype=object)

In [52]:
# 판다스 get_dummies 사용
pd.get_dummies(df[['price', 'color', 'size']])

Unnamed: 0,price,size,color_blue,color_green,color_red
0,10.1,1,0,1,0
1,12.5,2,0,0,1
2,15.3,3,1,0,0


In [54]:
pd.get_dummies(df[['price','color','size']], drop_first=True)

Unnamed: 0,price,size,color_green,color_red
0,10.1,1,1,0
1,12.5,2,0,1
2,15.3,3,0,0


In [56]:
color_ohe = OneHotEncoder(categories='auto', drop='first')
c_transf = ColumnTransformer([
    ('onehot', color_ohe, [0]),
    ('nothing','passthrough', [1, 2])
])
c_transf.fit_transform(X)

array([[1.0, 0.0, 1, 10.1],
       [0.0, 1.0, 2, 12.5],
       [0.0, 0.0, 3, 15.3]], dtype=object)

In [57]:
# 수치적 크기에 대한 불확신 + 두 범주 사이 순서 정의가 힘들다면 임계 값 사용
df = pd.DataFrame([['green', 'M, 1-.1, class2'],
                  ['red','L',13.5,'class1'],
                  ['blue', 'XL',15.3,'class2']])
df.columns = ['colr', 'size', 'price', 'classlabel']
df

Unnamed: 0,colr,size,price,classlabel
0,green,"M, 1-.1, class2",,
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [58]:
# 판다스 df의 apply 메소드 사용-> 임계값 기준 특성 인코딩하는 lambda 함수 적용 가능
df['x > M'] = df['size'].apply(lambda x: 1 if x in {'L', 'XL'} else 0)
df['x > L'] = df['size'].apply(lambda x: 1 if x == 'XL' else 0)
del df['size']
df

Unnamed: 0,colr,price,classlabel,x > M,x > L
0,green,,,0,0
1,red,13.5,class1,1,0
2,blue,15.3,class2,1,1


In [61]:
df_wine = pd.read_csv('C:/Users/dlsgu/OneDrive/바탕 화면/머신러닝 공부/ch04/wine.data', header=None)

In [62]:
df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium',
                  'Total phenols', 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocanins', 'Color intensity', 'Hue', 'OD280/0D315 of diluted wines', 'Proline']
df_wine.head()

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocanins,Color intensity,Hue,OD280/0D315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [63]:
from sklearn.model_selection import train_test_split
X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                   test_size=0.3,
                                                   random_state=0,
                                                   stratify=y)

In [66]:
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
X_train_norm = mms.fit_transform(X_train)
X_test_norm = mms.transform(X_test)

In [68]:
from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)

In [69]:
from sklearn.preprocessing import RobustScaler
rbs = RobustScaler()

In [70]:
X_train_robust = rbs.fit_transform(X_train)
X_test_robust = rbs.fit_transform(X_test)

In [73]:
ex = np.array([0,1,2,3,4,5])


In [74]:
(ex - np.percentile(ex, 50)) / (np.percentile(ex, 75) - np.percentile(ex, 25))

array([-1. , -0.6, -0.2,  0.2,  0.6,  1. ])

In [75]:
from sklearn.preprocessing import MaxAbsScaler
mas = MaxAbsScaler()
X_train_maxabs = mas.fit_transform(X_train)
X_test_maxabs = mas.transform(X_test)

In [76]:
ex / np.max(np.abs(ex))

array([0. , 0.2, 0.4, 0.6, 0.8, 1. ])