# 데이터 전처리

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as pl

In [26]:
df = pd.DataFrame([['green', 'L', 10.1, 'yes'], ['green','M',13.5,'no'],
                   ['red','M', 15.3, 'yes'],['blue','S',12.7,'yes']])
df.columns = ['color','size','price','label']

In [27]:
df

Unnamed: 0,color,size,price,label
0,green,L,10.1,yes
1,green,M,13.5,no
2,red,M,15.3,yes
3,blue,S,12.7,yes


In [28]:
X = df.drop('label',  axis = 1)
y = df['label']

In [29]:
y1 = np.where(y=='no', 0,1)

In [30]:
from sklearn.preprocessing import LabelEncoder

In [31]:
encoder = LabelEncoder()

In [32]:
encoder.classes_

AttributeError: 'LabelEncoder' object has no attribute 'classes_'

In [33]:
y2 = LabelEncoder().fit_transform(y)

In [34]:
size_mapping = {'XS':0, 'S':1, 'M':2, 'L':3, 'XL':4, 'XXL':5}
X['size2'] = X['size'].map(size_mapping)

In [35]:
X

Unnamed: 0,color,size,price,size2
0,green,L,10.1,3
1,green,M,13.5,2
2,red,M,15.3,2
3,blue,S,12.7,1


In [36]:
X = X.drop('size', axis=1)

In [37]:
X

Unnamed: 0,color,price,size2
0,green,10.1,3
1,green,13.5,2
2,red,15.3,2
3,blue,12.7,1


In [38]:
X = pd.get_dummies(X)

In [39]:
X 

Unnamed: 0,price,size2,color_blue,color_green,color_red
0,10.1,3,0,1,0
1,13.5,2,0,1,0
2,15.3,2,0,0,1
3,12.7,1,1,0,0


In [43]:
path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
wine = pd.read_csv(path, header=None)

In [44]:
wine.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [45]:
wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       178 non-null    int64  
 1   1       178 non-null    float64
 2   2       178 non-null    float64
 3   3       178 non-null    float64
 4   4       178 non-null    float64
 5   5       178 non-null    int64  
 6   6       178 non-null    float64
 7   7       178 non-null    float64
 8   8       178 non-null    float64
 9   9       178 non-null    float64
 10  10      178 non-null    float64
 11  11      178 non-null    float64
 12  12      178 non-null    float64
 13  13      178 non-null    int64  
dtypes: float64(11), int64(3)
memory usage: 19.6 KB


데이터프레임의 인덱서  
df.loc() => 인덱스와 칼럼 이름을 사용  
df.iloc() => 행번호와 열번호(넘파이 배열처럼)

In [48]:
X = wine.iloc[:, 1:].values

In [50]:
X

array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
        1.065e+03],
       [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
        1.050e+03],
       [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
        1.185e+03],
       ...,
       [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
        8.350e+02],
       [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
        8.400e+02],
       [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
        5.600e+02]])

In [51]:
y = wine.iloc[:,0].values

In [52]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

In [53]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

In [55]:
pd.DataFrame(X_test_std).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
count,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0
mean,-0.131835,-0.049174,-0.226279,-0.303835,0.153107,-0.159365,-0.112663,-0.168936,-0.279228,-0.044005,0.048611,-0.036483,-0.080326
std,0.945881,0.841129,1.059069,0.979626,1.04497,1.049986,0.925997,1.078199,0.864142,0.938784,1.112748,1.079511,0.885251
min,-2.433355,-1.243398,-3.823226,-2.765706,-1.286924,-1.992071,-1.563706,-1.634445,-2.085853,-1.614162,-1.876049,-1.918405,-1.351669
25%,-0.815004,-0.642564,-0.689799,-1.112585,-0.717763,-1.016234,-0.928557,-0.953648,-0.844563,-0.777336,-0.788992,-1.139126,-0.767758
50%,-0.156125,-0.400947,-0.260818,-0.316081,-0.077456,-0.178641,-0.004034,-0.602934,-0.30653,-0.207447,0.139535,0.195751,-0.260678
75%,0.566516,0.569796,0.457259,0.472909,0.829644,0.508511,0.662995,0.511097,0.269632,0.491674,0.784974,0.946168,0.396222
max,1.647442,1.929691,1.995996,2.163601,4.475833,2.480514,1.653729,2.409076,2.77763,2.368706,3.42335,1.876974,2.228625


In [56]:
from sklearn.decomposition import PCA

In [57]:
pca = PCA(n_components=2)

In [58]:
pca.fit(X_train_std)

PCA(n_components=2)

In [59]:
X_train_pca = pca.transform(X_train_std)

In [61]:
X_train_pca.shape

(124, 2)

In [62]:
pca.components_

array([[-0.13724218,  0.24724326, -0.02545159,  0.20694508, -0.15436582,
        -0.39376952, -0.41735106,  0.30572896, -0.30668347,  0.07554066,
        -0.32613263, -0.36861022, -0.29669651],
       [ 0.50303478,  0.16487119,  0.24456476, -0.11352904,  0.28974518,
         0.05080104, -0.02287338,  0.09048885,  0.00835233,  0.54977581,
        -0.20716433, -0.24902536,  0.38022942]])

In [63]:
pca.explained_variance_ratio_

array([0.36951469, 0.18434927])