# Scikit-learn 흐름

### 1. 모델 불러오기 및 정의
```python
from sklearn.svm import SVC
# Hyper-parameters are passed as comma-separated keyword arguments;
# random_state fixes the seed so repeated runs behave the same.
clf = SVC(C=1.0, kernel='rbf', random_state=0)
```
- hyper-parameter 세팅

### 2. fit
```python
clf.fit(x_train, y_train)
```
- 훈련 데이터로 모델 학습 또는 특징 추출

### 3. predict or predict_proba or transform
```python
y_pred = clf.predict(x_test)
```

### 4. scoring
```python
# The scoring function has to be imported before it can be called.
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)
```
- 정확도, AUC, R^2등 적절한 스코어 함수로 결과 확인

In [1]:
# 실습

In [6]:
import os
from os.path import join
import copy
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import sklearn
import matplotlib.pyplot as plt
# /c/Users/82106/Desktop/ML_data

# Directory holding the abalone data file and the file listing its column names.
data_dir = 'C:\\Users\\82106\\Desktop\\ML_data'
abalone_path = join(data_dir, 'abalone.txt')
column_path = join(data_dir, 'abalone_attributes.txt')
print(column_path)

# Read one column name per line, stripping surrounding whitespace/newlines.
# The context manager closes the file handle once the names are read.
with open(column_path) as column_file:
    abalone_columns = [line.strip() for line in column_file]

C:\Users\82106\Desktop\ML_data\abalone_attributes.txt


`전복 데이터`!

전복 데이터셋은 수컷, 암컷, 유아기 3개의 범주로 이루어진 범주형 변수와 길이, 직경, 높이, 무게 등 여러 수치형 변수로 이루어져 있음

데이터를 불러온 후 입력으로 사용할 변수들과 레이블로 사용할 성별 변수로 나누기!

In [7]:
# Load the abalone data; the file is read without a header row, so the column
# names are supplied from abalone_columns.
data = pd.read_csv(abalone_path, header=None, names=abalone_columns)
# Keep the 'Sex' column separately so it can serve as the label.
label = data['Sex']

In [8]:
# Preview the first 7 rows of the loaded frame.
data.head(7)

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
5,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8
6,F,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33,20


In [9]:
# (rows, columns) of the frame while it still contains the 'Sex' column.
data.shape

(4177, 9)

In [10]:
# 'Sex' is used as the label, so remove it from the input features.
# NOTE: `del` mutates `data` in place; re-running this cell raises KeyError
# because the column has already been removed.
del data['Sex']
data.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [11]:
# Shape after removing 'Sex': one column fewer than before.
data.shape

(4177, 8)

In [12]:
# Basic summary statistics for each variable.
data.describe()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
mean,0.523992,0.407881,0.139516,0.828742,0.359367,0.180594,0.238831,9.933684
std,0.120093,0.09924,0.041827,0.490389,0.221963,0.109614,0.139203,3.224169
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,0.45,0.35,0.115,0.4415,0.186,0.0935,0.13,8.0
50%,0.545,0.425,0.14,0.7995,0.336,0.171,0.234,9.0
75%,0.615,0.48,0.165,1.153,0.502,0.253,0.329,11.0
max,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0


## Scaling이란?

변수의 크기가 너무 작거나 큰 경우 모델이 학습했을 때 영향력이 제대로 표현되지 않을 수 있음!

Sklearn의 대표적인 스케일링 함수로는 특정 변수의 최대, 최소 값으로 조절하는 `Min-Max` 스케일링과 z-정규화를 이용한 `Standard` 스케일링이 있음

- Min-Max: 값이 0~1 사이로 변경
- Standard: 각 변수의 평균이 0, 표준편차가 1이 되도록 변환 (값의 범위가 -1~1로 제한되는 것은 아님)

In [13]:
# Min-Max scaling is equivalent to:
#   data = (data-np.min(data)) / (np.max(data) - np.min(data))
# MinMaxScaler computes the min and max separately for each feature (column).
from sklearn.preprocessing import MinMaxScaler

mMscaler = MinMaxScaler()

In [15]:
# Learn the characteristics of the data needed for scaling (the min and max values).
mMscaler.fit(data)
# Transformed values will be bounded to the range 0~1.

MinMaxScaler(copy=True, feature_range=(0, 1))

In [16]:
# Transform the data with the fitted scaler.
mMscaled_data = mMscaler.transform(data)

In [17]:
# Smallest value in the scaled array; 0.0 is expected after Min-Max scaling.
mMscaled_data.min()

0.0

In [18]:
# Largest value in the scaled array; 1.0 is expected after Min-Max scaling.
mMscaled_data.max()

1.0

In [19]:
# Do both of the steps above in a single call (fit -> transform).
mMscaled_data = mMscaler.fit_transform(data)

In [20]:
# Inspect the result.
# The scaler returns a NumPy array, so it must be converted to a pandas
# DataFrame to view it as a labelled table.
mMscaled_data[:5]

array([[0.51351351, 0.5210084 , 0.0840708 , 0.18133522, 0.15030262,
        0.1323239 , 0.14798206, 0.5       ],
       [0.37162162, 0.35294118, 0.07964602, 0.07915707, 0.06624075,
        0.06319947, 0.06826109, 0.21428571],
       [0.61486486, 0.61344538, 0.11946903, 0.23906499, 0.17182246,
        0.18564845, 0.2077728 , 0.28571429],
       [0.49324324, 0.5210084 , 0.11061947, 0.18204356, 0.14425017,
        0.14944042, 0.15296462, 0.32142857],
       [0.34459459, 0.33613445, 0.07079646, 0.07189658, 0.0595158 ,
        0.05134957, 0.0533134 , 0.21428571]])

In [21]:
# Wrap the scaled array in a DataFrame, reusing the original column names.
mMscaled_data = pd.DataFrame(mMscaled_data, columns=data.columns)
mMscaled_data.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,0.513514,0.521008,0.084071,0.181335,0.150303,0.132324,0.147982,0.5
1,0.371622,0.352941,0.079646,0.079157,0.066241,0.063199,0.068261,0.214286
2,0.614865,0.613445,0.119469,0.239065,0.171822,0.185648,0.207773,0.285714
3,0.493243,0.521008,0.110619,0.182044,0.14425,0.14944,0.152965,0.321429
4,0.344595,0.336134,0.070796,0.071897,0.059516,0.05135,0.053313,0.214286


### Standard Scaling

In [22]:
# Standard scaling (z-score normalisation), created with default settings.
from sklearn.preprocessing import StandardScaler
sdscaler = StandardScaler()

In [23]:
# Learn the statistics needed for standard scaling from the data.
sdscaler.fit(data)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [24]:
# Apply the fitted standard scaling to the data.
sdscaled_data = sdscaler.transform(data)

In [25]:
# Original (unscaled) data, shown for comparison.
data.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [26]:
# Convert the scaled array into a DataFrame with the original column names.
sdscaled_data = pd.DataFrame(sdscaled_data, columns=data.columns)

In [27]:
# Preview the standardised values next to the original data shown above.
sdscaled_data.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,-0.574558,-0.432149,-1.064424,-0.641898,-0.607685,-0.726212,-0.638217,1.571544
1,-1.448986,-1.439929,-1.183978,-1.230277,-1.17091,-1.205221,-1.212987,-0.910013
2,0.050033,0.12213,-0.107991,-0.309469,-0.4635,-0.35669,-0.207139,-0.289624
3,-0.699476,-0.432149,-0.347099,-0.637819,-0.648238,-0.6076,-0.602294,0.020571
4,-1.615544,-1.540707,-1.423087,-1.272086,-1.215968,-1.287337,-1.320757,-0.910013


In [28]:
# Advice noted here: try Standard scaling first, and switch to Min-Max scaling if the performance is poor.

## Sampling
예를 들어 타겟 클래스가 한쪽으로 너무 치우쳐 있으면(특정 클래스의 비율이 지나치게 높으면) 머신러닝 분류 알고리즘이 잘 동작하지 않음.

그래서 샘플링(sampling)을 통해 많은 쪽 클래스의 비율을 줄이거나, 적은 쪽 클래스의 비율을 늘리는 식으로 균형을 맞춘다고 함.

이미지같은 경우는 살짝씩 돌리면 다른 이미지로 받아들여서 여러가지 방법으로 샘플 데이터를 쉽게, 많이 만들어낼 수 있음.

근데 `정형 데이터`는 상대적으로 샘플링이 어려움.

우리는 SMOTE(Synthetic Minority Over-sampling Technique)라는 걸 써볼 건데, 샘플링하는 알고리즘이고 잘 동작한다고 함.

#### 샘플링은 크게 두 가지로 나뉨
- 적은 클래스의 데이터 수를 증가 시키는 Oversampling
- 많은 클래스의 데이터 수를 감소 시키는 Undersampling

위 두 가지 방법이 있긴 한데

가장 쉬운건 랜덤하게 아무거나 뽑아서 복제하거나, 제거하는 것

#### => Random Over, Under Sampling
- Random Over Sampling
  문제점: 기존 데이터를 똑같이 복제하기 때문에 데이터의 다양성(분산)이 늘어나지 않음

ex) 시각화했을 때 한 점인 줄 알았는데, 복제돼서 같은 위치에 10개가 찍혀 있는 경우

- Random Under Sampling
문제점: 데이터를 랜덤하게 아예 제거하기 때문에 당연히 데이터 손실 가능성 있음

`결론` 데이터가 정말 많으면 Random 샘플링도 써볼 만하지만, 일반적으로는 `SMOTE`를 쓰는 것을 권장한다고 함.

그래도 일단 써보자.

In [29]:
# NOTE(review): the ImportError recorded for this cell (cannot import
# '_to_object_array' from 'sklearn.utils') together with the scikit-learn
# version printed at the end of the notebook (0.21.1) suggests the installed
# imbalanced-learn and scikit-learn versions are incompatible -- presumably
# one of them must be upgraded/downgraded before this cell can run; verify.
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Seed both samplers so the randomly duplicated/removed rows are the same on
# every run (consistent with the random_state=0 used for the SVC example).
ros = RandomOverSampler(random_state=0)
rus = RandomUnderSampler(random_state=0)

ImportError: cannot import name '_to_object_array' from 'sklearn.utils' (C:\Users\82106\Anaconda3\lib\site-packages\sklearn\utils\__init__.py)

In [None]:
# Learn the class ratio from the data and resample it in a single call.
# Random over-sampling
oversampled_data, oversampled_label = ros.fit_resample(data, label)
oversampled_data = pd.DataFrame(oversampled_data, columns=data.columns)

# Random under-sampling
undersampled_data, undersampled_label = rus.fit_resample(data, label)
# Wrap the under-sampled features themselves; wrapping oversampled_data here
# would leave undersampled_data out of step with undersampled_label.
undersampled_data = pd.DataFrame(undersampled_data, columns=data.columns)

In [None]:
# Compare per-class counts before and after resampling: one-hot encode the
# labels and sum each indicator column.
print('원본 데이터의 클래스 비율 \n{}'.format(pd.get_dummies(label).sum()))
print('\nRandom Over 샘플링 결과 \n{}'.format(pd.get_dummies(oversampled_label).sum()))
print('\nRandom Under 샘플링 결과 \n{}'.format(pd.get_dummies(undersampled_label).sum()))

In [None]:
######### 내일 하자@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

In [30]:
# Check the installed scikit-learn version (useful when diagnosing the
# ImportError recorded for the imblearn import cell earlier in this notebook).
import sklearn
sklearn.__version__

'0.21.1'