# 머신러닝 복습

# 1.환경준비

* 라이브러리 Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# 2.Regression : Carseat

## (1) 데이터 전처리
* 데이터 준비
* 가변수화
* 스케일링(필요하다면)
* 데이터 분할

### 1) 데이터 준비

* 카시트 판매량 데이터

|	변수명	|	설명	|	구분	|
|	----	|	----	|	----	|
|	**Sales** 	|	 **각 지역 판매액(단위 : 1000달러)**	|	**Target**	|
|	CompPrice 	|	지역별 경쟁사 판매가격(달러)	|	feature	|
|	Income 	|	가구당 평균 소득액(1000달러)	|	feature	|
|	Advertising 	|	 각 지역, 회사의 광고 예산(1000달러)	|	feature	|
|	Population 	|	 지역 인구수(단위 : 1000명)	|	feature	|
|	Price 	|	 자사 지역별 판매가격(달러)	|	feature	|
|	ShelveLoc 	|	 진열상태(범주 : Bad, Medium, Good)	|	feature	|
|	Age 	|	 지역 인구의 평균 연령	|	feature	|
|	Education 	|	 교육수준(범주 : 10~18)	|	feature	|
|	Urban 	|	 매장이 도심에 있는지 여부(범주 : Yes, No)	|	feature	|
|	US 	|	 매장이 미국에 있는지 여부(범주 : Yes, No)	|	feature	|


* 데이터 경로 : https://raw.githubusercontent.com/DA4BAM/dataset/master/Carseats.csv

In [2]:
# Download the car seat sales CSV into a DataFrame and preview the first rows.
path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/Carseats.csv'
carseat = pd.read_csv(path)
carseat.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


* 데이터를 x, y로 나누기

In [3]:
# Separate the prediction target (Sales) from the feature columns.
target = 'Sales'
y = carseat[target]
x = carseat.drop(columns=[target])

### 2) 가변수화

In [4]:
# One-hot encode the categorical features. drop_first=True omits the first
# level of each column so the dummy columns are not redundant.
cat_cols = ['ShelveLoc', 'Education', 'US', 'Urban']
x = pd.get_dummies(
    x,
    columns=cat_cols,
    drop_first=True,
)

In [5]:
# Preview the encoded feature matrix to check the generated dummy columns.
x.head()

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,Age,ShelveLoc_Good,ShelveLoc_Medium,Education_11,Education_12,Education_13,Education_14,Education_15,Education_16,Education_17,Education_18,US_Yes,Urban_Yes
0,138,73,11,276,120,42,0,0,0,0,0,0,0,0,1,0,1,1
1,111,48,16,260,83,65,1,0,0,0,0,0,0,0,0,0,1,1
2,113,35,10,269,80,59,0,1,0,1,0,0,0,0,0,0,1,1
3,117,100,4,466,97,55,0,1,0,0,0,1,0,0,0,0,1,1
4,141,64,3,340,128,38,0,0,0,0,1,0,0,0,0,0,0,1


### 3) 데이터분할
* train : val 로 분할

In [6]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split stable.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=20,
)

### 4) Scaling

## (2) 모델링
* 필요한 함수들 불러오기
* 모델 선언
* 학습
* 예측
* 성능 검증

In [7]:
from sklearn.ensemble import RandomForestRegressor

### 1) 모델 선언

In [8]:
# Seed the forest so repeated runs give the same predictions and metrics;
# the value matches the random_state used for the train/validation split.
model = RandomForestRegressor(random_state=20)

### 2) 학습

In [9]:
# Train the random forest on the training split (features are left unscaled).
model.fit(x_train, y_train)

### 3) 예측

In [10]:
# Predict Sales for the held-out validation rows.
pred = model.predict(x_val)

In [11]:
# Display the predicted values for the validation rows.
pred

array([ 7.8826,  7.0198,  6.83  ,  4.545 ,  5.6473,  6.9882,  6.1979,
       10.0474,  5.6955,  7.5597,  5.8406, 11.2538,  8.2964,  5.2162,
        6.5001,  7.0082,  5.1573, 11.2556,  9.0454,  4.0754,  6.2826,
        8.1546,  5.7477,  6.1501,  5.3503, 10.0858,  7.1679, 10.5415,
        9.4904,  9.626 ,  3.9551,  5.2539,  8.085 ,  7.9464,  7.7733,
        9.2803,  5.2888,  7.2603, 12.1909,  7.4505,  4.7675,  8.5834,
        5.9322,  7.1398,  9.0334,  7.6178,  6.9843,  7.7578,  6.0482,
        5.713 ,  9.1352,  9.0858,  7.6596,  8.7315,  9.0484,  8.7204,
        6.9803,  5.469 , 12.4627,  7.4259,  7.3222,  5.0969,  4.2023,
        6.2691,  5.9619,  9.9153,  8.7346,  6.2921,  7.6805,  5.7502,
        8.978 ,  9.4703,  8.73  ,  5.8477,  5.4343,  4.917 ,  6.7743,
        4.5603,  6.3167,  6.3334])

### 4) 검증
만든 모델은 얼마나 정확한지 검증해 봅시다.



In [12]:
# Report validation error for the regression model.
# RMSE is computed as sqrt(MSE) rather than via `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, while this form
# works on every release.
print(f'RMSE  : {np.sqrt(mean_squared_error(y_val, pred))}')
print(f'MAE   : {mean_absolute_error(y_val, pred)}')
# MAPE is left disabled, as in the original cell.
# print(f'MAPE  : {mean_absolute_percentage_error(y_val, pred)}')

RMSE  : 1.7852867782586077
MAE   : 1.4074287499999998


# 3.Regression : Advertising

## (1) 데이터 전처리

### 1) 데이터 준비

In [13]:
# Download the advertising CSV into a DataFrame and preview the first rows.
path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/advertising.csv'
adv = pd.read_csv(path)
adv.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [15]:
# Separate the prediction target (Sales) from the feature columns.
target = 'Sales'
y = adv[target]
x = adv.drop(columns=[target])

### 2) 가변수화

### 3) 데이터분할

In [16]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split stable.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=20,
)

### 4) Scaling

In [17]:
# Min-max scale the features. The scaler is fitted on the training split only
# and then reused on the validation split, so validation data does not
# influence the scaling parameters.
scaler = MinMaxScaler()
x_train_s = scaler.fit_transform(x_train)
x_val_s = scaler.transform(x_val)

## (2) 모델링
* 필요한 함수들 불러오기
* 모델 선언
* 학습
* 예측
* 성능 검증

In [18]:
from sklearn.neighbors import KNeighborsRegressor

### 1) 모델 선언

In [19]:
# k-nearest-neighbours regressor with the library's default settings.
model = KNeighborsRegressor()

### 2) 학습

In [20]:
# Train on the min-max scaled training features.
model.fit(x_train_s, y_train)

### 3) 예측

In [21]:
# Predict on the validation features scaled with the same fitted scaler.
pred = model.predict(x_val_s)

### 4) 검증
만든 모델은 얼마나 정확한지 검증해 봅시다.



In [22]:
# Validation RMSE, computed as sqrt(MSE). The `squared=False` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, so it is avoided here.
np.sqrt(mean_squared_error(y_val, pred))

1.2493598360760603

In [24]:
# Validation MAE: mean absolute difference between actual and predicted Sales.
mean_absolute_error(y_val, pred)

0.8039999999999999

# 4.Classification : mobile

## (1) 데이터 전처리

### 1) 데이터 준비

In [25]:
# Load the mobile churn data and encode the target as 0 (STAY) / 1 (LEAVE).
path = "https://raw.githubusercontent.com/DA4BAM/dataset/master/mobile_churn_simple.csv"
data = pd.read_csv(path)
churn_codes = {'STAY': 0, 'LEAVE': 1}
data['CHURN'] = data['CHURN'].map(churn_codes)
data.head()

Unnamed: 0,INCOME,OVERAGE,LEFTOVER,HOUSE,HANDSET_PRICE,OVER_15MINS_CALLS_PER_MONTH,AVERAGE_CALL_DURATION,CHURN
0,31953,0,6,313378,161,0,4,0
1,36147,0,13,800586,244,0,6,0
2,27273,230,0,305049,201,16,15,0
3,120070,38,33,788235,780,3,2,1
4,29215,208,85,224784,241,21,1,0


|	구분	|	변수 명	|	내용	|	type	|	비고	|
|	----	|	----	|	----	|	----	|	----	|
|	**Target**	|	**CHURN**	|	이탈여부	|	범주	| 0,1	|
|	feature	|	INCOME	|	소득수준(달러)	|	숫자	|		|
|	feature	|	OVERAGE	|	월평균 초과사용시간(분)	|	숫자	| |
|	feature	|	LEFTOVER	|	월평균 잔여시간(%)	|	숫자	| 	|
|	feature	|	HOUSE	|	집가격(달러)	|	숫자	|	|
|	feature	|	HANDSET_PRICE	|	휴대폰가격(달러)	|	숫자	|		|
|	feature	|	OVER_15MINS_CALLS_PER_MONTH	|	월평균 장기통화 횟수	|	숫자	| 		|
|	feature	|	AVERAGE_CALL_DURATION	|	평균통화시간(분)	|	숫자	|		|

In [26]:
# Separate the churn label from the feature columns.
target = 'CHURN'
y = data[target]
x = data.drop(columns=[target])

### 2) 가변수화

### 3) 데이터분할

In [27]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split stable.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=20,
)

### 4) Scaling

## (2) 모델링
* 필요한 함수들 불러오기
* 모델 선언
* 학습
* 예측
* 성능 검증

In [28]:
from xgboost import XGBClassifier

### 1) 모델 선언

In [29]:
# Gradient-boosted tree classifier with the library's default settings.
model = XGBClassifier()

### 2) 학습

In [30]:
# Train the classifier on the training split (features are left unscaled).
model.fit(x_train, y_train)

### 3) 예측

In [31]:
# Predict class labels (0 = STAY, 1 = LEAVE) for the validation rows.
pred = model.predict(x_val)

### 4) 검증
만든 모델은 얼마나 정확한지 검증해 봅시다.



In [32]:
# Show the confusion matrix, a separator rule, then the per-class report.
matrix = confusion_matrix(y_val, pred)
print(matrix)
print('-' * 50)
report = classification_report(y_val, pred)
print(report)

[[1377  661]
 [ 566 1396]]
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.71      0.68      0.69      2038
           1       0.68      0.71      0.69      1962

    accuracy                           0.69      4000
   macro avg       0.69      0.69      0.69      4000
weighted avg       0.69      0.69      0.69      4000



# 5.Classification : 대학원 지원

## (1) 데이터 전처리

### 1) 데이터 준비

In [33]:
# Download the graduate admission CSV into a DataFrame and preview the first rows.
path = "https://raw.githubusercontent.com/DA4BAM/dataset/master/Graduate_apply.csv"
data = pd.read_csv(path)
data.head()

Unnamed: 0,admit,gre,gpa,rank
0,0,380,3.61,3
1,1,660,3.67,3
2,1,800,4.0,1
3,1,640,3.19,4
4,0,520,2.93,4


In [34]:
# Separate the admission label from the feature columns.
target = 'admit'
y = data[target]
x = data.drop(columns=[target])

### 2) 가변수화

In [35]:
# Treat rank as categorical and one-hot encode it; drop_first=True omits the
# first level so the dummy columns are not redundant.
cat_cols = ['rank']
x = pd.get_dummies(
    x,
    columns=cat_cols,
    drop_first=True,
)

### 3) 데이터분할

In [36]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split stable.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=20,
)

### 4) Scaling

## (2) 모델링
* 필요한 함수들 불러오기
* 모델 선언
* 학습
* 예측
* 성능 검증

In [37]:
from sklearn.linear_model import LogisticRegression

### 1) 모델 선언

In [38]:
# Logistic regression classifier with the library's default settings.
model = LogisticRegression()

### 2) 학습

In [39]:
# Train on the training split.
# NOTE(review): the features are unscaled here (the Scaling step is empty); if
# the solver emits a ConvergenceWarning, consider scaling or raising max_iter.
model.fit(x_train, y_train)

### 3) 예측

In [40]:
# Predict admit labels for the validation rows and display them.
pred = model.predict(x_val)
pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1])

### 4) 검증
만든 모델은 얼마나 정확한지 검증해 봅시다.



In [42]:
# Show the confusion matrix followed by the per-class report.
matrix = confusion_matrix(y_val, pred)
print(matrix)
report = classification_report(y_val, pred)
print(report)

[[46  9]
 [19  6]]
              precision    recall  f1-score   support

           0       0.71      0.84      0.77        55
           1       0.40      0.24      0.30        25

    accuracy                           0.65        80
   macro avg       0.55      0.54      0.53        80
weighted avg       0.61      0.65      0.62        80



In [41]:
# Kept disabled: evaluating classification_report(y_val, pred) without print()
# shows the report as one raw string with literal \n escapes (see the captured
# output below); use print(...) to render it as a table.
# classification_report(y_val, pred)

'              precision    recall  f1-score   support\n\n           0       0.71      0.84      0.77        55\n           1       0.40      0.24      0.30        25\n\n    accuracy                           0.65        80\n   macro avg       0.55      0.54      0.53        80\nweighted avg       0.61      0.65      0.62        80\n'