# Regression 종합실습 : Car seat sales
유아용 카시트 판매량을 예측해 봅시다.

* 카시트에 대해서 지역 매장 별 판매량(Sales)을 예측하고자 합니다.

![](https://cdn.images.express.co.uk/img/dynamic/24/590x/child-car-seat-986556.jpg?r=1532946857754)

## 1.환경준비

### (1) Import

In [1]:
# Load the libraries used throughout the notebook.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import warnings    # used to suppress warning messages
# NOTE: no category is given, so this silences every warning, including
# deprecation warnings raised by the libraries above.
warnings.filterwarnings(action='ignore')

### (2) Data Loading

In [85]:
# Read the Carseats CSV directly from its remote URL into a DataFrame.
data_path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/Carseats.csv'
data = pd.read_csv(data_path)

**변수설명**
> * Sales - 각 지역 판매량(단위 : 1000개) <== Target
> * CompPrice - 각 지역 경쟁사 가격
> * Income - 각 지역 평균 소득수준(단위 : 1000달러)
> * Advertising - 각 지역, 회사의 광고 예산(단위 : 1000달러)
> * Population - 지역 인구수(단위 : 1000명)
> * Price - 자사 지역별 판매가격
> * ShelveLoc - 진열상태
> * Age - 지역 인구의 평균 연령
> * Education - 각 지역 교육수준 레벨
> * Urban - 매장 도시 지역 여부
> * US - 매장이 미국에 있는지 여부

## 2.데이터 이해

* 둘러보기

In [86]:
# Preview the first rows of the raw data.
data.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


## 3.데이터 준비

### (1) 데이터 정리

### (2) 데이터분할1 : x, y 나누기

In [87]:
# Replace the two price columns with one feature: competitor price minus our
# own price. The new column is appended at the end of the frame.
data = (
    data
    .assign(diff_price=data['CompPrice'] - data['Price'])
    .drop(columns=['CompPrice', 'Price'])
)
data.head()

Unnamed: 0,Sales,Income,Advertising,Population,ShelveLoc,Age,Education,Urban,US,diff_price
0,9.5,73,11,276,Bad,42,17,Yes,Yes,18
1,11.22,48,16,260,Good,65,10,Yes,Yes,28
2,10.06,35,10,269,Medium,59,12,Yes,Yes,33
3,7.4,100,4,466,Medium,55,14,Yes,Yes,20
4,4.15,64,3,340,Bad,38,13,Yes,No,13


In [88]:
# Separate the target column (y) from the remaining feature columns (x).
target = 'Sales'
y = data[target]
x = data.drop(columns=[target])

### (3) NA 조치

### (4) 가변수화

In [89]:
# Show each column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Sales        400 non-null    float64
 1   Income       400 non-null    int64  
 2   Advertising  400 non-null    int64  
 3   Population   400 non-null    int64  
 4   ShelveLoc    400 non-null    object 
 5   Age          400 non-null    int64  
 6   Education    400 non-null    int64  
 7   Urban        400 non-null    object 
 8   US           400 non-null    object 
 9   diff_price   400 non-null    int64  
dtypes: float64(1), int64(6), object(3)
memory usage: 31.4+ KB


In [90]:
# One-hot encode the listed columns. drop_first=True keeps k-1 dummy columns
# for a column with k levels by dropping the first level.
# 'Education' is stored as an integer but is listed here, so each distinct
# value gets its own dummy column.
columns = ['ShelveLoc', 'Education', 'Urban', 'US']
x = pd.get_dummies(x, columns = columns, drop_first = True)

In [91]:
# Preview the encoded feature matrix.
x.head()

Unnamed: 0,Income,Advertising,Population,Age,diff_price,ShelveLoc_Good,ShelveLoc_Medium,Education_11,Education_12,Education_13,Education_14,Education_15,Education_16,Education_17,Education_18,Urban_Yes,US_Yes
0,73,11,276,42,18,0,0,0,0,0,0,0,0,1,0,1,1
1,48,16,260,65,28,1,0,0,0,0,0,0,0,0,0,1,1
2,35,10,269,59,33,0,1,0,1,0,0,0,0,0,0,1,1
3,100,4,466,55,20,0,1,0,0,0,1,0,0,0,0,1,1
4,64,3,340,38,13,0,0,0,0,1,0,0,0,0,0,1,0


### (5) 데이터분할2 : train : validation 나누기

In [92]:
# Hold out 30% of the rows for validation; the fixed seed makes the split
# reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=2022,
)

### (6) Scaling
KNN 알고리즘을 적용하기 위해서는 스케일링을 해야 합니다.

In [93]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [94]:
# Min-max scaling: learn the per-column ranges from the training split only,
# then apply that same fitted transformation to both splits.
scaler = MinMaxScaler().fit(x_train)
x_train_s1 = scaler.transform(x_train)
x_val_s1 = scaler.transform(x_val)  # transform only; never re-fit on validation data

In [95]:
# Standardization: fit on the training split only, then apply the fitted
# transformation to both splits. `scaler` is rebound to this new scaler.
scaler = StandardScaler().fit(x_train)
x_train_s2 = scaler.transform(x_train)
x_val_s2 = scaler.transform(x_val)

## 4.모델링 : 선형회귀

* 변수를 조절하며 최소 2개 이상의 모델을 생성하고 예측하고 평가해 봅시다.

In [96]:
from sklearn.linear_model import LinearRegression

* 모델1

In [97]:
# Model 1: linear regression with default settings.
model1 = LinearRegression()

In [98]:
# Fit on the unscaled training features (all columns of x_train).
model1.fit(x_train, y_train)

LinearRegression()

In [99]:
# Predict the target for the validation features.
pred1 = model1.predict(x_val)

In [100]:
# Show the first 20 validation predictions.
pred1[:20]

array([ 6.38805101,  7.42207629, 10.4807706 ,  8.00499701,  5.09584454,
        5.82378264,  5.08058198,  4.90537678,  4.11254876,  5.25591875,
        5.76921809,  1.09667396,  4.89229384,  6.90946116,  8.68972637,
        7.88925304,  3.96566464,  7.00689244,  3.97664379,  6.31628612])

* 모델2

In [101]:
# Model 2 is restricted to a single feature column.
feature = ['diff_price']

In [102]:
# Keep only the selected column(s) in both splits; indexing with a list
# returns a DataFrame rather than a Series.
x_train2 = x_train[feature]
x_val2 = x_val[feature]

In [103]:
# Model 2: linear regression with default settings.
model2 = LinearRegression()

In [104]:
# Fit on the reduced (single-feature) training frame.
model2.fit(x_train2, y_train)

LinearRegression()

In [105]:
# Predict the target for the reduced validation frame.
pred2 = model2.predict(x_val2)

In [106]:
# Show the first 20 validation predictions.
pred2[:20]

array([ 7.50850627,  6.41497658,  7.4243886 ,  7.59262394,  6.83556492,
        7.59262394,  7.1720356 ,  5.91027058,  7.1720356 ,  7.34027093,
        7.08791793,  3.63909354,  6.75144726,  3.63909354, 10.45262465,
        8.18144761,  7.08791793,  6.49909425,  5.99438824,  7.84497694])

## 5.모델링 : KNN

* 하이퍼파라미터를 조절하며 모델을 최소 3가지 이상 생성하시오.

In [107]:
from sklearn.neighbors import KNeighborsRegressor

* 모델3

In [108]:
# Model 3: KNN regressor using 20 neighbours; other settings are defaults.
model3 = KNeighborsRegressor(n_neighbors=20)

In [109]:
# Fit on the min-max-scaled training features.
model3.fit(x_train_s1, y_train)

KNeighborsRegressor(n_neighbors=20)

In [110]:
# Predict on the min-max-scaled validation features.
pred3 = model3.predict(x_val_s1)

In [111]:
# Show the first 20 validation predictions.
pred3[:20]

array([ 7.4135,  7.919 , 10.215 ,  7.225 ,  6.339 ,  7.1525,  6.7665,
        8.094 ,  6.903 ,  6.5255,  7.0655,  6.5555,  6.7025,  9.5555,
        7.4385,  7.8185,  5.8065,  7.2105,  6.997 ,  7.2955])

* 모델4

In [112]:
# Model 4: KNN regressor with Euclidean distance; n_neighbors is left at its
# default.
# NOTE(review): Euclidean looks equivalent to scikit-learn's default metric
# (minkowski with p=2), so this may be the same as the default model — confirm.
model4 = KNeighborsRegressor(metric='euclidean')

In [113]:
# Fit on the min-max-scaled training features.
model4.fit(x_train_s1, y_train)

KNeighborsRegressor(metric='euclidean')

In [114]:
# Predict on the min-max-scaled validation features.
pred4 = model4.predict(x_val_s1)

In [115]:
# Show the first 20 validation predictions.
pred4[:20]

array([ 6.906,  8.618, 11.212,  8.862,  5.896,  8.798,  5.216, 10.136,
        6.296,  6.2  ,  7.442,  6.41 ,  4.836, 10.912,  7.38 ,  7.08 ,
        4.984,  7.194,  5.22 ,  6.046])

* 모델5

In [116]:
# Model 5: KNN regressor with Manhattan distance; n_neighbors is left at its
# default.
model5 = KNeighborsRegressor(metric='manhattan')

In [117]:
# Fit on the min-max-scaled training features.
model5.fit(x_train_s1, y_train)

KNeighborsRegressor(metric='manhattan')

In [118]:
# Predict on the min-max-scaled validation features.
pred5 = model5.predict(x_val_s1)

In [119]:
# Show the first 20 validation predictions.
pred5[:20]

array([ 6.906,  8.618, 11.212,  8.862,  5.896,  6.24 ,  5.216, 10.136,
        6.36 ,  7.114,  7.442,  5.648,  4.836,  9.346,  7.38 ,  7.922,
        4.784,  7.194,  5.22 ,  6.046])

## 6.성능비교

In [120]:
# 회귀모델 평가용
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

In [121]:
# Compare every model on the validation set using R^2, RMSE, MAE and MAPE.
# scikit-learn returns MAPE as a fraction (0.21 means 21%), not a percentage.
pred = [pred1, pred2, pred3, pred4, pred5]

# Model numbers are 1-based and follow the order of `pred`.
model_no = list(range(1, len(pred) + 1))
r2score = [r2_score(y_val, p) for p in pred]
# RMSE is computed as sqrt(MSE) instead of mean_squared_error(..., squared=False):
# the `squared` keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
# where passing it raises a TypeError. The square root works on every version.
rmse = [np.sqrt(mean_squared_error(y_val, p)) for p in pred]
mae = [mean_absolute_error(y_val, p) for p in pred]
mape = [mean_absolute_percentage_error(y_val, p) for p in pred]

# One row per model, one column per metric.
result = pd.DataFrame({'model_no':model_no, 'r^2 score':r2score, 'rmse':rmse, 'mae':mae, 'mape':mape})
result

Unnamed: 0,model_no,r^2 score,rmse,mae,mape
0,1,0.874949,1.043009,0.830455,0.216538
1,2,0.38133,2.319929,1.966878,0.536356
2,3,0.352497,2.373373,1.957292,0.751224
3,4,0.324468,2.424197,1.994967,0.741541
4,5,0.366133,2.348249,1.935,0.703792
