# Regression 종합실습 : Car seat sales
유아용 카시트 매출액을 예측해 봅시다.

* 카시트에 대해서 지역 매장 별 매출액을 예측하고자 합니다.

![](https://cdn.images.express.co.uk/img/dynamic/24/590x/child-car-seat-986556.jpg?r=1532946857754)

## 1.환경준비

### (1) Import

In [1]:
#라이브러리들을 불러오자.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import warnings    # 경고메시지 제외
warnings.filterwarnings(action='ignore')

### (2) Data Loading

In [2]:
data_path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/Carseats.csv'
data = pd.read_csv(data_path)

**변수설명**
> * Sales - 각 지역 판매량(단위 : 1000개) <== Target
* CompPrice - 각 지역 경쟁사 가격
* Income - 각 지역 평균 소득수준(단위 : 1000달러)
* Advertising - 각 지역, 회사의 광고 예산(단위 : 1000달러)
* Population - 지역 인구수(단위 : 1000명)
* Price - 자사 지역별 판매가격
* ShelveLoc - 진열상태
* Age - 지역 인구의 평균 연령
* Education - 각 지역 교육수준 레벨
* Urban - 매장 도시 지역 여부
* US - 매장이 미국에 있는지 여부

## 2.데이터 이해

* 둘러보기

In [3]:
data.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


## 3.데이터 준비

### (1) 데이터 정리

### (2) 데이터분할1 : x, y 나누기

In [5]:
target = 'Sales'
x = data.drop([target], axis=1)

In [12]:
y = data.loc[:, [target]]
y

Unnamed: 0,Sales
0,9.50
1,11.22
2,10.06
3,7.40
4,4.15
...,...
395,12.57
396,6.14
397,7.41
398,5.94


### (3) NA 조치

In [14]:
x.isna().sum()

CompPrice      0
Income         0
Advertising    0
Population     0
Price          0
ShelveLoc      0
Age            0
Education      0
Urban          0
US             0
dtype: int64

In [15]:
y.isna().sum()

Sales    0
dtype: int64

### (4) 가변수화

In [17]:
cols = ['ShelveLoc', 'Urban', 'US']

In [25]:
x = pd.get_dummies(data=x, columns=cols, drop_first=True)

In [26]:
x

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,Age,Education,ShelveLoc_Good,ShelveLoc_Medium,Urban_Yes,US_Yes
0,138,73,11,276,120,42,17,0,0,1,1
1,111,48,16,260,83,65,10,1,0,1,1
2,113,35,10,269,80,59,12,0,1,1,1
3,117,100,4,466,97,55,14,0,1,1,1
4,141,64,3,340,128,38,13,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
395,138,108,17,203,128,33,14,1,0,1,1
396,139,23,3,37,120,55,11,0,1,0,1
397,162,26,12,368,159,40,18,0,1,1,1
398,100,79,7,284,95,50,12,0,0,1,1


### (5) 데이터분할2 : train : validation 나누기

In [27]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.3)

### (6) Scaling
KNN 알고리즘을 적용하기 위해서는 스케일링을 해야 합니다.

In [30]:
# minmax scale
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

x_train_minmax = scaler.fit_transform(x_train)
x_val_minmax = scaler.transform(x_val)

In [31]:
# standard scale
from sklearn.preprocessing import StandardScaler

scaler2 = StandardScaler()
x_train_standard = scaler2.fit_transform(x_train)
x_val_standard = scaler2.transform(x_val)

In [32]:
x_train_minmax = pd.DataFrame(x_train_minmax, columns = list(x))
x_train_standard = pd.DataFrame(x_train_standard, columns=list(x))

In [33]:
x_train_minmax.describe()

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,Age,Education,ShelveLoc_Good,ShelveLoc_Medium,Urban_Yes,US_Yes
count,280.0,280.0,280.0,280.0,280.0,280.0,280.0,280.0,280.0,280.0,280.0
mean,0.487318,0.486941,0.232143,0.500995,0.550192,0.515844,0.490625,0.217857,0.532143,0.714286,0.639286
std,0.162398,0.282667,0.235118,0.297077,0.145616,0.29391,0.328717,0.413529,0.499859,0.452563,0.481068
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.387755,0.232323,0.0,0.245992,0.449102,0.268182,0.25,0.0,0.0,0.0,0.0
50%,0.479592,0.484848,0.172414,0.517034,0.562874,0.545455,0.5,0.0,1.0,1.0,1.0
75%,0.591837,0.719697,0.413793,0.753006,0.646707,0.745455,0.75,0.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [34]:
x_train_standard.describe()

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,Age,Education,ShelveLoc_Good,ShelveLoc_Medium,Urban_Yes,US_Yes
count,280.0,280.0,280.0,280.0,280.0,280.0,280.0,280.0,280.0,280.0,280.0
mean,2.521792e-16,1.007131e-16,6.423433000000001e-17,0.0,1.9191e-16,-7.375053e-17,-2.724011e-16,-5.709718000000001e-17,1.316407e-16,1.165734e-16,1.94289e-16
std,1.001791,1.001791,1.001791,1.001791,1.001791,1.001791,1.001791,1.001791,1.001791,1.001791,1.001791
min,-3.006128,-1.725753,-0.9891157,-1.689434,-3.785145,-1.758249,-1.495219,-0.5277678,-1.066492,-1.581139,-1.331269
25%,-0.6141746,-0.9023829,-0.9891157,-0.85991,-0.6954709,-0.8441544,-0.7333238,-0.5277678,-1.066492,-1.581139,-1.331269
50%,-0.04765923,-0.007415444,-0.254494,0.054087,0.0872465,0.1009267,0.02857106,-0.5277678,0.9376538,0.6324555,0.751163
75%,0.6447485,0.8249043,0.7739765,0.849821,0.6639856,0.7826246,0.7904659,-0.5277678,0.9376538,0.6324555,0.751163
max,3.162595,1.818318,3.27169,1.682724,3.094529,1.65024,1.552361,1.894773,0.9376538,0.6324555,0.751163


## 4.모델링 : 선형회귀

* 변수를 조절하며 최소 2개 이상의 모델을 생성하고 예측하고 평가해 봅시다.

In [35]:
from sklearn import linear_model

* 모델1

In [36]:
features = ['Price', 'CompPrice', 'Income']
x_lin = x[features]

In [39]:
x_train_lin, x_val_lin, y_train_lin, y_val_lin = train_test_split(x_lin, y, test_size=0.3)

In [40]:
model1 = linear_model.LinearRegression()
model1.fit(x_train_lin, y_train_lin)

LinearRegression()

In [41]:
pred1 = model1.predict(x_val_lin)

In [42]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error

* 모델2

In [43]:
features = ['Population', 'Age' ,'Education']

In [44]:
x_lin2 = x[features]

In [45]:
x_train_lin2, x_val_lin2, y_train_lin2, y_val_lin2 = train_test_split(x_lin2, y, test_size=0.3)

In [46]:
model2 = linear_model.LinearRegression()
model2.fit(x_train_lin2, y_train_lin2)

LinearRegression()

In [47]:
pred2 = model2.predict(x_val_lin2)

In [49]:
print('[[Price, CompPrice, Income]]')
print(mean_squared_error(y_val_lin, pred1, squared=False))
print(mean_absolute_error(y_val_lin, pred1))
print(mean_absolute_percentage_error(y_val_lin, pred1))
print('[[Population, Age, Education]]')
print(mean_squared_error(y_val_lin2, pred2, squared=False))
print(mean_absolute_error(y_val_lin2, pred2))
print(mean_absolute_percentage_error(y_val_lin2, pred2))



[[Price, CompPrice, Income]]
2.2366156419132284
1.8180469689601282
0.3084501677905453
[[Population, Age, Education]]
2.818696426010272
2.2724084358410344
0.5577407624390166


## 5.모델링 : KNN

* 하이퍼파라미터를 조절하며 모델을 최소 3가지 이상 생성하시오.

* 모델3

In [53]:
from sklearn.neighbors import KNeighborsRegressor    
from sklearn.metrics import *

In [51]:
#K값이 5
model3 = KNeighborsRegressor()

In [54]:
model3.fit(x_train_minmax, y_train)

KNeighborsRegressor()

In [55]:
pred3 = model3.predict(x_val_minmax)

In [70]:
model8 = KNeighborsRegressor()
model8.fit(x_train_standard, y_train)
pred8 = model8.predict(x_val_standard)

* 모델4

In [56]:
model4 = KNeighborsRegressor(n_neighbors=10)
model4.fit(x_train_minmax, y_train)
pred4 = model4.predict(x_val_minmax)

In [66]:
model6 = KNeighborsRegressor(n_neighbors=10)
model6.fit(x_train_standard, y_train)
pred6 = model6.predict(x_val_standard)

* 모델5

In [58]:
model5 =KNeighborsRegressor(n_neighbors=20)
model5.fit(x_train_minmax, y_train)
pred5 = model5.predict(x_val_minmax)

In [68]:
model7 = KNeighborsRegressor(n_neighbors=20)
model7.fit(x_train_standard, y_train)
pred7 = model7.predict(x_val_standard)

## 6.성능비교

In [71]:
print('linear regression - 1: Price, CompPrice, Income')
print('mse:', mean_squared_error(y_val_lin, pred1, squared=False))
print('mae:',mean_absolute_error(y_val_lin, pred1))
print('mape:',mean_absolute_percentage_error(y_val_lin, pred1))

print('linear regression - 2: ShelveLoc, Urban, US')
print('mse:', mean_squared_error(y_val_lin2, pred2, squared=False))
print('mae:',mean_absolute_error(y_val_lin2, pred2))
print('mape:',mean_absolute_percentage_error(y_val_lin2, pred2))

print('KNN - 1: K = 5')
print('mse:', mean_squared_error(y_val, pred3, squared=False))
print('mae:',mean_absolute_error(y_val, pred3))
print('mape:',mean_absolute_percentage_error(y_val, pred3))

print('KNN - 2: K = 10')
print('mse:',mean_squared_error(y_val, pred4, squared=False))
print('mae:',mean_absolute_error(y_val, pred4))
print('mape:',mean_absolute_percentage_error(y_val, pred4))

print('KNN - 3: K = 20')
print('mse:',mean_squared_error(y_val, pred5, squared=False))
print('mae:',mean_absolute_error(y_val, pred5))
print('mape:',mean_absolute_percentage_error(y_val, pred5))

print('KNN - 4: K = 10 (scaling: standard)')
print('mse:',mean_squared_error(y_val, pred6, squared=False))
print('mae:',mean_absolute_error(y_val, pred6))
print('mape:',mean_absolute_percentage_error(y_val, pred6))

print('KNN - 5: K = 20 (scaling: standard)')
print('mse:',mean_squared_error(y_val, pred7, squared=False))
print('mae:',mean_absolute_error(y_val, pred7))
print('mape:',mean_absolute_percentage_error(y_val, pred7))

print('KNN - 6: K = 5 (scaling: standard)')
print('mse:',mean_squared_error(y_val, pred8, squared=False))
print('mae:',mean_absolute_error(y_val, pred8))
print('mape:',mean_absolute_percentage_error(y_val, pred8))


linear regression - 1: Price, CompPrice, Income
mse: 2.2366156419132284
mae: 1.8180469689601282
mape: 0.3084501677905453
linear regression - 2: ShelveLoc, Urban, US
mse: 2.818696426010272
mae: 2.2724084358410344
mape: 0.5577407624390166
KNN - 1: K = 5
mse: 1.984826860627059
mae: 1.5332666666666668
mape: 0.2533602833763155
KNN - 2: K = 10
mse: 1.9927687794456568
mae: 1.5151916666666665
mape: 0.24401199855977673
KNN - 3: K = 20
mse: 2.031220384399487
mae: 1.5745833333333334
mape: 0.2551856087271648
KNN - 4: K = 10 (scaling: standard)
mse: 1.6681432767201583
mae: 1.3109083333333333
mape: 0.21497315373610187
KNN - 5: K = 20 (scaling: standard)
mse: 1.7305558385722972
mae: 1.3850291666666665
mape: 0.22638777293204596
KNN - 6: K = 5 (scaling: standard)
mse: 1.7070077719018544
mae: 1.3467333333333333
mape: 0.22844353940613046
