# 3. 다중 회귀 분석 (Multiple Regression)
* 독립변수가 2개 이상인 회귀분석
* 특성이 1개면 직선형태를 학습함 (y = a * x + b)
* 특성이 2개면 평면을 학습함 (y = a * x1 + b * x2 + c)
* 특성이 3개 이상이면 초평면을 학습함 (예: y = a * x1 + b * x2 + c * x3 + d)

## 3-1. 농어 길이, 높이, 두께에 따른 무게 예측

### 1) 문제 정의
농어 길이(Length), 높이(Height), 두께(Width)에 따른 무게(Weight) 예측하기

### 2) 데이터 수집
* https://www.kaggle.com/aungpyaeap/fish-market

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the fish CSV into a DataFrame; the path is relative to the current
# working directory. The bare name on the last line displays the frame.
ori_data = pd.read_csv('data/02_fish/fish.csv')
ori_data

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.5200,4.0200
1,Bream,290.0,24.0,26.3,31.2,12.4800,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.7300,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.4440,5.1340
...,...,...,...,...,...,...,...
154,Smelt,12.2,11.5,12.2,13.4,2.0904,1.3936
155,Smelt,13.4,11.7,12.4,13.5,2.4300,1.2690
156,Smelt,12.2,12.1,13.0,13.8,2.2770,1.2558
157,Smelt,19.7,13.2,14.3,15.2,2.8728,2.0672


In [5]:
# Keep only the Perch rows together with the three body measurements and the
# target weight. One .loc call selects rows and columns in a single step
# instead of chaining two [] lookups, and the rename is done by column name so
# the labels cannot drift out of step with the selection order.
data = ori_data.loc[
    ori_data['Species'] == 'Perch',
    ['Length2', 'Height', 'Width', 'Weight'],
].rename(columns={'Length2': 'Length'})
data

Unnamed: 0,Length,Height,Width,Weight
72,8.4,2.112,1.408,5.9
73,13.7,3.528,1.9992,32.0
74,15.0,3.824,2.432,40.0
75,16.2,4.5924,2.6316,51.5
76,17.4,4.588,2.9415,70.0
77,18.0,5.2224,3.3216,100.0
78,18.7,5.1992,3.1234,78.0
79,19.0,5.6358,3.0502,80.0
80,19.6,5.1376,3.0368,85.0
81,20.0,5.082,2.772,85.0


### 3) 데이터 전처리
* 결측치 확인 (결측치가 없으므로 별도 처리 불필요)

In [6]:
# (rows, columns) of the Perch subset.
data.shape

(56, 4)

In [7]:
# Print each column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56 entries, 72 to 127
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Length  56 non-null     float64
 1   Height  56 non-null     float64
 2   Width   56 non-null     float64
 3   Weight  56 non-null     float64
dtypes: float64(4)
memory usage: 2.2 KB


In [8]:
# Count missing values per column.
data.isnull().sum()

Length    0
Height    0
Width     0
Weight    0
dtype: int64

### 4) 데이터 탐색
* 상관관계
* 산점도(Scatter Plot) -> 생략

In [10]:
# Pairwise correlation matrix of the four columns (features and target).
data.corr()

Unnamed: 0,Length,Height,Width,Weight
Length,1.0,0.985584,0.974617,0.958656
Height,0.985584,1.0,0.982943,0.968441
Width,0.974617,0.982943,1.0,0.963943
Weight,0.958656,0.968441,0.963943,1.0


### 5) 모델 학습
* 데이터 쪼개기
* 다중회귀 학습
* 다중회귀 예측

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [25]:
# Features (independent variables): Length, Height, Width
# Target (dependent variable): Weight
# Both are converted to plain NumPy arrays before being handed to scikit-learn.
input_data = data.loc[:, ['Length', 'Height', 'Width']].to_numpy()
target_data = data.loc[:, 'Weight'].to_numpy()
(input_data.shape, target_data.shape)

((56, 3), (56,))

In [26]:
# Split into training and test sets. random_state is fixed so the split is
# reproducible between runs.
train_input, test_input, train_target, test_target = train_test_split(
    input_data,
    target_data,
    random_state=0,
)
# Show the shape of every piece of the split.
tuple(
    part.shape
    for part in (train_input, test_input, train_target, test_target)
)

((42, 3), (14, 3), (42,), (14,))

In [27]:
# Linear regression model with default settings.
lr = LinearRegression()

In [28]:
# Fit on the training split only; the test split is kept for evaluation.
lr.fit(train_input, train_target)

LinearRegression()

In [29]:
# Learned coefficients (one per feature, in Length, Height, Width order) and
# the intercept.
lr.coef_, lr.intercept_

(array([-7.80989989e-02,  8.51699274e+01,  5.92449041e+01]),
 -556.7642765166698)

In [30]:
# Predict the weight of a hypothetical fish with Length=50, Height=15,
# Width=10 (same feature order as input_data).
lr.predict([[50, 15, 10]])

array([1309.32872567])

In [31]:
# Reproduce the prediction by hand:
#   Weight = a * Length + b * Height + c * Width + d
# The coefficients and intercept are typed in from the printed
# lr.coef_ / lr.intercept_ values, so the result agrees with lr.predict only
# up to the rounding of those printed numbers.
-7.80989989e-02 * 50 + 8.51699274e+01 * 15 + 5.92449041e+01 * 10 -556.7642765166698

1309.3287255383302

In [37]:
### 6) 모델 평가
# Predict on the held-out test set and show the predictions next to the actual
# weights, one row per test sample.
pred = lr.predict(test_input)
pd.DataFrame(test_input, columns=['Length', 'Height', 'Width']).assign(
    **{'Actual Weight': test_target, 'Predicted Weight': pred}
)

Unnamed: 0,Length,Height,Width,Actual Weight,Predicted Weight
0,39.0,11.1366,6.003,650.0,744.340435
1,27.5,7.2828,4.5662,250.0,331.887629
2,36.5,10.881,6.864,685.0,773.776112
3,24.6,6.7334,4.1658,188.0,261.600099
4,21.0,5.9175,3.3075,115.0,141.54121
5,15.0,3.824,2.432,40.0,-88.162352
6,27.5,7.0516,4.335,265.0,298.49892
7,37.0,10.5717,6.3666,690.0,717.925589
8,39.0,12.4313,7.3514,820.0,934.495769
9,27.3,8.323,5.1373,300.0,454.331772


In [38]:
# R^2 (coefficient of determination) of the fitted model on the training set.
lr.score(train_input, train_target)

0.946573874702987

In [39]:
# R^2 (coefficient of determination) of the fitted model on the held-out test set.
lr.score(test_input, test_target)

0.8974351710978307