# 다중 선형회귀 분석

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from matplotlib import font_manager, rc, cm
import matplotlib as mpl

# Plot configuration, applied in a single update:
#   - axes.unicode_minus=False works around the broken minus-sign glyph
#   - font.family selects the NanumGothic font for all text
mpl.rcParams.update({
    'axes.unicode_minus': False,
    'font.family': 'NanumGothic',
})

# 선형회귀 관련 라이브러리 가져오기
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

import statsmodels.api as sm

* 간단한 예제

In [4]:
# Toy data: three samples with two features each, plus their targets.
X = np.array([[0, 1], [1, 2], [2, 2.5]])
y = np.array([0, 1.2, 1.6])

# Create the linear-regression object and train it in one step
# (fit() hands back the fitted estimator).
model = linear_model.LinearRegression().fit(X, y)

# Predict for the training samples, then for one extra test point.
pred_train = model.predict(X)
pred_test = model.predict([[1.5, 2]])

In [5]:
# Display the prediction computed for the test point.
pred_test

array([1.])

In [6]:
# Display the fitted coefficients of the toy model.
model.coef_

array([-0.4,  1.6])

* 실데이터 분석하기

## 데이터 준비

In [8]:
# Load the advertising data; the first CSV column becomes the row index.
ads_csv_path = '../datasets/ML_data/Advertising.csv'
raw = pd.read_csv(ads_csv_path, index_col=0)

# Work on a copy so the loaded frame stays untouched.
ads = raw.copy()

# Report the (rows, columns) shape and preview the first two rows.
print(ads.shape)
ads.head(2)

(200, 4)


Unnamed: 0,TV,Radio,Newspaper,Sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4


In [9]:
# Summarize column dtypes and non-null counts of the advertising data.
ads.info()

<class 'pandas.core.frame.DataFrame'>
Index: 200 entries, 1 to 200
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   TV         200 non-null    float64
 1   Radio      200 non-null    float64
 2   Newspaper  200 non-null    float64
 3   Sales      200 non-null    float64
dtypes: float64(4)
memory usage: 7.8 KB


## 데이터 전처리

In [10]:
# Positional train/test split (no shuffling): the last 20 rows are held out
# as the test set and every earlier row is used for training.
n_test = 20
train = ads.iloc[:-n_test]
test = ads.iloc[-n_test:]

# Features are the TV, Radio and Newspaper columns; the label is Sales.
# The label is selected with a list so it stays a one-column DataFrame.
feature_cols = ['TV', 'Radio', 'Newspaper']
label_cols = ['Sales']

# Separate features and label for the training and test sets.
X_train, y_train = train[feature_cols], train[label_cols]
X_test, y_test = test[feature_cols], test[label_cols]

## 데이터 모델링

* 선형 회귀 모델 이용

In [11]:
# Create the linear-regression object and train it on the training data
# (fit() hands back the fitted estimator).
model = linear_model.LinearRegression().fit(X_train, y_train)

# Predict on both splits. The training-set predictions are kept on purpose:
# they are used to judge goodness of fit and to check for overfitting.
y_train_pred, y_test_pred = [model.predict(split) for split in (X_train, X_test)]

In [12]:
# Inspect the fitted slopes (coefficients).
model.coef_

array([[ 0.04638909,  0.18867512, -0.0024597 ]])

In [13]:
# Mean squared error on the training and test predictions.
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)

print('train MSE :', train_mse)
print('test MSE :', test_mse)

train MSE : 2.8274188814916763
test MSE : 2.4528179307176847


In [14]:
# Check R^2 on both splits.
# Author's rule of thumb: if the test score is higher than the train score,
# suspect underfitting --> more data is needed.
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print('train R^2 :', train_r2)
print('test R^2 :', test_r2)

train R^2 : 0.8923555807586848
test R^2 : 0.9288231093749743


* statsmodels 이용
    * 반드시 절편(intercept)용 상수항 **x0** 추가 !!

In [15]:
# Build statsmodels design matrices: copies of the feature frames with an
# extra constant column x0 (all ones) appended, which serves as the intercept
# term. assign() returns a new frame, so X_train / X_test are left unchanged.
sm_X_train = X_train.assign(x0=1)
sm_X_test = X_test.assign(x0=1)

In [16]:
# Set up ordinary least squares with Sales as the dependent variable and the
# x0-augmented features as regressors, then fit it on the training data.
ols_model = sm.OLS(endog=y_train, exog=sm_X_train)
result = ols_model.fit()

# Show the regression summary for the training fit.
result.summary()

0,1,2,3
Dep. Variable:,Sales,R-squared:,0.892
Model:,OLS,Adj. R-squared:,0.891
Method:,Least Squares,F-statistic:,486.3
Date:,"Mon, 17 Jun 2024",Prob (F-statistic):,6.570000000000001e-85
Time:,14:21:18,Log-Likelihood:,-348.95
No. Observations:,180,AIC:,705.9
Df Residuals:,176,BIC:,718.7
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
TV,0.0464,0.001,31.154,0.000,0.043,0.049
Radio,0.1887,0.009,20.347,0.000,0.170,0.207
Newspaper,-0.0025,0.006,-0.395,0.693,-0.015,0.010
x0,2.8399,0.342,8.293,0.000,2.164,3.516

0,1,2,3
Omnibus:,56.196,Durbin-Watson:,2.104
Prob(Omnibus):,0.0,Jarque-Bera (JB):,140.467
Skew:,-1.343,Prob(JB):,3.15e-31
Kurtosis:,6.394,Cond. No.,467.0


# 다항 회귀 분석

In [17]:
from sklearn.preprocessing import PolynomialFeatures

* 간단한 예제

In [18]:
# Toy input: the integers 0..5 arranged as three rows of two features (a, b).
x = np.arange(6).reshape(-1, 2)

# Degree-2 expansion: each row becomes [1, a, b, a^2, ab, b^2].
poly = PolynomialFeatures(degree=2)
poly.fit_transform(x)

array([[ 1.,  0.,  1.,  0.,  0.,  1.],
       [ 1.,  2.,  3.,  4.,  6.,  9.],
       [ 1.,  4.,  5., 16., 20., 25.]])

In [19]:
# When only interaction features are wanted: pure powers such as a^2 and b^2
# are left out, so each row becomes [1, a, b, ab].
interaction_poly = PolynomialFeatures(interaction_only=True)
interaction_poly.fit_transform(x)

array([[ 1.,  0.,  1.,  0.],
       [ 1.,  2.,  3.,  6.],
       [ 1.,  4.,  5., 20.]])

* 실데이터에 적용하자

## 데이터 준비

In [46]:
# Load the Auto data set with the default integer row index.
auto_csv_path = '../datasets/ML_data/Auto.csv'
raw = pd.read_csv(auto_csv_path)

# Work on a copy so the loaded frame stays untouched.
auto = raw.copy()

# Report the (rows, columns) shape and preview the first two rows.
print(auto.shape)
auto.head(2)

(397, 9)


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320


In [47]:
# Summarize column dtypes and non-null counts of the Auto data.
auto.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           397 non-null    float64
 1   cylinders     397 non-null    int64  
 2   displacement  397 non-null    float64
 3   horsepower    397 non-null    object 
 4   weight        397 non-null    int64  
 5   acceleration  397 non-null    float64
 6   year          397 non-null    int64  
 7   origin        397 non-null    int64  
 8   name          397 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.0+ KB


## 데이터 전처리

In [49]:
# Check for missing values: count the nulls in each column.
auto.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
year            0
origin          0
name            0
dtype: int64

In [48]:
# Inspect the distinct horsepower values.
auto.horsepower.unique() # --> contains '?'

array(['130', '165', '150', '140', '198', '220', '215', '225', '190',
       '170', '160', '95', '97', '85', '88', '46', '87', '90', '113',
       '200', '210', '193', '?', '100', '105', '175', '153', '180', '110',
       '72', '86', '70', '76', '65', '69', '60', '80', '54', '208', '155',
       '112', '92', '145', '137', '158', '167', '94', '107', '230', '49',
       '75', '91', '122', '67', '83', '78', '52', '61', '93', '148',
       '129', '96', '71', '98', '115', '53', '81', '79', '120', '152',
       '102', '108', '68', '58', '149', '89', '63', '48', '66', '139',
       '103', '125', '133', '138', '135', '142', '77', '62', '132', '84',
       '64', '74', '116', '82'], dtype=object)

In [50]:
# Replace the '?' placeholder with '0', then convert the whole column from
# strings to a numeric dtype.
# NOTE(review): '?' looks like a missing-value marker; encoding it as 0 makes
# those rows appear as 0-horsepower cars to the model (and to horsepower_2).
# Consider NaN plus dropping or imputing instead -- this would change the
# fitted results, so confirm before switching.
auto['horsepower'] = pd.to_numeric(auto['horsepower'].replace('?', '0'))

In [51]:
auto.info() # --> confirmed that the horsepower dtype changed to int

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           397 non-null    float64
 1   cylinders     397 non-null    int64  
 2   displacement  397 non-null    float64
 3   horsepower    397 non-null    int64  
 4   weight        397 non-null    int64  
 5   acceleration  397 non-null    float64
 6   year          397 non-null    int64  
 7   origin        397 non-null    int64  
 8   name          397 non-null    object 
dtypes: float64(3), int64(5), object(1)
memory usage: 28.0+ KB


In [57]:
# Polynomial feature for horsepower: add the squared term as a new column.
# Only the square is enabled; cubic through fifth-power terms
# (horsepower_3 .. horsepower_5) are intentionally left out.
auto['horsepower_2'] = auto['horsepower'] ** 2

In [58]:
# Polynomial feature for weight: add the squared term as a new column.
# Only the square is enabled; cubic through fifth-power terms
# (weight_3 .. weight_5) are intentionally left out.
auto['weight_2'] = auto['weight'] ** 2

In [61]:
# Keep the label (mpg, first) followed by every numeric predictor, including
# the two squared terms; the text column `name` is not selected.
auto_new = auto[[
    'mpg',
    'cylinders',
    'displacement',
    'horsepower',
    'weight',
    'acceleration',
    'year',
    'origin',
    'horsepower_2',
    'weight_2',
]]

In [62]:
# Positional train/test split (no shuffling): the last 40 rows are held out
# as the test set and every earlier row is used for training.
n_test = 40
train = auto_new.iloc[:-n_test]
test = auto_new.iloc[-n_test:]

# Column 0 is the label (mpg); all remaining columns are features.
# The label is selected with a list so it stays a one-column DataFrame.
X_train, y_train = train.iloc[:, 1:], train[['mpg']]
X_test, y_test = test.iloc[:, 1:], test[['mpg']]

## 데이터 모델링

* auto 다중선형회귀 (sklearn)

In [63]:
# Create the linear-regression model and train it on the training data
# (fit() hands back the fitted estimator).
model = linear_model.LinearRegression().fit(X_train, y_train)

# Predict on the training data and on the test data.
y_train_pred, y_test_pred = [model.predict(split) for split in (X_train, X_test)]

In [64]:
# Inspect the fitted slopes (coefficients).
model.coef_

array([[ 1.79631495e-02,  2.19582716e-03, -5.69324477e-02,
        -1.86702200e-02,  2.17092386e-02,  7.86730375e-01,
         6.67420780e-01,  1.76005475e-04,  2.00279982e-06]])

In [65]:
# Mean squared error on the training and test predictions.
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)

print('train MSE :', train_mse)
print('test MSE :', test_mse)

train MSE : 8.113109015284703
test MSE : 13.467169758134819


In [66]:
# R^2 score on the training and test predictions.
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print('train R^2 :', train_r2)
print('test R^2 :', test_r2)

train R^2 : 0.8607104046047545
test R^2 : 0.5834749147470137
