# Iris 데이터 다중회귀분석

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.datasets import load_iris

# Load the iris dataset and wrap its measurement matrix in a DataFrame whose
# columns are named after the dataset's feature names.
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [3]:
# Append the integer target labels as a 'class' column so they can be used
# as a predictor alongside the four measurements.
df['class'] = iris.target
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

## Petal Length에 대한 회귀식 만들기

In [5]:
# Target: petal length (column 2); predictors: sepal length, sepal width,
# petal width and the class label (columns 0, 1, 3, 4).
X = df.iloc[:, [0, 1, 3, 4]].to_numpy()
y = df.iloc[:, 2].to_numpy()

In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2021,
    test_size=0.2,
)

In [7]:
# Fit a linear regression on the training split, then score the hold-out split.
lr = LinearRegression().fit(X_train, y_train)
pred = lr.predict(X_test)
mse, r2 = mean_squared_error(y_test, pred), r2_score(y_test, pred)
print(f'MSE: {mse:.2f}, RMSE: {np.sqrt(mse):.2f}')
print(f'R_squared: {r2:.4f}')

MSE: 0.10, RMSE: 0.31
R_squared: 0.9682


In [8]:
# Fitted coefficients, in the column order of X:
# sepal length, sepal width, petal width, class.
lr.coef_

array([ 0.68227323, -0.51925158,  1.11668415,  0.37500635])

In [9]:
# Constant term of the fitted petal-length regression.
lr.intercept_

-0.3669994939603338

#### Petal Length에 대한 회귀식
- pl = 0.682*sl - 0.519*sw + 1.117*pw + 0.375*cl - 0.367

#### Petal Length 교차검증

In [10]:
# Compute the MSE over 5 cross-validation folds.
# The 'neg_mean_squared_error' scorer returns negated MSE values, so the sign
# is flipped before taking the square root. The first print shows the raw
# (negative) scores under the MSE label.
# NOTE(review): an integer cv with a regressor appears to use unshuffled
# KFold, and the iris rows look ordered by class; confirm the folds are
# representative.
neg_mean_scores = cross_val_score(
    lr, X, y, cv=5, scoring='neg_mean_squared_error'
)
rmse_scores = np.sqrt(-neg_mean_scores)
avg_rmse = rmse_scores.mean()

print('개별 MSE: ', neg_mean_scores.round(2))
print('개별 RMSE: ', rmse_scores.round(2))
print(f'평균 RMSE: {avg_rmse:.4f}')

개별 MSE:  [-0.06 -0.08 -0.14 -0.1  -0.21]
개별 RMSE:  [0.25 0.28 0.38 0.32 0.46]
평균 RMSE: 0.3387


## Petal Width에 대한 회귀식 만들기

In [11]:
# Target: petal width (column 3); predictors: sepal length, sepal width,
# petal length and the class label (columns 0, 1, 2, 4).
X = df.iloc[:, [0, 1, 2, 4]].to_numpy()
y = df.iloc[:, 3].to_numpy()

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2021,
    test_size=0.2,
)

# Fit a linear regression on the training split, then score the hold-out split.
lr = LinearRegression().fit(X_train, y_train)
pred = lr.predict(X_test)
mse, r2 = mean_squared_error(y_test, pred), r2_score(y_test, pred)
print(f'MSE: {mse:.2f}, RMSE: {np.sqrt(mse):.2f}')
print(f'R_squared: {r2:.4f}')

MSE: 0.04, RMSE: 0.19
R_squared: 0.9226


In [12]:
# Fitted coefficients, in the column order of X:
# sepal length, sepal width, petal length, class.
lr.coef_

array([-0.12876754,  0.17861614,  0.32741898,  0.37562097])

In [13]:
# Constant term of the fitted petal-width regression.
lr.intercept_

-0.19445016041642238

#### Petal Width에 대한 회귀식
- pw = -0.129*sl + 0.179*sw + 0.327*pl + 0.376*cl - 0.194

#### Petal Width 교차검증

In [14]:
# Compute the MSE over 5 cross-validation folds.
# The 'neg_mean_squared_error' scorer returns negated MSE values, so the sign
# is flipped before taking the square root. The first print shows the raw
# (negative) scores under the MSE label.
# NOTE(review): an integer cv with a regressor appears to use unshuffled
# KFold, and the iris rows look ordered by class; confirm the folds are
# representative.
neg_mean_scores = cross_val_score(
    lr, X, y, cv=5, scoring='neg_mean_squared_error'
)
rmse_scores = np.sqrt(-neg_mean_scores)
avg_rmse = rmse_scores.mean()

print('개별 MSE: ', neg_mean_scores.round(2))
print('개별 RMSE: ', rmse_scores.round(2))
print(f'평균 RMSE: {avg_rmse:.4f}')

개별 MSE:  [-0.01 -0.02 -0.02 -0.04 -0.07]
개별 RMSE:  [0.09 0.14 0.15 0.19 0.26]
평균 RMSE: 0.1667


## Sepal Length에 대한 회귀식 만들기

In [15]:
# Target: sepal length (column 0); predictors: sepal width, petal length,
# petal width and the class label (columns 1, 2, 3, 4).
X = df.iloc[:, [1, 2, 3, 4]].to_numpy()
y = df.iloc[:, 0].to_numpy()

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2021,
    test_size=0.2,
)

# Fit a linear regression on the training split, then score the hold-out split.
lr = LinearRegression().fit(X_train, y_train)
pred = lr.predict(X_test)
mse, r2 = mean_squared_error(y_test, pred), r2_score(y_test, pred)
print(f'MSE: {mse:.2f}, RMSE: {np.sqrt(mse):.2f}')
print(f'R_squared: {r2:.4f}')

MSE: 0.07, RMSE: 0.27
R_squared: 0.8808


In [16]:
# Fitted coefficients, in the column order of X:
# sepal width, petal length, petal width, class.
lr.coef_

array([ 0.60733316,  0.77406606, -0.49825608, -0.21265098])

In [17]:
# Constant term of the fitted sepal-length regression.
lr.intercept_

1.9046012133683519

#### Sepal Length에 대한 회귀식
- sl = 0.607*sw + 0.774*pl - 0.498*pw - 0.213*cl + 1.905

#### Sepal Length 교차검증

In [18]:
# Compute the MSE over 5 cross-validation folds.
# The 'neg_mean_squared_error' scorer returns negated MSE values, so the sign
# is flipped before taking the square root. The first print shows the raw
# (negative) scores under the MSE label.
# NOTE(review): an integer cv with a regressor appears to use unshuffled
# KFold, and the iris rows look ordered by class; confirm the folds are
# representative.
neg_mean_scores = cross_val_score(
    lr, X, y, cv=5, scoring='neg_mean_squared_error'
)
rmse_scores = np.sqrt(-neg_mean_scores)
avg_rmse = rmse_scores.mean()

print('개별 MSE: ', neg_mean_scores.round(2))
print('개별 RMSE: ', rmse_scores.round(2))
print(f'평균 RMSE: {avg_rmse:.4f}')

개별 MSE:  [-0.06 -0.08 -0.14 -0.1  -0.14]
개별 RMSE:  [0.25 0.29 0.37 0.31 0.38]
평균 RMSE: 0.3199


## Sepal Width에 대한 회귀식 만들기

In [19]:
# Target: sepal width (column 1); predictors: sepal length, petal length,
# petal width and the class label (columns 0, 2, 3, 4).
X = df.iloc[:, [0, 2, 3, 4]].to_numpy()
y = df.iloc[:, 1].to_numpy()

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2021,
    test_size=0.2,
)

# Fit a linear regression on the training split, then score the hold-out split.
lr = LinearRegression().fit(X_train, y_train)
pred = lr.predict(X_test)
mse, r2 = mean_squared_error(y_test, pred), r2_score(y_test, pred)
print(f'MSE: {mse:.2f}, RMSE: {np.sqrt(mse):.2f}')
print(f'R_squared: {r2:.4f}')

MSE: 0.05, RMSE: 0.22
R_squared: 0.5667


In [20]:
# Fitted coefficients, in the column order of X:
# sepal length, petal length, petal width, class.
lr.coef_

array([ 0.61339502, -0.59499152,  0.69803975, -0.12580651])

In [21]:
# Constant term of the fitted sepal-width regression.
lr.intercept_

0.9976767818176961

#### Sepal Width에 대한 회귀식
- sw = 0.613*sl - 0.595*pl + 0.698*pw - 0.126*cl + 0.998

#### Sepal Width 교차검증

In [22]:
# Compute the MSE over 5 cross-validation folds.
# The 'neg_mean_squared_error' scorer returns negated MSE values, so the sign
# is flipped before taking the square root. The first print shows the raw
# (negative) scores under the MSE label.
# NOTE(review): an integer cv with a regressor appears to use unshuffled
# KFold, and the iris rows look ordered by class; confirm the folds are
# representative.
neg_mean_scores = cross_val_score(
    lr, X, y, cv=5, scoring='neg_mean_squared_error'
)
rmse_scores = np.sqrt(-neg_mean_scores)
avg_rmse = rmse_scores.mean()

print('개별 MSE: ', neg_mean_scores.round(2))
print('개별 RMSE: ', rmse_scores.round(2))
print(f'평균 RMSE: {avg_rmse:.4f}')

개별 MSE:  [-0.08 -0.11 -0.16 -0.08 -0.11]
개별 RMSE:  [0.28 0.33 0.41 0.29 0.33]
평균 RMSE: 0.3289
