### sklearn.preprocessing.PolynomialFeatures
* class `sklearn.preprocessing.PolynomialFeatures(degree=2, *, interaction_only=False, include_bias=True, order='C')`

In [20]:
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline


import pandas as pd

In [6]:
# Tiny 2x2 sample matrix: each row is one sample with two features.
X = np.arange(4).reshape(2, 2)
print('일차 단항식 계수 feature : \n', X)

print('\n')

# Expand the two features into all polynomial terms up to degree 2.
# Fitting and transforming happen in a single call on the same matrix.
poly = PolynomialFeatures(degree=2)
poly_ftr = poly.fit_transform(X)
print('변환된 2차 다항식 계수 feature : \n', poly_ftr)

일차 단항식 계수 feature : 
 [[0 1]
 [2 3]]


변환된 2차 다항식 계수 feature : 
 [[1. 0. 1. 0. 0. 1.]
 [1. 2. 3. 4. 6. 9.]]


In [9]:
def polynomial_func(X):
    """Evaluate the cubic 1 + 2*X + X**2 + X**3.

    Parameters
    ----------
    X : any object supporting ``*``, ``**`` and ``+`` with ints
        The notebook passes a NumPy integer array, in which case the
        arithmetic is applied per element and the result keeps X's shape.

    Returns
    -------
    Same kind of object as the arithmetic on ``X`` produces.
    """
    linear_term = 2 * X
    quadratic_term = X ** 2
    cubic_term = X ** 3
    # Summed left to right in the same order as the written polynomial.
    return 1 + linear_term + quadratic_term + cubic_term

# Same 2x2 sample matrix as before; the cubic is evaluated on every entry,
# so y has the same 2x2 shape as X.
X = np.arange(4).reshape(2, 2)
print('일차 단항식 계수 feature : \n', X)
y = polynomial_func(X)
print('삼차 다항식 결정값 : \n', y)

# Degree-3 polynomial feature expansion: learn the term layout from X first,
# then apply it to the same matrix.
poly = PolynomialFeatures(degree=3)
poly.fit(X)
poly_ftr = poly.transform(X)
print('3차 다항식 계수 feature : \n', poly_ftr)

# NOTE: the original cell carried a disabled follow-up experiment here that
# fitted LinearRegression() on (poly_ftr, y) and re-printed poly_ftr.
# It was never executed as part of this notebook.


일차 단항식 계수 feature : 
 [[0 1]
 [2 3]]
삼차 다항식 결정값 : 
 [[ 1  5]
 [17 43]]
3차 다항식 계수 feature : 
 [[ 1.  0.  1.  0.  0.  1.  0.  0.  0.  1.]
 [ 1.  2.  3.  4.  6.  9.  8. 12. 18. 27.]]


In [11]:
# Column labels supplied by hand because the file is read with header=None.
column_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]

# Fields are separated by runs of whitespace, hence the regex separator.
df = pd.read_csv(
    "./datasets/housing.csv",
    sep=r"\s+",
    header=None,
    names=column_names,
)

In [13]:
# Preview the first five rows to sanity-check the parsed columns.
df.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [16]:
# Features are every column except the target; the target is MEDV.
data = df.drop(columns="MEDV")
label = df["MEDV"]

# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    label,
    test_size=0.3,
    random_state=156,
)

In [25]:
# Two-stage pipeline: degree-2 polynomial expansion feeding a linear regressor.
# The expansion is built with include_bias=False, so it emits no constant column.
p_model = Pipeline(
    [
        ('poly', PolynomialFeatures(degree=2, include_bias=False)),
        ('linear', LinearRegression()),
    ]
)

In [26]:
# Fit on the training split, then predict the held-out rows.
pred = p_model.fit(X_train, y_train).predict(X_test)

# Error metrics on the test split: MSE, its square root, and R^2.
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, pred)

print(f'MSE : {mse}, RMSE : {rmse}')
print(f'Variance score : {r2}')

MSE : 15.55575231599565, RMSE : 3.944078132592666
Variance score : 0.7816647162142003
