In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.datasets import load_boston, load_iris

from sklearn.linear_model import Ridge,Lasso,ElasticNet, LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline, Pipeline

from sklearn.preprocessing import StandardScaler

from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

import mglearn
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['font.family']='Malgun Gothic'
matplotlib.rcParams['axes.unicode_minus'] = False

import warnings
warnings.simplefilter('ignore')

In [2]:
train_df = pd.read_excel( 'data4/hyundaiCar.xlsx', sheet_name='train')
test_df = pd.read_excel( 'data4/hyundaiCar.xlsx', sheet_name='test')

In [3]:
x_train = train_df.iloc[:, 1:]
y_train = train_df['가격']

In [4]:
x_train

Unnamed: 0,년식,종류,연비,마력,토크,연료,하이브리드,배기량,중량,변속기
0,2015,준중형,11.8,172,21.0,가솔린,0,1999,1300,자동
1,2015,준중형,12.3,204,27.0,가솔린,0,1591,1300,자동
2,2015,소형,15.0,100,13.6,가솔린,0,1368,1035,수동
3,2014,소형,14.0,140,17.0,가솔린,0,1591,1090,자동
4,2015,대형,9.6,175,46.0,디젤,0,2497,1990,자동
...,...,...,...,...,...,...,...,...,...,...
66,2015,중형,8.5,290,34.8,가솔린,0,3342,1901,자동
67,2012,소형,13.3,108,13.9,가솔린,0,1396,1040,자동
68,2015,준중형,12.8,186,41.0,디젤,0,1995,1665,자동
69,2015,중형,17.7,156,19.3,가솔린,1,1999,1585,자동


In [5]:
y_train

0     1885
1     2190
2     1135
3     1645
4     1960
      ... 
66    3802
67    1270
68    2430
69    2870
70    3254
Name: 가격, Length: 71, dtype: int64

In [6]:
x_test = test_df.iloc[:, 1:]
y_test = test_df['가격']

## 문자열 encoding 
### label encoding
### one hot encoding

In [7]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [8]:
x_train['종류']

0     준중형
1     준중형
2      소형
3      소형
4      대형
     ... 
66     중형
67     소형
68    준중형
69     중형
70     대형
Name: 종류, Length: 71, dtype: object

### Label Encoder

In [9]:
lbl = LabelEncoder()
x_trainLabel = lbl.fit_transform( x_train['종류'])
x_trainLabel

array([2, 2, 1, 1, 0, 3, 3, 1, 3, 1, 2, 3, 2, 0, 1, 0, 0, 0, 3, 0, 0, 3,
       2, 0, 3, 3, 3, 1, 1, 2, 0, 0, 0, 0, 1, 0, 2, 0, 2, 1, 1, 1, 0, 0,
       0, 2, 1, 0, 2, 1, 3, 3, 0, 2, 2, 0, 3, 2, 0, 0, 2, 0, 1, 0, 0, 1,
       3, 1, 2, 3, 0])

In [10]:
lbl.classes_

array(['대형', '소형', '준중형', '중형'], dtype=object)

In [11]:
lbl.classes_[2]

'준중형'

In [12]:
lbl.transform( ['소형'])

array([1])

### onehotEncoder (0과 1로 구성)

In [13]:
oneH = OneHotEncoder()
x_trainOne = oneH.fit_transform( x_train['종류'].values.reshape( -1,1) )
x_trainOne

<71x4 sparse matrix of type '<class 'numpy.float64'>'
	with 71 stored elements in Compressed Sparse Row format>

In [14]:
type( x_trainOne)

scipy.sparse.csr.csr_matrix

In [15]:
x_trainOne.toarray()

array([[0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],


In [16]:
oneH.categories_

[array(['대형', '소형', '준중형', '중형'], dtype=object)]

### 판다스 dummpy

In [17]:
pd.get_dummies( x_train['종류'] )

Unnamed: 0,대형,소형,준중형,중형
0,0,0,1,0
1,0,0,1,0
2,0,1,0,0
3,0,1,0,0
4,1,0,0,0
...,...,...,...,...
66,0,0,0,1
67,0,1,0,0
68,0,0,1,0
69,0,0,0,1


In [18]:
pd.get_dummies( x_train)

Unnamed: 0,년식,연비,마력,토크,하이브리드,배기량,중량,종류_대형,종류_소형,종류_준중형,종류_중형,연료_LPG,연료_가솔린,연료_디젤,변속기_수동,변속기_자동
0,2015,11.8,172,21.0,0,1999,1300,0,0,1,0,0,1,0,0,1
1,2015,12.3,204,27.0,0,1591,1300,0,0,1,0,0,1,0,0,1
2,2015,15.0,100,13.6,0,1368,1035,0,1,0,0,0,1,0,1,0
3,2014,14.0,140,17.0,0,1591,1090,0,1,0,0,0,1,0,0,1
4,2015,9.6,175,46.0,0,2497,1990,1,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,2015,8.5,290,34.8,0,3342,1901,0,0,0,1,0,1,0,0,1
67,2012,13.3,108,13.9,0,1396,1040,0,1,0,0,0,1,0,0,1
68,2015,12.8,186,41.0,0,1995,1665,0,0,1,0,0,0,1,0,1
69,2015,17.7,156,19.3,1,1999,1585,0,0,0,1,0,1,0,0,1


In [19]:
pd.get_dummies( x_train ,columns=['연료','변속기'])

Unnamed: 0,년식,종류,연비,마력,토크,하이브리드,배기량,중량,연료_LPG,연료_가솔린,연료_디젤,변속기_수동,변속기_자동
0,2015,준중형,11.8,172,21.0,0,1999,1300,0,1,0,0,1
1,2015,준중형,12.3,204,27.0,0,1591,1300,0,1,0,0,1
2,2015,소형,15.0,100,13.6,0,1368,1035,0,1,0,1,0
3,2014,소형,14.0,140,17.0,0,1591,1090,0,1,0,0,1
4,2015,대형,9.6,175,46.0,0,2497,1990,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,2015,중형,8.5,290,34.8,0,3342,1901,0,1,0,0,1
67,2012,소형,13.3,108,13.9,0,1396,1040,0,1,0,0,1
68,2015,준중형,12.8,186,41.0,0,1995,1665,0,0,1,0,1
69,2015,중형,17.7,156,19.3,1,1999,1585,0,1,0,0,1


In [20]:
x_train

Unnamed: 0,년식,종류,연비,마력,토크,연료,하이브리드,배기량,중량,변속기
0,2015,준중형,11.8,172,21.0,가솔린,0,1999,1300,자동
1,2015,준중형,12.3,204,27.0,가솔린,0,1591,1300,자동
2,2015,소형,15.0,100,13.6,가솔린,0,1368,1035,수동
3,2014,소형,14.0,140,17.0,가솔린,0,1591,1090,자동
4,2015,대형,9.6,175,46.0,디젤,0,2497,1990,자동
...,...,...,...,...,...,...,...,...,...,...
66,2015,중형,8.5,290,34.8,가솔린,0,3342,1901,자동
67,2012,소형,13.3,108,13.9,가솔린,0,1396,1040,자동
68,2015,준중형,12.8,186,41.0,디젤,0,1995,1665,자동
69,2015,중형,17.7,156,19.3,가솔린,1,1999,1585,자동


## replace

In [21]:
x_train['종류'].replace( ['대형','중형','준중형','소형'], [0,1,2,3] )

0     2
1     2
2     3
3     3
4     0
     ..
66    1
67    3
68    2
69    1
70    0
Name: 종류, Length: 71, dtype: int64

## make_column_transformer

In [22]:
from sklearn.compose import make_column_transformer

In [23]:
myt = make_column_transformer(  ( OneHotEncoder(), ['종류','연료','변속기'] ) )
result = myt.fit_transform( x_train)
result

array([[0., 0., 1., 0., 0., 1., 0., 0., 1.],
       [0., 0., 1., 0., 0., 1., 0., 0., 1.],
       [0., 1., 0., 0., 0., 1., 0., 1., 0.],
       [0., 1., 0., 0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 1., 0., 1.],
       [0., 0., 0., 1., 0., 0., 1., 0., 1.],
       [0., 0., 0., 1., 0., 0., 1., 0., 1.],
       [0., 1., 0., 0., 0., 1., 0., 0., 1.],
       [0., 0., 0., 1., 0., 1., 0., 0., 1.],
       [0., 1., 0., 0., 0., 1., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0., 1., 0., 1.],
       [0., 0., 0., 1., 0., 0., 1., 0., 1.],
       [0., 0., 1., 0., 0., 0., 1., 1., 0.],
       [1., 0., 0., 0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 1., 1., 0.],
       [1., 0., 0., 0., 0., 0., 1., 1., 0.],
       [1., 0., 0., 0., 0., 1., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0., 1., 0., 1.],
       [1., 0., 0., 0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 1., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 1., 0., 0., 1.],
       [0.

In [37]:
# model = Pipeline( [ ('myt', myt ), ('scl', StandardScaler() ), ('clf', ElasticNet()) ]  )
model = make_pipeline( myt, PolynomialFeatures(degree=2), StandardScaler(), ElasticNet() )
# param_value = {'ridge__alpha':[0.001,0.01,1,2,3] }
param_value = {'elasticnet__alpha':[0.001,0.01,1,2,3] ,'elasticnet__l1_ratio':[0.1,0.3,0.5,0.7, 0.9]}
gridS = GridSearchCV( model, param_grid=param_value, scoring='r2' )
gridS.fit( x_train, y_train )

GridSearchCV(estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('onehotencoder',
                                                                         OneHotEncoder(),
                                                                         ['종류',
                                                                          '연료',
                                                                          '변속기'])])),
                                       ('polynomialfeatures',
                                        PolynomialFeatures(degree=6)),
                                       ('standardscaler', StandardScaler()),
                                       ('elasticnet', ElasticNet())]),
             param_grid={'elasticnet__alpha': [0.001, 0.01, 1, 2, 3],
                         'elasticnet__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]},
             scoring='r2')

In [38]:
gridS.best_params_

{'elasticnet__alpha': 3, 'elasticnet__l1_ratio': 0.1}

In [39]:
gridS.best_score_

0.4248067073012806

In [40]:
gridS.best_estimator_.predict( x_test)

array([1895.45499638, 1423.9895877 , 3088.40365426, 1941.8208739 ,
       1943.57499551, 1867.58221033, 2101.24813352, 1941.8208739 ,
       2248.28054005, 1672.89649987, 2842.97405384, 2842.97405384,
       1867.58221033, 2248.28054005, 4876.43844977, 1588.51704024,
       2101.24813352, 1423.9895877 , 1423.9895877 , 1672.89649987,
       1218.36763257, 1672.89649987, 4876.43844977, 2006.87386022,
       2006.87386022, 2006.87386022, 4876.43844977, 1941.8208739 ,
       2624.99020612, 1423.9895877 , 3048.3455502 ])

In [41]:
gridS.best_estimator_

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['종류', '연료', '변속기'])])),
                ('polynomialfeatures', PolynomialFeatures(degree=6)),
                ('standardscaler', StandardScaler()),
                ('elasticnet', ElasticNet(alpha=3, l1_ratio=0.1))])

In [29]:
x_test.columns

Index(['년식', '종류', '연비', '마력', '토크', '연료', '하이브리드', '배기량', '중량', '변속기'], dtype='object')

In [42]:
xd = np.array( [[2015, '준중형', 12.3, 204, 27,'가솔린',0,1591,1300,'자동']])
xdataF = pd.DataFrame( xd, columns=x_test.columns)
xdataF

Unnamed: 0,년식,종류,연비,마력,토크,연료,하이브리드,배기량,중량,변속기
0,2015,준중형,12.3,204,27,가솔린,0,1591,1300,자동


In [43]:
gridS.best_estimator_.predict( xdataF )

array([2006.87386022])

In [32]:
x_test

Unnamed: 0,년식,종류,연비,마력,토크,연료,하이브리드,배기량,중량,변속기
0,2015,대형,6.8,159,23.0,LPG,0,2359,1935,수동
1,2012,소형,13.3,108,13.9,가솔린,0,1396,1035,자동
2,2015,중형,14.4,184,41.0,디젤,0,1995,1792,자동
3,2015,대형,10.9,175,46.0,디젤,0,2497,2210,수동
4,2015,대형,6.4,159,23.0,LPG,0,2359,1935,자동
5,2015,소형,18.0,136,30.6,디젤,0,1582,1160,자동
6,2015,준중형,13.9,184,41.0,디젤,0,1995,1611,수동
7,2015,대형,8.9,133,26.5,디젤,0,2497,1696,수동
8,2015,준중형,12.5,184,41.0,디젤,0,1995,1611,자동
9,2015,준중형,12.8,215,21.3,가솔린,0,1999,1216,수동


In [33]:
x_train

Unnamed: 0,년식,종류,연비,마력,토크,연료,하이브리드,배기량,중량,변속기
0,2015,준중형,11.8,172,21.0,가솔린,0,1999,1300,자동
1,2015,준중형,12.3,204,27.0,가솔린,0,1591,1300,자동
2,2015,소형,15.0,100,13.6,가솔린,0,1368,1035,수동
3,2014,소형,14.0,140,17.0,가솔린,0,1591,1090,자동
4,2015,대형,9.6,175,46.0,디젤,0,2497,1990,자동
...,...,...,...,...,...,...,...,...,...,...
66,2015,중형,8.5,290,34.8,가솔린,0,3342,1901,자동
67,2012,소형,13.3,108,13.9,가솔린,0,1396,1040,자동
68,2015,준중형,12.8,186,41.0,디젤,0,1995,1665,자동
69,2015,중형,17.7,156,19.3,가솔린,1,1999,1585,자동
