In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.datasets import load_boston, load_iris

from sklearn.linear_model import Ridge,Lasso,ElasticNet, LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler

from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

import mglearn
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['font.family']='Malgun Gothic'
matplotlib.rcParams['axes.unicode_minus'] = False

import warnings
warnings.simplefilter('ignore')

In [2]:
train_df = pd.read_excel('data4/hyundaiCar.xlsx', sheet_name = 'train')
test_df = pd.read_excel('data4/hyundaiCar.xlsx', sheet_name = 'test')

In [3]:
# (특성데이터)년식	종류	연비	마력	토크	연료	하이브리드	배기량	중량	변속기 을 통해서 
# 가격(label)을 예측하는 것.

In [4]:
x_train = train_df.iloc[:,1:]
y_train = train_df['가격']

In [5]:
x_train.head(3)

Unnamed: 0,년식,종류,연비,마력,토크,연료,하이브리드,배기량,중량,변속기
0,2015,준중형,11.8,172,21.0,가솔린,0,1999,1300,자동
1,2015,준중형,12.3,204,27.0,가솔린,0,1591,1300,자동
2,2015,소형,15.0,100,13.6,가솔린,0,1368,1035,수동


In [6]:
y_train.head(3)

0    1885
1    2190
2    1135
Name: 가격, dtype: int64

In [7]:
x_test = test_df.iloc[:,1:]
y_test = test_df['가격']

In [8]:
# train_set, test_set 완성
# 문자열 수치화시켜줘야함.
# 인코딩하는법(딥러닝에서도 상당히 많이 쓰임, 자연어처리) 알려줄 것임.

## 문자열 encoding
### label encoding
### one hot encoding


In [9]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [10]:
x_train['종류']

0     준중형
1     준중형
2      소형
3      소형
4      대형
     ... 
66     중형
67     소형
68    준중형
69     중형
70     대형
Name: 종류, Length: 71, dtype: object

## Label Encoder

In [11]:
lbl = LabelEncoder() # 0부터 일련의 숫자를 부여
x_trainLabel = lbl.fit_transform(x_train['종류'])
x_trainLabel

array([2, 2, 1, 1, 0, 3, 3, 1, 3, 1, 2, 3, 2, 0, 1, 0, 0, 0, 3, 0, 0, 3,
       2, 0, 3, 3, 3, 1, 1, 2, 0, 0, 0, 0, 1, 0, 2, 0, 2, 1, 1, 1, 0, 0,
       0, 2, 1, 0, 2, 1, 3, 3, 0, 2, 2, 0, 3, 2, 0, 0, 2, 0, 1, 0, 0, 1,
       3, 1, 2, 3, 0])

In [12]:
lbl.classes_

array(['대형', '소형', '준중형', '중형'], dtype=object)

In [13]:
lbl.classes_[2]

'준중형'

In [14]:
lbl.transform(['소형']) # 문자열을 줬을 때 숫자열을 알고 싶을 때

array([1])

## OneHotEncoder (0과 1로만 구성)

In [15]:
oneH = OneHotEncoder()
x_trainOne = oneH.fit_transform(x_train['종류'].values.reshape(-1,1))
x_trainOne

<71x4 sparse matrix of type '<class 'numpy.float64'>'
	with 71 stored elements in Compressed Sparse Row format>

In [16]:
type(x_trainOne) #csr_matrix 라는 함수객체

scipy.sparse.csr.csr_matrix

In [17]:
x_trainOne.toarray() # ndarray로 타입 변경 , onehot으로 바뀐 결과값임.
# 0010이 준준형, .....


array([[0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],


## 판다스 dummy를 이용해서 onehotencoding사용

In [18]:
pd.get_dummies(x_train['종류'])

Unnamed: 0,대형,소형,준중형,중형
0,0,0,1,0
1,0,0,1,0
2,0,1,0,0
3,0,1,0,0
4,1,0,0,0
...,...,...,...,...
66,0,0,0,1
67,0,1,0,0
68,0,0,1,0
69,0,0,0,1


In [19]:
x_train

Unnamed: 0,년식,종류,연비,마력,토크,연료,하이브리드,배기량,중량,변속기
0,2015,준중형,11.8,172,21.0,가솔린,0,1999,1300,자동
1,2015,준중형,12.3,204,27.0,가솔린,0,1591,1300,자동
2,2015,소형,15.0,100,13.6,가솔린,0,1368,1035,수동
3,2014,소형,14.0,140,17.0,가솔린,0,1591,1090,자동
4,2015,대형,9.6,175,46.0,디젤,0,2497,1990,자동
...,...,...,...,...,...,...,...,...,...,...
66,2015,중형,8.5,290,34.8,가솔린,0,3342,1901,자동
67,2012,소형,13.3,108,13.9,가솔린,0,1396,1040,자동
68,2015,준중형,12.8,186,41.0,디젤,0,1995,1665,자동
69,2015,중형,17.7,156,19.3,가솔린,1,1999,1585,자동


In [20]:
pd.get_dummies(x_train, columns=['연료','변속기'])

Unnamed: 0,년식,종류,연비,마력,토크,하이브리드,배기량,중량,연료_LPG,연료_가솔린,연료_디젤,변속기_수동,변속기_자동
0,2015,준중형,11.8,172,21.0,0,1999,1300,0,1,0,0,1
1,2015,준중형,12.3,204,27.0,0,1591,1300,0,1,0,0,1
2,2015,소형,15.0,100,13.6,0,1368,1035,0,1,0,1,0
3,2014,소형,14.0,140,17.0,0,1591,1090,0,1,0,0,1
4,2015,대형,9.6,175,46.0,0,2497,1990,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,2015,중형,8.5,290,34.8,0,3342,1901,0,1,0,0,1
67,2012,소형,13.3,108,13.9,0,1396,1040,0,1,0,0,1
68,2015,준중형,12.8,186,41.0,0,1995,1665,0,0,1,0,1
69,2015,중형,17.7,156,19.3,1,1999,1585,0,1,0,0,1


## replace

In [21]:
x_train['종류'].replace(['대형','중형','준중형','소형'],[0,1,2,3])

0     2
1     2
2     3
3     3
4     0
     ..
66    1
67    3
68    2
69    1
70    0
Name: 종류, Length: 71, dtype: int64

# make_column_transformer

In [22]:
from sklearn.compose import make_column_transformer

In [23]:
# OneHotEncoder가 숫자에 크기가 없기 떄문에 좀 더 좋은 점수를 얻어올 수 있따.
myt = make_column_transformer((OneHotEncoder(), ['종류','연료','변속기']))  # 튜플로해줘야함, 얘를 파이프라인으로 연결하면 됨..
result = myt.fit_transform(x_train)
result

array([[0., 0., 1., 0., 0., 1., 0., 0., 1.],
       [0., 0., 1., 0., 0., 1., 0., 0., 1.],
       [0., 1., 0., 0., 0., 1., 0., 1., 0.],
       [0., 1., 0., 0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 1., 0., 1.],
       [0., 0., 0., 1., 0., 0., 1., 0., 1.],
       [0., 0., 0., 1., 0., 0., 1., 0., 1.],
       [0., 1., 0., 0., 0., 1., 0., 0., 1.],
       [0., 0., 0., 1., 0., 1., 0., 0., 1.],
       [0., 1., 0., 0., 0., 1., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0., 1., 0., 1.],
       [0., 0., 0., 1., 0., 0., 1., 0., 1.],
       [0., 0., 1., 0., 0., 0., 1., 1., 0.],
       [1., 0., 0., 0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 1., 1., 0.],
       [1., 0., 0., 0., 0., 0., 1., 1., 0.],
       [1., 0., 0., 0., 0., 1., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0., 1., 0., 1.],
       [1., 0., 0., 0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 1., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 1., 0., 0., 1.],
       [0.

In [24]:
# # model =  Pipeline([('myt',myt), ('scl',StandardScaler()),('clf',Ridge())] ) -> 이걸 쓰는게 베스트긴 함.
# model = make_pipeline(myt, StandardScaler(), ElasticNet() ) # 이름으로 줄때 소문자로 줘야함. Param_value

# param_value = {'elasticnet__alpha' : [0.001,0.01,1,2,3], elasticnet_ratio=[0.1,0.3,0.5,0.7,0.9] } # 칼럼이 많고, 다중공선성이 많다고 생각해서 Ridgit에서 바뀜
# gridS = GridSearchCV( model, param_grid=param_value, scoring='r2')
# gridS.fit(x_train, y_train)

In [28]:
# model = Pipeline( [ ('myt', myt ), ('scl', StandardScaler() ), ('clf', ElasticNet()) ]  )
model = make_pipeline( myt, StandardScaler(), ElasticNet() )
# param_value = {'ridge__alpha':[0.001,0.01,1,2,3] }
param_value = {'elasticnet__alpha':[0.001,0.01,1,2,3] ,'elasticnet__l1_ratio':[0.1,0.3,0.5,0.7, 0.9]}
gridS = GridSearchCV( model, param_grid=param_value, scoring='r2' )
gridS.fit( x_train, y_train )


GridSearchCV(estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('onehotencoder',
                                                                         OneHotEncoder(),
                                                                         ['종류',
                                                                          '연료',
                                                                          '변속기'])])),
                                       ('standardscaler', StandardScaler()),
                                       ('elasticnet', ElasticNet())]),
             param_grid={'elasticnet__alpha': [0.001, 0.01, 1, 2, 3],
                         'elasticnet__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]},
             scoring='r2')

In [29]:
gridS.best_params_

{'elasticnet__alpha': 1, 'elasticnet__l1_ratio': 0.5}

In [30]:
gridS.best_score_

0.12531389071867122

In [31]:
gridS.best_estimator_.predict(x_test)

array([2023.25644269, 1939.36822852, 2749.45263254, 2502.2607233 ,
       2603.17278364, 1545.84237162, 1629.88358583, 2502.2607233 ,
       2209.79992678, 2023.40944273, 3142.97848944, 3142.97848944,
       1545.84237162, 2209.79992678, 3475.70292115,  965.92603067,
       1629.88358583, 1939.36822852, 1939.36822852, 2023.40944273,
       1359.45188757, 2023.40944273, 3475.70292115, 2603.32578368,
       2603.32578368, 2603.32578368, 3475.70292115, 2502.2607233 ,
       3082.17706425, 1939.36822852, 2169.53629159])

In [36]:
x_test.columns

Index(['년식', '종류', '연비', '마력', '토크', '연료', '하이브리드', '배기량', '중량', '변속기'], dtype='object')

In [39]:
xd = np.array( [[2015, '준중형', 12.3, 204, 27,'가솔린',0,1591,1300,'자동']])
xdataF = pd.DataFrame(xd, columns = x_test.columns)
xdataF


Unnamed: 0,년식,종류,연비,마력,토크,연료,하이브리드,배기량,중량,변속기
0,2015,준중형,12.3,204,27,가솔린,0,1591,1300,자동


In [40]:
gridS.best_estimator_.predict( xdataF )

array([2603.32578368])

In [None]:
x_train