In [74]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.neural_network import MLPRegressor

#For pipeline creation
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.preprocessing import PowerTransformer
from sklearn.impute import SimpleImputer
import math as math

from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

df = pd.read_csv('automobileEDA.csv', header=0)
#Getting first column series when no name is as header
df

Unnamed: 0,symboling,normalized-losses,make,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,...,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price,city-L/100km,horsepower-binned,diesel,gas
0,3,122,alfa-romero,std,two,convertible,rwd,front,88.6,0.811148,...,9.0,111.0,5000.0,21,27,13495.0,11.190476,Medium,0,1
1,3,122,alfa-romero,std,two,convertible,rwd,front,88.6,0.811148,...,9.0,111.0,5000.0,21,27,16500.0,11.190476,Medium,0,1
2,1,122,alfa-romero,std,two,hatchback,rwd,front,94.5,0.822681,...,9.0,154.0,5000.0,19,26,16500.0,12.368421,Medium,0,1
3,2,164,audi,std,four,sedan,fwd,front,99.8,0.848630,...,10.0,102.0,5500.0,24,30,13950.0,9.791667,Medium,0,1
4,2,164,audi,std,four,sedan,4wd,front,99.4,0.848630,...,8.0,115.0,5500.0,18,22,17450.0,13.055556,Medium,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,-1,95,volvo,std,four,sedan,rwd,front,109.1,0.907256,...,9.5,114.0,5400.0,23,28,16845.0,10.217391,Medium,0,1
197,-1,95,volvo,turbo,four,sedan,rwd,front,109.1,0.907256,...,8.7,160.0,5300.0,19,25,19045.0,12.368421,High,0,1
198,-1,95,volvo,std,four,sedan,rwd,front,109.1,0.907256,...,8.8,134.0,5500.0,18,23,21485.0,13.055556,Medium,0,1
199,-1,95,volvo,turbo,four,sedan,rwd,front,109.1,0.907256,...,23.0,106.0,4800.0,26,27,22470.0,9.038462,Medium,1,0


In [54]:
#Se observa que la transformación box-cox es la mejor para las variables numéricas continuas.
#Se observa que la variable binara 'paid' tiene que siempre ser convertida en booleana
#Además se requiere de hacer inputaciones por que existen valores nulos

NumericList = ['wheel-base','length','compression-ratio','horsepower','city-mpg','highway-mpg','city-L/100km','width','height','curb-weight','engine-size','bore','stroke']
NumericalCategoricalList = ['symboling','normalized-losses','peak-rpm']
StringCategoricalList = ['make','aspiration','num-of-doors','body-style','drive-wheels','engine-location','horsepower-binned','engine-type','num-of-cylinders','fuel-system']
BinaryList = ['diesel','gas']

#Define each pipeline for each type of category in the Dependent variables
NumericColumnsPipeline = Pipeline(steps= [('box-cox', PowerTransformer(method= 'box-cox')),('NumericInputter', SimpleImputer(strategy='median'))])

NumericalCategoricalColumnsPipeline = Pipeline(steps= [('NumericalCategoricalInputter', SimpleImputer(strategy='most_frequent'))])

StringCategoricalPipeline =  Pipeline(steps= [('StringCategoricalInputter', SimpleImputer(strategy='most_frequent')),
                                        ('CategoricalOneHotEncoder', OneHotEncoder(handle_unknown = 'ignore', drop = 'first'))])

BinaryColumnsPipeline = Pipeline(steps= [('BinaryInputter', SimpleImputer(strategy='most_frequent'))])

#Define the column transformer that applies pipelines to the dependent variable
preprocessor = ColumnTransformer(transformers = [('Numerical',NumericColumnsPipeline,NumericList),
                                                    ('NumericalCategorical',NumericalCategoricalColumnsPipeline,NumericalCategoricalList),
                                                    ('StringCategorical',StringCategoricalPipeline,StringCategoricalList),
                                                    ('Binary',BinaryColumnsPipeline,BinaryList)
                                                ],remainder='passthrough')

In [75]:
XtrainvalTransf = preprocessor.fit_transform(Xtv)

#optional
XtrainvalTransf_asDataFrame = pd.DataFrame(XtrainvalTransf)
#New_Labels=[]
#XtrainvalTransf_asDataFrame.columns = New_Labels
XtrainvalTransf_asDataFrame

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,58,59,60,61,62,63,64,65,66,67
0,-0.513419,-0.404687,0.333474,1.036545,-1.746142,-1.215115,1.746142,0.064126,-1.665284,0.059502,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,-0.425977,-0.346466,0.025404,-1.503096,0.410359,0.275856,-0.410359,-1.174528,2.090067,-0.403373,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2.029367,1.991676,2.271105,0.010446,0.095594,-0.847750,-0.095594,1.226472,1.939445,1.639577,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,-0.885518,-1.394690,-1.665073,0.232789,-0.071702,-0.023182,0.071702,-1.040850,-1.168771,-0.806279,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,0.458356,0.747637,0.025404,-0.237892,0.095594,0.128275,-0.095594,0.631215,0.555749,0.187159,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165,2.133390,2.276393,2.283137,0.781957,-0.429073,-0.847750,0.429073,2.199524,1.022748,1.976759,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
166,1.028753,1.201894,0.402294,0.565216,-0.071702,-0.338690,0.071702,0.759640,0.984057,0.855218,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
167,-0.885518,-1.394690,0.333474,-1.147656,0.973304,1.090872,-0.973304,-1.040850,-1.251039,-1.254991,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
168,-1.034864,-1.240243,0.025404,-1.147656,0.839931,0.128275,-0.839931,-0.784169,0.161983,-1.489564,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [76]:
y = df[['price']]
X = df.drop(columns='price')


Xtv, Xtest, ytv, ytest = train_test_split(X, y, test_size=.15, random_state=0)

print(Xtv.shape, ': dimensión de datos de entrada "X" para entrenamiento y validación')
print(Xtest.shape, ': dimensión de datos de entrada "X" para prueba')  

print(ytv.shape, ': dimensión de variable de salida  "Y" para  entrenamiento y validación')
print(ytest.shape, ': dimensión de variable de salida "Y" para prueba')

(170, 28) : dimensión de datos de entrada "X" para entrenamiento y validación
(31, 28) : dimensión de datos de entrada "X" para prueba
(170, 1) : dimensión de variable de salida  "Y" para  entrenamiento y validación
(31, 1) : dimensión de variable de salida "Y" para prueba


In [77]:
import warnings
warnings.filterwarnings("ignore")

pipeline = Pipeline([("preprocessor", preprocessor), ("model", MLPRegressor())])

#The average R^2 on the test data for each of the two folds.
scores = cross_val_score(pipeline, X, y, cv=3)
scores

array([-0.15078643, -0.16245289,  0.09034091])

In [78]:
pipeline = Pipeline([("preprocessor", preprocessor), ("model", MLPRegressor())])

yhat = cross_val_predict(pipeline, X, y, cv=3)
yhat


array([11795.43497811, 11795.43497811, 11799.95128781, 13021.10034569,
       13038.3198307 , 12959.42383951, 13031.61775056, 12967.46037074,
       13039.05306164, 13772.74768633, 13768.56930941, 10184.79960682,
       10183.57840135, 10078.01915759, 12746.32718794, 12747.95159426,
       12753.86185412, 11946.66132894, 12636.21796052, 12604.49940435,
       12902.73730444, 12907.14505705, 12928.04824348, 12959.93569801,
       12960.52580458, 12960.52580458, 12980.90702937, 11768.29877608,
       11848.46591629, 11302.66648167, 14094.07038615, 12867.1798897 ,
       14035.03404237, 14035.16829292, 14054.97894915, 13999.64428469,
       13596.79932023, 13597.14038933, 13561.07050683, 13556.31591498,
       13568.71333262, 12909.05273456, 11315.41633756, 11790.74257025,
       11282.80437011, 11241.80925042, 11827.3729816 , 11735.07174526,
       11732.44940713, 11732.49315569, 11749.7879245 , 11750.22331794,
       14146.07013941, 14146.07013941, 14146.09949458, 14156.18146101,
      

In [85]:
parameters = [{
                'alpha':[0.01, 0.1, 1, 10, 100, 1000, 10000],
                'normalize':[True, False]
                }]

RR= Ridge()

grid = GridSearchCV(RR, parameters,cv=5)

grid.fit(X[['horsepower','curb-weight','engine-size','highway-mpg']],y)

grid.best_estimator_

scores = grid.cv_results_
scores['mean_test_score']

array([ 0.55101275,  0.54655068,  0.57425939,  0.54655122,  0.61248279,
        0.54655655,  0.27970009,  0.54660953, -0.05089277,  0.54710732,
       -0.10357156,  0.55012751, -0.10913095,  0.55329033])

In [84]:
grid.best_estimator_

In [86]:
grid.best_params_

{'alpha': 1, 'normalize': True}