In [263]:
import pandas as pd
import numpy as np

col_name = ['symboling','normalized-losses','make','fuel-type','aspiration','num-of-doors',
            'body-style','drive-wheels','engine-location','wheel-base','length','width','height',
            'curb-weight','engine-type','num-of-cylinders','engine-size','fuel-system','bore','stroke','compression-ratio',
            'horsepower','peak-rpm','city-mpg','highway-mpg','price']

data = pd.read_csv('imports-85.data', names=col_name)

data = data.replace('?', np.nan)

data[['bore','stroke','horsepower','peak-rpm','normalized-losses']] = data[['bore','stroke','horsepower','peak-rpm','normalized-losses']].astype(float)
data.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450


In [264]:
data.dtypes

symboling              int64
normalized-losses    float64
make                  object
fuel-type             object
aspiration            object
num-of-doors          object
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders      object
engine-size            int64
fuel-system           object
bore                 float64
stroke               float64
compression-ratio    float64
horsepower           float64
peak-rpm             float64
city-mpg               int64
highway-mpg            int64
price                 object
dtype: object

In [265]:
data.dropna(subset=['price'], inplace=True)
X_full = data.drop('price', axis=1)
y = data.price
# y.dropna(inplace=True)

In [266]:
num_col = [cname for cname in X_full.columns if X_full[cname].dtypes in ['int64','float64']]
cate_col = [cname for cname in X_full.columns if X_full[cname].dtypes == 'object']
# binary_col = [cname for cname in X_full.columns if X_full[cname].dtypes == 'object' and X_full[cname].nunique() == 2]

full_col = num_col + cate_col

In [267]:
X_full[cate_col].nunique()

make                22
fuel-type            2
aspiration           2
num-of-doors         2
body-style           5
drive-wheels         3
engine-location      2
engine-type          6
num-of-cylinders     7
fuel-system          8
dtype: int64

In [268]:
y.isnull().sum()

0

In [269]:
X_full.isnull().sum()

symboling             0
normalized-losses    37
make                  0
fuel-type             0
aspiration            0
num-of-doors          2
body-style            0
drive-wheels          0
engine-location       0
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-type           0
num-of-cylinders      0
engine-size           0
fuel-system           0
bore                  4
stroke                4
compression-ratio     0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
dtype: int64

In [270]:
X_full = X_full[full_col]

In [271]:

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

cate_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('oneHot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])
# binary_transformer = Pipeline(steps=[('Label', LabelEncoder())])
num_transformer = Pipeline(steps=[
    ('num', SimpleImputer(strategy='mean')),
    ('feature',StandardScaler())
    ])


processor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_col),
    ('cat', cate_transformer, cate_col),
])


X = pd.DataFrame(processor.fit_transform(X_full[full_col]), columns=processor.get_feature_names_out(full_col))



In [272]:
X

Unnamed: 0,num__symboling,num__normalized-losses,num__wheel-base,num__length,num__width,num__height,num__curb-weight,num__engine-size,num__bore,num__stroke,...,cat__num-of-cylinders_twelve,cat__num-of-cylinders_two,cat__fuel-system_1bbl,cat__fuel-system_2bbl,cat__fuel-system_4bbl,cat__fuel-system_idi,cat__fuel-system_mfi,cat__fuel-system_mpfi,cat__fuel-system_spdi,cat__fuel-system_spfi
0,1.725050,0.000000,-1.685107,-0.439409,-0.853460,-2.034081,-0.014858,0.075389,0.520894,-1.829927,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.725050,0.000000,-1.685107,-0.439409,-0.853460,-2.034081,-0.014858,0.075389,0.520894,-1.829927,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.127193,0.000000,-0.710103,-0.244152,-0.185597,-0.559713,0.518080,0.606234,-2.433435,0.675938,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.926121,1.315931,0.165748,0.195176,0.148335,0.218425,-0.423766,-0.431327,-0.526210,0.453899,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.926121,1.315931,0.099646,0.195176,0.243744,0.218425,0.520017,0.220165,-0.526210,0.453899,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,-1.470664,-0.845956,1.702619,1.187733,1.436357,0.709881,0.768075,0.340812,1.680188,-0.339096,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
197,-1.470664,-0.845956,1.702619,1.187733,1.388653,0.709881,0.956057,0.340812,1.680188,-0.339096,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
198,-1.470664,-0.845956,1.702619,1.187733,1.436357,0.709881,0.884353,1.112950,0.932257,-1.227251,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
199,-1.470664,-0.845956,1.702619,1.187733,1.436357,0.709881,1.281633,0.437329,-1.199348,0.453899,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [273]:
print(X.isnull().sum())

num__symboling            0
num__normalized-losses    0
num__wheel-base           0
num__length               0
num__width                0
                         ..
cat__fuel-system_idi      0
cat__fuel-system_mfi      0
cat__fuel-system_mpfi     0
cat__fuel-system_spdi     0
cat__fuel-system_spfi     0
Length: 74, dtype: int64


In [274]:
from sklearn.metrics import mean_absolute_error
def getMea(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    
    print(np.concatenate((pred.reshape(len(pred), 1),y_test.values.reshape(len(y_test), 1) ), 1))
    
    return mean_absolute_error(y_test, pred)

In [275]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=0, test_size=0.2)

In [276]:
from sklearn.linear_model import LinearRegression
model1 = LinearRegression()
print("MEA:",getMea(model1, X_train, X_test, y_train, y_test))

[[5511.0 '6295']
 [10112.0 '10698']
 [16751.0 '13860']
 [14801.0 '13499']
 [17449.0 '15750']
 [10447.0 '8495']
 [17323.0 '15250']
 [6529.0 '5348']
 [23179.0 '21105']
 [7317.0 '6938']
 [10153.0 '11245']
 [38041.0 '37028']
 [9039.0 '7995']
 [5395.0 '7898']
 [13459.0 '14869']
 [21305.0 '18920']
 [8919.0 '7129']
 [29509.0 '15040']
 [9877.0 '9095']
 [6471.0 '6189']
 [9809.0 '9495']
 [11337.0 '11694']
 [491085763155517.0 '35550']
 [7581.0 '8058']
 [7790.0 '10795']
 [34636.0 '32528']
 [9773.0 '7975']
 [14233.0 '11595']
 [1019412645362267.0 '22018']
 [491085763155115.0 '32250']
 [36307.0 '36880']
 [-492559268390849.0 '15645']
 [4775.0 '7898']
 [17137.0 '17075']
 [10405.0 '7957']
 [13517.0 '12290']
 [15061.0 '12170']
 [17519.0 '17450']
 [9069.0 '8189']
 [13879.0 '12440']
 [5979.0 '5118']]
MEA: 60832766830622.83


In [277]:
from sklearn.tree import DecisionTreeRegressor
model2 = DecisionTreeRegressor(random_state=0)
print("MEA:",getMea(model2, X_train, X_test, y_train, y_test))


[[6575.0 '6295']
 [9258.0 '10698']
 [17425.0 '13860']
 [14399.0 '13499']
 [15690.0 '15750']
 [9549.0 '8495']
 [12945.0 '15250']
 [6338.0 '5348']
 [20970.0 '21105']
 [7198.0 '6938']
 [9370.0 '11245']
 [34028.0 '37028']
 [7775.0 '7995']
 [7775.0 '7898']
 [14489.0 '14869']
 [17710.0 '18920']
 [6529.0 '7129']
 [11850.0 '15040']
 [7895.0 '9095']
 [6229.0 '6189']
 [7775.0 '9495']
 [15510.0 '11694']
 [41315.0 '35550']
 [9258.0 '8058']
 [7126.0 '10795']
 [34028.0 '32528']
 [8195.0 '7975']
 [9980.0 '11595']
 [13295.0 '22018']
 [41315.0 '32250']
 [41315.0 '36880']
 [13645.0 '15645']
 [6918.0 '7898']
 [17425.0 '17075']
 [8558.0 '7957']
 [8778.0 '12290']
 [15510.0 '12170']
 [12964.0 '17450']
 [9279.0 '8189']
 [16695.0 '12440']
 [7053.0 '5118']]
MEA: 2124.487804878049


In [278]:
from sklearn.svm import SVR
model3 = SVR(kernel='rbf')
print("MEA:",getMea(model3, X_train, X_test, y_train, y_test))


[[9962.006117356292 '6295']
 [9988.329645259499 '10698']
 [10000.30213739626 '13860']
 [10003.678726013299 '13499']
 [10003.47806951749 '15750']
 [9976.107623879181 '8495']
 [9991.292945504421 '15250']
 [9962.710392659654 '5348']
 [9996.056783956094 '21105']
 [9964.826091881274 '6938']
 [9980.082115843648 '11245']
 [9996.544466325298 '37028']
 [9980.241315711548 '7995']
 [9979.475944799238 '7898']
 [9994.767808543294 '14869']
 [10000.9586828284 '18920']
 [9967.032679178268 '7129']
 [9989.785314661183 '15040']
 [9972.649798949926 '9095']
 [9964.172876340677 '6189']
 [9982.28362868852 '9495']
 [9990.94886774435 '11694']
 [9999.162135206594 '35550']
 [9971.125454034218 '8058']
 [9984.483171835147 '10795']
 [9996.430063243395 '32528']
 [9975.704924761776 '7975']
 [9977.488026633191 '11595']
 [9997.17443387529 '22018']
 [9999.040671196524 '32250']
 [10006.58630594265 '36880']
 [9992.502267679345 '15645']
 [9974.896859359638 '7898']
 [10000.299151063877 '17075']
 [9972.883824307453 '7957']
 

In [279]:
from sklearn.ensemble import RandomForestRegressor
model4 = RandomForestRegressor()
print("MEA:",getMea(model4, X_train, X_test, y_train, y_test))

[[5762.73 '6295']
 [9905.31 '10698']
 [18586.12 '13860']
 [15234.29 '13499']
 [16962.42 '15750']
 [8588.7 '8495']
 [12612.971666666668 '15250']
 [6383.84 '5348']
 [19796.78 '21105']
 [7657.6 '6938']
 [9471.201666666666 '11245']
 [33085.06 '37028']
 [7785.2 '7995']
 [7836.31 '7898']
 [14063.39 '14869']
 [18577.63 '18920']
 [6667.483333333334 '7129']
 [14386.345 '15040']
 [8421.965 '9095']
 [6395.01 '6189']
 [9032.293333333333 '9495']
 [12442.416666666664 '11694']
 [38497.22 '35550']
 [8262.29 '8058']
 [9577.01 '10795']
 [32861.88 '32528']
 [8337.715 '7975']
 [9045.52 '11595']
 [14837.16 '22018']
 [38487.78 '32250']
 [36735.9 '36880']
 [13044.21 '15645']
 [7569.41 '7898']
 [18586.12 '17075']
 [8291.55 '7957']
 [11726.436666666668 '12290']
 [14743.86 '12170']
 [15398.86 '17450']
 [8506.828333333333 '8189']
 [16804.31 '12440']
 [7150.905 '5118']]
MEA: 1536.3823983739837
