# Bonus

👇 Consider the following dataset

In [403]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


# Load the car dataset from the relative path "data.csv".
data = pd.read_csv("data.csv")

# Preview the first rows to check that the file was parsed as expected.
data.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


👇 Build an optimal pipeline to predict the price of cars from their specifications. Once your pipeline is ready, use `permutation_importance` to find out which feature is the most informative about the car price.

In [389]:
# Summarise column names, non-null counts and dtypes to spot missing values
# and to separate the text (object) columns from the numeric ones.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CarName           205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    object 
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight         205 non-null    float64
 13  curbweight        205 non-null    int64  
 14  enginetype        205 non-null    object 
 15  cylindernumber    205 non-null    object 
 16  enginesize        205 non-null    int64  
 1

In [390]:
def brands_encoding(data):
    """Translate full car names into integer brand codes.

    The brand is the first whitespace-separated word of each name, compared
    in lower case. Spelling variants that share a code in the lookup table
    (for example 'maxda' and 'mazda') end up with the same value.

    Parameters
    ----------
    data : pandas.Series
        Car names such as ``'audi 100 ls'`` (the ``CarName`` column).

    Returns
    -------
    pandas.Series
        One integer code per name, on the same index as ``data``.

    Raises
    ------
    KeyError
        If the first word of a name is not present in the lookup table.
    """
    # code -> every spelling that should receive that code
    spellings_by_code = {
        0: ('alfa-romero',),
        1: ('audi',),
        2: ('bmw',),
        3: ('chevrolet',),
        4: ('dodge',),
        5: ('honda',),
        6: ('isuzu',),
        7: ('jaguar',),
        8: ('maxda', 'mazda'),
        10: ('buick',),
        11: ('mercury',),
        12: ('mitsubishi',),
        13: ('nissan',),
        14: ('peugeot',),
        15: ('plymouth',),
        16: ('porsche', 'porcshce'),
        18: ('renault',),
        19: ('saab',),
        20: ('subaru',),
        21: ('toyota', 'toyouta'),
        23: ('vokswagen', 'volkswagen', 'vw'),
        26: ('volvo',),
    }
    # Invert into the spelling -> code lookup used per row.
    code_by_brand = {
        spelling: code
        for code, spellings in spellings_by_code.items()
        for spelling in spellings
    }

    def _brand_code(car_name):
        first_word = car_name.split()[0]
        return code_by_brand[first_word.lower()]

    return data.map(_brand_code)

In [391]:
def categorical_encoding(data):
    """Replace the categorical text columns with fixed integer codes.

    Each column listed in the lookup table below is mapped value by value to
    the integer given for it. Most columns are numbered from zero in the order
    written in the table; ``doornumber`` is mapped to the door count (2 or 4).
    Values that are not in a column's table become NaN. Columns that are not
    listed are returned unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column named in the lookup table.

    Returns
    -------
    pandas.DataFrame
        A new frame with the listed columns encoded; ``data`` itself is not
        modified.

    Raises
    ------
    KeyError
        If one of the listed columns is missing from ``data``.
    """

    encoding = {'fueltype': {'gas': 0, 'diesel': 1},
                 'aspiration': {'std': 0, 'turbo': 1},
                 'doornumber': {'two': 2, 'four': 4},
                 'carbody': {'convertible': 0, 'hatchback': 1, 'sedan': 2, 'wagon': 3, 'hardtop': 4},
                 'drivewheel': {'rwd': 0, 'fwd': 1, '4wd': 2},
                 'enginelocation': {'front': 0, 'rear': 1},
                 'fuelsystem': {'mpfi': 0, '2bbl': 1 , 'mfi': 2, '1bbl': 3, 'spfi': 4, '4bbl': 5, 'idi': 6, 'spdi': 7},
                 'enginetype': {'dohc': 0, 'ohcv': 1, 'ohc': 2, 'l': 3, 'rotor': 4, 'ohcf': 5, 'dohcv': 6},
                 'cylindernumber': {'four': 0, 'six': 1, 'five': 2, 'three': 3, 'twelve': 4, 'two': 5, 'eight':6}
                }

    # Work on a copy so the caller's frame keeps its original text values.
    encoded = data.copy()

    # Walk the lookup table directly instead of indexing the frame with the
    # dict (``data[encoding]``), which pandas >= 2.0 rejects with a TypeError.
    for column, codes in encoding.items():
        encoded[column] = encoded[column].map(codes)

    return encoded

In [392]:
def numerical_transformer(data):
    """Impute and standardise the numeric feature columns.

    The columns listed in ``numerical`` are passed through a two-step
    scikit-learn pipeline: ``SimpleImputer`` (default settings) followed by
    ``StandardScaler``. All other columns are returned unchanged.

    NOTE: the pipeline is fitted on every row of ``data``, so the imputation
    and scaling statistics are learned from whatever is passed in. If the
    result is later split for validation, those statistics have already seen
    the validation rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column named in ``numerical``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the numeric columns replaced by their transformed
        values; ``data`` itself is not modified.
    """
    numerical = ['wheelbase','carlength', 'carwidth', 'carheight', 'curbweight', 'enginesize',
                 'boreratio', 'stroke','compressionratio', 'horsepower', 'peakrpm', 'citympg',
                 'highwaympg']

    num_transformer = Pipeline([
        ('imputer', SimpleImputer()),
        ('scaler', StandardScaler())
    ])

    # Copy first so the caller's frame keeps its unscaled values.
    transformed = data.copy()
    # fit_transform learns the statistics and applies them in one call.
    transformed[numerical] = num_transformer.fit_transform(transformed[numerical])

    return transformed

In [393]:
def pipe(data):
    """Turn the raw car table into an all-numeric table ready for modelling.

    Steps, in order:
      1. drop fully duplicated rows;
      2. add a ``brands`` column derived from ``CarName``;
      3. drop the ``car_ID`` and ``CarName`` columns;
      4. integer-encode the categorical columns;
      5. impute and scale the numeric columns.

    Columns not touched by these steps are passed through as they are.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table containing at least ``car_ID``, ``CarName`` and the columns
        expected by ``categorical_encoding`` and ``numerical_transformer``.

    Returns
    -------
    pandas.DataFrame
        The processed table. The ``data`` argument is not modified.
    """
    # drop_duplicates returns a new frame; the result has to be kept,
    # otherwise the call has no effect.
    deduplicated = data.drop_duplicates()

    # assign() adds the column on a new frame, so the caller's raw table does
    # not silently gain a 'brands' column.
    with_brands = deduplicated.assign(brands=brands_encoding(deduplicated.CarName))

    encoded = categorical_encoding(with_brands.drop(columns=['car_ID', 'CarName']))

    return numerical_transformer(encoded)

In [394]:
# Run the preprocessing function on the raw frame to get the modelling table.
df = pipe(data)

In [395]:
# Features: every column except the target.
X = df.drop(columns='price')
# Target, kept as a one-column DataFrame (double brackets) rather than a Series.
y = df[['price']]

In [396]:
# Display the feature matrix to check the encoded and scaled columns.
X

Unnamed: 0,symboling,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,brands
0,3,0,0,2,0,0,0,-1.690772,-0.426521,-0.844782,...,0.074449,0,0.519071,-1.839377,-0.288349,0.174483,-0.262960,-0.646553,-0.546059,0
1,3,0,0,2,0,0,0,-1.690772,-0.426521,-0.844782,...,0.074449,0,0.519071,-1.839377,-0.288349,0.174483,-0.262960,-0.646553,-0.546059,0
2,1,0,0,2,1,0,0,-0.708596,-0.231513,-0.190566,...,0.604046,0,-2.404880,0.685946,-0.288349,1.264536,-0.262960,-0.953012,-0.691627,0
3,2,0,0,4,2,1,0,0.173698,0.207256,0.136542,...,-0.431076,0,-0.517266,0.462183,-0.035973,-0.053668,0.787855,-0.186865,-0.109354,1
4,2,0,0,4,2,2,0,0.107110,0.207256,0.230001,...,0.218885,0,-0.517266,0.462183,-0.540725,0.275883,0.787855,-1.106241,-1.273900,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,0,0,4,2,0,0,1.721873,1.198549,1.398245,...,0.339248,0,1.666445,-0.336970,-0.162161,0.250533,0.577692,-0.340094,-0.400490,26
201,-1,0,1,4,2,0,0,1.721873,1.198549,1.351515,...,0.339248,0,1.666445,-0.336970,-0.364062,1.416637,0.367529,-0.953012,-0.837195,26
202,-1,0,0,4,2,0,0,1.721873,1.198549,1.398245,...,1.109571,0,0.926204,-1.232021,-0.338824,0.757535,0.787855,-1.106241,-1.128332,26
203,-1,1,1,4,2,0,0,1.721873,1.198549,1.398245,...,0.435538,6,-1.183483,0.462183,3.244916,0.047732,-0.683286,0.119594,-0.546059,26


In [397]:
# Display the target column.
y

Unnamed: 0,price
0,13495.0
1,16500.0
2,16500.0
3,13950.0
4,17450.0
...,...
200,16845.0
201,19045.0
202,21485.0
203,22470.0


In [406]:
# Baseline estimator: ordinary least-squares linear regression.
model = LinearRegression()

# 5-fold cross-validation scored with R²; the result is a dict whose
# 'test_score' entry holds one score per fold.
score = cross_validate(model, X, y, cv = 5, scoring='r2')

In [408]:
# Average the per-fold test scores into a single number.
score['test_score'].mean()

0.5433424699102941

In [409]:
model = LinearRegression().fit(X, y) # Fit model

# Permutation importance shuffles one feature at a time and records how much
# the model's score drops. A fixed seed makes the shuffles, and therefore the
# resulting ranking, reproducible from one run to the next.
permutation_score = permutation_importance(model, X, y, n_repeats=10, random_state=0) # Perform Permutation

# Build the table column by column so that 'score decrease' stays a float
# column instead of passing through a mixed object array.
importance_df = pd.DataFrame({
    'feature': X.columns,
    'score decrease': permutation_score.importances_mean,
})

importance_df.sort_values(by="score decrease", ascending = False) # Order by importance

Unnamed: 0,feature,score decrease
14,enginesize,0.510449
6,enginelocation,0.078132
22,highwaympg,0.072534
13,cylindernumber,0.056531
11,curbweight,0.051311
21,citympg,0.045438
1,fueltype,0.036507
23,brands,0.025724
2,aspiration,0.025664
20,peakrpm,0.019551
