# Data preprocessing and pipelines


In [1]:
# General imports
%matplotlib inline
import pandas as pd
import openml as oml
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,RandomForestClassifier,GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR,SVC
from tqdm.notebook import tqdm as tqdm

In [2]:
data = pd.read_csv("carprice.csv")
data.drop(columns=["car_ID", "symboling"], inplace=True)
X = data.drop(columns=["price"])
y = data["price"]

In [3]:
X.head()

Unnamed: 0,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,...,cylindernumber,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg
0,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27
1,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27
2,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,...,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26
3,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,...,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30
4,audi 100ls,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,...,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22


In [4]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   CarName           205 non-null    object 
 1   fueltype          205 non-null    object 
 2   aspiration        205 non-null    object 
 3   doornumber        205 non-null    object 
 4   carbody           205 non-null    object 
 5   drivewheel        205 non-null    object 
 6   enginelocation    205 non-null    object 
 7   wheelbase         205 non-null    float64
 8   carlength         205 non-null    float64
 9   carwidth          205 non-null    float64
 10  carheight         205 non-null    float64
 11  curbweight        205 non-null    int64  
 12  enginetype        205 non-null    object 
 13  cylindernumber    205 non-null    object 
 14  enginesize        205 non-null    int64  
 15  fuelsystem        205 non-null    object 
 16  boreratio         205 non-null    float64
 1

In [5]:
y.head()

0    13495.0
1    16500.0
2    16500.0
3    13950.0
4    17450.0
Name: price, dtype: float64

## Exercise 1: Build a pipeline

Implement a function `build_pipeline` that does the following:
- Impute missing values by replacing NaN's with the feature median for numerical features.
- Encode the categorical features using OneHotEncoding.
- Scale the data using min-max scaling.
- Attach the given regression model to the end of the pipeline

In [6]:
def build_pipeline(regressor, categorical):
    cat_pipe = make_pipeline(OneHotEncoder(handle_unknown='ignore'))
    num_pipe = make_pipeline(SimpleImputer(strategy='mean'))
    num_pipe.steps.insert(1,["scaler", MinMaxScaler()]) 
    transform = make_column_transformer((cat_pipe, categorical), remainder=num_pipe)
    return Pipeline(steps=[('preprocess', transform), ('reg', regressor)])

## Exercise 2: Test the pipeline
Test the pipeline by evaluating linear regression (without scaling) on the dataset, using 5-fold cross-validation and $R^2$. 

In [7]:
categorical = X.select_dtypes(include=["object"]).columns
regressor = LinearRegression()
pipe = build_pipeline(regressor,categorical)
scores = cross_val_score(pipe, X, y)
print("Cross-validated R^2 score for {}: {:.2f}".format(regressor.__class__.__name__, scores.mean()))

Cross-validated R^2 score for LinearRegression: -0.27


## Exercise 3: A first benchmark
Evaluate the following algorithms in their default settings, both with and without scaling, and interpret the results:  
- Linear regression
- Ridge
- Lasso
- SVM (RBF)
- RandomForests
- GradientBoosting

In [8]:
### Model solution
models = [LinearRegression(), Ridge(), Lasso(), RandomForestRegressor(), GradientBoostingRegressor(), SVR()]
for m in tqdm(models): # nstantly make your loops show a smart progress meter
    pipe = build_pipeline(m,categorical)
    scores = cross_val_score(pipe, X, y)
    print("R^2 score for {}: {:.2f}".format(m.__class__.__name__, scores.mean()))
    pipe = build_pipeline(m,categorical)
    scores = cross_val_score(pipe, X, y)
    print("R^2 score for {} (scaled): {:.2f}".format(m.__class__.__name__, scores.mean()))

  0%|          | 0/6 [00:00<?, ?it/s]

R^2 score for LinearRegression: -0.27
R^2 score for LinearRegression (scaled): -0.27
R^2 score for Ridge: 0.40
R^2 score for Ridge (scaled): 0.40


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


R^2 score for Lasso: 0.36


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


R^2 score for Lasso (scaled): 0.36
R^2 score for RandomForestRegressor: 0.42
R^2 score for RandomForestRegressor (scaled): 0.39
R^2 score for GradientBoostingRegressor: 0.53
R^2 score for GradientBoostingRegressor (scaled): 0.53
R^2 score for SVR: -0.20
R^2 score for SVR (scaled): -0.20


## Exercise 4: Classification

1. Use the original dataset and convert the target columns to 0 (cheap) / 1 (expensive)
2. Re-evaluate the pipeline. What are the conclusions?

In [9]:
m = data["price"].mean()
data.loc[data["price"] < m, "price"] = 0
data.loc[data["price"] >= m, "price"] = 1
X = data.drop(columns=["price"])
y = data["price"]

In [10]:
### Model solution
models = [RandomForestClassifier(), GradientBoostingClassifier(), SVC()]
for m in tqdm(models): # nstantly make your loops show a smart progress meter
    pipe = build_pipeline(m,categorical)
    scores = cross_val_score(pipe, X, y)
    print("R^2 score for {}: {:.2f}".format(m.__class__.__name__, scores.mean()))
    pipe = build_pipeline(m,categorical)
    scores = cross_val_score(pipe, X, y)
    print("R^2 score for {} (scaled): {:.2f}".format(m.__class__.__name__, scores.mean()))

  0%|          | 0/3 [00:00<?, ?it/s]

R^2 score for RandomForestClassifier: 0.87
R^2 score for RandomForestClassifier (scaled): 0.88
R^2 score for GradientBoostingClassifier: 0.85
R^2 score for GradientBoostingClassifier (scaled): 0.85
R^2 score for SVC: 0.86
R^2 score for SVC (scaled): 0.86
