In [80]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from patsy import dmatrices, dmatrix, build_design_matrices
from sklearn import linear_model
from sklearn.model_selection import train_test_split
%matplotlib inline

# Set up

In [45]:
# Load the headerless imports-85 file, naming the 26 columns in file order,
# then keep only rows usable for modelling price from horsepower.
df = (
    pd.read_csv(
        "data/imports-85.data",
        header=None,
        names=[
            'symbol', 'normalized_losses', 'make', 'fuel_type',
            'aspiration', 'num_doors', 'body_style', 'drive_wheels',
            'engine_location', 'wheel_base', 'length', 'width',
            'height', 'curb_weight', 'engine_type', 'num_cylinders',
            'engine_size', 'fuel_system', 'bore', 'stroke',
            'compression_ratio', 'horsepower', 'peak_rpm', 'city_mpg',
            'highway_mpg', 'price',
        ],
    )
    # '?' is the file's placeholder for a missing value; drop rows where
    # either price or horsepower is missing.
    .query("(price != '?') & (horsepower != '?')")
    # With the placeholder rows gone, both columns can be cast to float.
    .astype({'price': 'float', 'horsepower': 'float'})
)

In [46]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,symbol,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000,21,27,13495.0
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000,21,27,16500.0
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000,19,26,16500.0
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500,24,30,13950.0
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500,18,22,17450.0


# Basic use

* One-hot encode `make`
* `engine_size` (no transform)
* log of `horsepower`
* mean of `city_mpg` and `highway_mpg`
* no intercept

In [57]:
# Patsy formula: response on the left of "~", predictors on the right.
#   make                              -> expanded into one indicator column per make
#   engine_size                       -> used as-is
#   np.log(horsepower)                -> log-transformed
#   I((city_mpg + highway_mpg)/2)     -> arithmetic evaluated literally (mean mpg)
#   -1                                -> drop the intercept column
formula = "price ~ make + engine_size + np.log(horsepower) + I((city_mpg + highway_mpg)/2) -1"

# dmatrices returns the (response, predictors) pair; ask for DataFrames so
# the column labels are kept.
y, X = dmatrices(
    formula,
    data=df,
    return_type="dataframe",
)

In [50]:
# Preview the first five rows of the design matrix.
X.head(n=5)

Unnamed: 0,make[alfa-romero],make[audi],make[bmw],make[chevrolet],make[dodge],make[honda],make[isuzu],make[jaguar],make[mazda],make[mercedes-benz],...,make[plymouth],make[porsche],make[saab],make[subaru],make[toyota],make[volkswagen],make[volvo],engine_size,np.log(horsepower),I((city_mpg + highway_mpg) / 2)
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,130.0,4.70953,24.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,130.0,4.70953,24.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,152.0,5.036953,22.5
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,109.0,4.624973,27.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,136.0,4.744932,20.0


In [51]:
# y comes back from patsy as a single-column DataFrame, but LassoCV expects a
# 1-D target; flatten it so sklearn does not have to convert the column
# vector itself (which emits a DataConversionWarning).
reg = linear_model.LassoCV(cv=5).fit(X, np.ravel(y))

  y = column_or_1d(y, warn=True)


# Stateful transforms

Useful for fitting a transformation on the training data and then applying the same fitted transformation to the test set.

In [90]:
# Separate the response from the predictors.
# y keeps a DataFrame shape (double brackets) so it can be passed to patsy by
# column name later on.
y = df[['price']]
X = df.drop('price', axis=1)

Apply the same transformations as above, but only to X.

In [93]:
# Right-hand side only (no "price ~"), so dmatrix builds just the predictor
# design matrix with the same terms as the basic example.
formula = "make + engine_size + np.log(horsepower) + I((city_mpg + highway_mpg)/2) -1"

# Build from the raw predictors in df rather than from X itself: X is
# overwritten here, so reading from X would make the cell fail when re-run
# (the transformed X no longer has the raw columns the formula refers to).
X = dmatrix(formula, df.drop('price', axis=1), return_type='dataframe')

In [95]:
# Hold out 20% of the rows for testing. The seed is fixed so the split (and
# every result derived from it) is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

Center price based on the training data, then apply the same transformation to the test data.

In [96]:
# Fit the centering on the training response only; the resulting DataFrame
# carries a design_info describing the fitted transform.
y_train = dmatrix("center(price)-1", y_train, return_type='dataframe')

# Re-apply the fitted transform to the test response.
# build_design_matrices returns a list with one matrix per design_info, so
# take the single element, and ask for a DataFrame so y_test has the same
# type as y_train instead of being a list holding a plain matrix.
y_test = build_design_matrices(
    [y_train.design_info], y_test, return_type='dataframe'
)[0]

In [97]:
# y_train is a single-column frame, but LassoCV expects a 1-D target; flatten
# it so sklearn does not have to convert the column vector itself (which
# emits a DataConversionWarning).
reg = linear_model.LassoCV(cv=5).fit(X_train, np.ravel(y_train))

  y = column_or_1d(y, warn=True)
