In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the cars dataset and discard every row that has a missing value.
# NOTE(review): the preview below shows an 'Unnamed: 0' column that looks like
# a saved row index; none of the cells shown here removes it, so it ends up
# among the model features -- confirm whether that is intended.
df = pd.read_csv('./data/cars_data.csv').dropna()
df.head()

Unnamed: 0,Make,Model,Type,Origin,DriveTrain,MSRP,Invoice,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length
0,Acura,MDX,SUV,Asia,All,"$36,945","$33,337",3.5,6.0,265,17,23,4451,106,189
1,Acura,RSX Type S 2dr,Sedan,Asia,Front,"$23,820","$21,761",2.0,4.0,200,24,31,2778,101,172
2,Acura,TSX 4dr,Sedan,Asia,Front,"$26,990","$24,647",2.4,4.0,200,22,29,3230,105,183
3,Acura,TL 4dr,Sedan,Asia,Front,"$33,195","$30,299",3.2,6.0,270,20,28,3575,108,186
4,Acura,3.5 RL 4dr,Sedan,Asia,Front,"$43,755","$39,014",3.5,6.0,225,18,24,3880,115,197


In [3]:
# Separate the prediction target (the MSRP column) from the candidate features.
X = df.drop('MSRP', axis='columns')
y = df['MSRP']

In [4]:
# Build the feature frame without the target and without 'Invoice'
# (presumably excluded because it is a second price column closely tied to
# MSRP -- confirm). Deriving X from df, rather than from X itself, keeps this
# cell safe to re-run: dropping 'Invoice' from an X that no longer has it
# would raise KeyError.
X = df.drop(columns=['MSRP', 'Invoice'])

In [5]:
# Strip the currency formatting ('$' and the thousands separator ',') in one
# regex pass, then convert the price strings to integers. The pattern is a
# raw-string character class, so '$' needs no backslash escape (a '\$' inside
# a normal string literal is an invalid escape sequence and triggers a
# warning on current Python versions).
y = y.replace(to_replace=r'[$,]', value='', regex=True).astype('int64')

In [6]:
# One-hot encode the listed categorical columns; all other columns are left
# as they are.
categorical_cols = ['Make', 'Model', 'Type', 'Origin', 'DriveTrain']
X = pd.get_dummies(X, columns=categorical_cols)

In [7]:
# Sanity check: the feature matrix and the target must have the same number
# of rows before they are split and fed to the estimators.
X.shape, y.shape

((426, 481), (426,))

In [8]:
def defaultwork(est, X=X, y=y, *, test_size=0.3, random_state=1):
    """Fit ``est`` on a train split and print its train and test scores.

    Parameters
    ----------
    est : estimator
        Object exposing ``fit(X, y)`` and ``score(X, y)``. It is fitted in
        place.
    X, y : features and target
        Data to split. The defaults are the notebook-level ``X`` and ``y`` as
        they exist when this cell is executed: default values are evaluated
        once, at definition time. Re-run this cell after changing them, or
        pass them explicitly.
    test_size : float or int, keyword-only
        Forwarded to ``train_test_split``; defaults to 0.3.
    random_state : int, keyword-only
        Seed forwarded to ``train_test_split`` so the split is reproducible;
        defaults to 1.

    Returns
    -------
    None
        The two scores are printed, not returned. The value is whatever
        ``est.score`` reports (R^2 for scikit-learn regressors).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    est.fit(X_train, y_train)
    # A large gap between these two numbers indicates overfitting.
    print('Train Score: ',est.score(X_train, y_train))
    print('Test Score: ',est.score(X_test, y_test))


In [9]:
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor

In [10]:
# Baseline: bagging ensemble with default hyperparameters and a fixed seed.
bagging_model = BaggingRegressor(random_state=1)
defaultwork(bagging_model)

Train Score:  0.9567015457407546
Test Score:  0.7949163237522345


In [11]:
# Baseline: random forest with default hyperparameters and a fixed seed.
forest_model = RandomForestRegressor(random_state=1)
defaultwork(forest_model)

Train Score:  0.978182315656497
Test Score:  0.8369440882741959


In [12]:
# Baseline: gradient boosting with default hyperparameters and a fixed seed.
gbr_model = GradientBoostingRegressor(random_state=1)
defaultwork(gbr_model)

Train Score:  0.9841884742397355
Test Score:  0.8270485242007507


In [13]:
# Baseline: AdaBoost with default hyperparameters and a fixed seed.
ada_model = AdaBoostRegressor(random_state=1)
defaultwork(ada_model)

Train Score:  0.9008410947504886
Test Score:  0.711903287964231


In [14]:
from sklearn.model_selection import GridSearchCV

In [15]:
# Hold out 30% of the rows for evaluating the tuned model. The split uses the
# same test_size and seed as defaultwork, so the scores are comparable.
split = train_test_split(X, y, test_size=0.3, random_state=1)
X_train, X_test, y_train, y_test = split

In [16]:
# Exhaustive 4-fold cross-validated search over AdaBoost's number of
# estimators and learning rate (3 x 3 = 9 candidate settings).
ada_param_grid = {
    'n_estimators': [100, 500, 1000],
    'learning_rate': [0.5, 1, 2],
}
gcv = GridSearchCV(
    estimator=AdaBoostRegressor(random_state=1),
    param_grid=ada_param_grid,
    cv=4,
)

In [17]:
# Run the search on the training split only; the test split stays unseen
# until the final scoring step.
gcv.fit(X=X_train, y=y_train)

In [18]:
# Score the search's selected model on the held-out test split.
gcv.score(X=X_test, y=y_test)

0.7130800769863186

In [19]:
# Show the hyperparameter combination the grid search selected.
# NOTE(review): in the recorded output the chosen n_estimators is the largest
# value in the grid, so the optimum may lie beyond the searched range.
gcv.best_params_

{'learning_rate': 1, 'n_estimators': 1000}