In [71]:
from sklearn.datasets import fetch_california_housing
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
import pandas as pd

In [72]:
dataset = fetch_california_housing()
X = dataset.data
y = dataset.target
print(pd.DataFrame(X,y))

            0     1         2         3       4         5      6       7
4.526  8.3252  41.0  6.984127  1.023810   322.0  2.555556  37.88 -122.23
3.585  8.3014  21.0  6.238137  0.971880  2401.0  2.109842  37.86 -122.22
3.521  7.2574  52.0  8.288136  1.073446   496.0  2.802260  37.85 -122.24
3.413  5.6431  52.0  5.817352  1.073059   558.0  2.547945  37.85 -122.25
3.422  3.8462  52.0  6.281853  1.081081   565.0  2.181467  37.85 -122.25
...       ...   ...       ...       ...     ...       ...    ...     ...
0.781  1.5603  25.0  5.045455  1.133333   845.0  2.560606  39.48 -121.09
0.771  2.5568  18.0  6.114035  1.315789   356.0  3.122807  39.49 -121.21
0.923  1.7000  17.0  5.205543  1.120092  1007.0  2.325635  39.43 -121.22
0.847  1.8672  18.0  5.329513  1.171920   741.0  2.123209  39.43 -121.32
0.894  2.3886  16.0  5.254717  1.162264  1387.0  2.616981  39.37 -121.24

[20640 rows x 8 columns]


In [73]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)


In [74]:
regressorTree = DecisionTreeRegressor()
regressor = LinearRegression()

In [75]:
param_grid = {
    'max_depth': [None, 3, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}
parameters = {
    'fit_intercept': [True, False],
    'normalize': [True, False]
}

In [81]:
clf = GridSearchCV(regressorTree,param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search = GridSearchCV(regressor, param_grid={}, cv=5, scoring='neg_mean_squared_error')

In [82]:
clf.fit(X_train,y_train)
grid_search.fit(X_train,y_train)

In [83]:
best_model = clf.best_estimator_
best_params = clf.best_params_
best_models_linear = grid_search.best_estimator_

In [84]:
print(best_params)
print(best_models_linear)

{'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 5}
LinearRegression()


In [85]:
y_pred = clf.predict(X_test)
y_pred_linear = grid_search.predict(X_test)

In [86]:
from sklearn.metrics import r2_score

In [87]:
r2 = r2_score(y_test,y_pred)
r2_linear = r2_score(y_test,y_pred_linear)

In [88]:
print("R squared error for decision tree: ",r2)
print("R squared error for linear regression: ",r2_linear)

R squared error for decision tree:  0.671844098767453
R squared error for linear regression:  0.5904023810375152
