In [2]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

### California House Pricing Dataset


In [3]:
from sklearn.datasets import fetch_california_housing
california_df = fetch_california_housing()

In [4]:
df = pd.DataFrame(california_df.data,columns=california_df.feature_names)
df['Target'] = california_df.target

print(df)

## Taking Sample Data
# df = df.sample(frac=0.25) 

#independent features
X = df.iloc[:,:-1]
#dependent features
y = df.iloc[:,-1]

       MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0      8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1      8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2      7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3      5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4      3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   
...       ...       ...       ...        ...         ...       ...       ...   
20635  1.5603      25.0  5.045455   1.133333       845.0  2.560606     39.48   
20636  2.5568      18.0  6.114035   1.315789       356.0  3.122807     39.49   
20637  1.7000      17.0  5.205543   1.120092      1007.0  2.325635     39.43   
20638  1.8672      18.0  5.329513   1.171920       741.0  2.123209     39.43   
20639  2.3886      16.0  5.254717   1.162264      1387.0  2.616981     39.37   

       Longitude  Target  
0        -12

In [5]:
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [6]:
### train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [7]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor()
regressor.fit(X_train, y_train)

In [9]:
y_pred = regressor.predict(X_test)
y_pred

array([0.565  , 0.521  , 5.00001, ..., 1.33   , 1.389  , 5.00001])

In [8]:
y_test

20046    0.47700
3024     0.45800
15663    5.00001
20484    2.18600
9814     2.78000
          ...   
15316    1.66100
14772    0.93600
12870    1.07000
13476    1.30700
16123    3.07100
Name: Target, Length: 6812, dtype: float64

In [11]:
from sklearn.metrics import r2_score
score = r2_score(y_pred, y_test)
score

0.6013560250349212

In [13]:
## Hyperparameter Tunning
parameter = {
  'criterion':['squared_error','friedman_mse','absolute_error','poisson'],
  'splitter':['best','random'],
  'max_depth':[1,2,3,4,5,6,7,8,10,11,12],
  'max_features':['auto', 'sqrt', 'log2']
}
regressor = DecisionTreeRegressor()

In [14]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import GridSearchCV

regressorcv = GridSearchCV(regressor, param_grid=parameter, cv=2, scoring='neg_mean_squared_error')

In [15]:
regressorcv.fit(X_train, y_train)

In [None]:
regressorcv.best_params_

In [None]:
y_pred = regressorcv.predict(X_test)

In [None]:

r2_score(y_pred,y_test)