### Decision Tree Regressor Implementation

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
## California housing dataset.
## NOTE: despite the `_df` suffix, `california_df` is the object returned by
## fetch_california_housing(), not a DataFrame; later cells read its
## .data, .feature_names, .target and .DESCR attributes.
from sklearn.datasets import fetch_california_housing
california_df = fetch_california_housing()

In [3]:
## Target values as returned by the loader; features wrapped in a DataFrame
## so that every column carries its feature name.
y = california_df.target
X = pd.DataFrame(
    data=california_df.data,
    columns=california_df.feature_names,
)

In [4]:
## Alternative to the cell above: keep features and target together in a
## single DataFrame, with the target stored as the last column.
df = pd.DataFrame(california_df.data, columns=california_df.feature_names)
## Assign the target values directly. Wrapping them in a one-column DataFrame
## first only adds an extra allocation and an index-alignment step.
df['Target'] = california_df.target

In [5]:
## Preview the first rows of the combined features + Target frame.
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [6]:
## 'Target' was appended as the last column of df, so every column but the
## last is a feature and the last column is the label.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [7]:
## Preview the feature matrix (same columns as df, minus 'Target').
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [8]:
## Display the target Series (the 'Target' column of df); pandas truncates the
## middle of the output.
y

0        4.526
1        3.585
2        3.521
3        3.413
4        3.422
         ...  
20635    0.781
20636    0.771
20637    0.923
20638    0.847
20639    0.894
Name: Target, Length: 20640, dtype: float64

In [9]:
## Train / test split: hold out 25% of the rows for evaluation, with a fixed
## seed so the split is reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [10]:
## Sanity-check the split sizes: (rows, features) for X, (rows,) for y.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((15480, 8), (5160, 8), (15480,), (5160,))

In [11]:
## Print the description text bundled with the dataset (attributes, units, source).
print(california_df.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [12]:
## Create the baseline (untuned) Decision Tree regressor object.
from sklearn.tree import DecisionTreeRegressor
## random_state is fixed (same seed as the train/test split) so the fitted tree,
## and therefore every score reported below, is reproducible between runs.
regressor = DecisionTreeRegressor(random_state=42)

In [13]:
## Fit the untuned baseline tree on the training split.
regressor.fit(X_train,y_train)

In [34]:
## Predict on the held-out split with the baseline tree.
## NOTE: this needs the fit cell above to have run on this same `regressor`
## object. The hyperparameter-tuning cells further down rebind `regressor` to a
## fresh, unfitted instance, so running this cell after them (out of order)
## raises NotFittedError.
y_pred = regressor.predict(X_test)

NotFittedError: This DecisionTreeRegressor instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [35]:
from sklearn.metrics import accuracy_score,classification_report

In [36]:
## Predictions and ground truth must have the same length before scoring.
tuple(values.shape for values in (y_pred, y_test))

((5160,), (5160,))

In [37]:
## Inspect the predicted values for the test split.
y_pred

array([0.425  , 0.521  , 5.00001, ..., 1.44   , 1.887  , 5.00001])

In [38]:
## Inspect the true target values for the test split, for comparison with y_pred.
y_test

20046    0.47700
3024     0.45800
15663    5.00001
20484    2.18600
9814     2.78000
          ...   
5363     5.00001
19755    0.63200
4885     1.17700
13043    2.63100
8583     4.81500
Name: Target, Length: 5160, dtype: float64

In [39]:
## accuracy_score is a classification metric: it only works for discrete labels,
## so calling it on continuous regression output raises ValueError.
## The call is kept as a demonstration, but the error is caught and displayed so
## the notebook still runs top to bottom.
## Use R2 or MAE kind of scores instead.
try:
    accuracy_score(y_test, y_pred)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: continuous is not supported

In [None]:
from sklearn.metrics import r2_score
## R^2 (coefficient of determination) of the baseline tree on the test split.
## It is reported as a plain R^2 value: it is not an accuracy and not a
## percentage (it can even be negative for a model worse than predicting the mean).
score = r2_score(y_test, y_pred)
print(f"R^2 score: {score:.4f}")

In [40]:
##To get better score we will now do hyperparameter tuning

In [48]:
## Full search space. The next cell overwrites it with a smaller grid because
## this one takes too long to search.
parameter = {
    "criterion": ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    'splitter': ['best', 'random'],
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
    ## None means "consider all features". It replaces 'auto', which meant the
    ## same thing for regression trees but was deprecated and then removed from
    ## scikit-learn, where it now raises an invalid-parameter error.
    'max_features': [None, 'sqrt', 'log2'],
}
## Fixed seed so that 'random' splits and feature sub-sampling are reproducible.
regressor = DecisionTreeRegressor(random_state=42)  ##this is taking so much time so skip this and just take a few parameters

In [49]:
## Reduced search space used for the actual grid search (keeps runtime manageable).
parameter = {
    "criterion": ['squared_error', 'poisson'],
    'splitter': ['best', 'random'],
    'max_depth': [1, 5, 6, 7, 8, 9, 10],
    ## None means "consider all features". It replaces 'auto', which meant the
    ## same thing for regression trees but was deprecated and then removed from
    ## scikit-learn, where it now raises an invalid-parameter error.
    'max_features': [None, 'sqrt', 'log2'],
}
## Fresh, unfitted estimator for GridSearchCV to clone; the fixed seed makes
## 'random' splits and feature sub-sampling reproducible.
regressor = DecisionTreeRegressor(random_state=42)

In [42]:
import warnings 
warnings.filterwarnings('ignore')


In [43]:
from sklearn.model_selection import GridSearchCV


In [46]:
## Exhaustive search over `parameter` with 2-fold cross-validation, ranking
## candidates by negative MSE (higher is better).
## n_jobs=-1 evaluates candidates on all available cores; run serially, the
## search was slow enough to be interrupted by hand.
regressor_cv = GridSearchCV(
    regressor,
    param_grid=parameter,
    cv=2,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

In [47]:
## Run the grid search on the training split.
## NOTE: the recorded run of this cell ended in KeyboardInterrupt, so the cells
## below have no recorded output.
regressor_cv.fit(X_train,y_train)

KeyboardInterrupt: 

In [None]:
## Hyperparameter combination selected by the grid search.
regressor_cv.best_params_

In [None]:
## Predict on the test split with the grid-searched model.
## NOTE: this rebinds `y_pred`, replacing the baseline tree's predictions.
y_pred = regressor_cv.predict(X_test)

In [None]:
## R^2 of the tuned model on the test split.
## r2_score expects (y_true, y_pred). R^2 is not symmetric, so passing the
## arguments the other way round reports a different, incorrect value.
r2_score(y_test, y_pred)