**Decision Tree Regression**

In [1]:
# Load the California housing dataset from sklearn's built-in datasets.
# Called with default arguments; the returned container is echoed by the bare
# `housing` expression that ends the cell, so the notebook prints everything it
# holds (data, target, feature names, description text).
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
housing


{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [2]:
# Re-fetch the dataset in pandas form so it can be inspected as a table
import pandas as pd

housing = fetch_california_housing(as_frame=True)

# `frame` is the combined table: the feature columns plus the MedHouseVal target
df = housing.frame
print(df.head(5))  # preview the first five rows


   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  MedHouseVal  
0    -122.23        4.526  
1    -122.22        3.585  
2    -122.24        3.521  
3    -122.25        3.413  
4    -122.25        3.422  


In [3]:
# Separate the target (y) from the independent features (x)
y = housing.target
x = pd.DataFrame(housing.data, columns=housing.feature_names)

# Sanity check: first five rows of the features, then of the target
print(x.head())
print(y.head())



   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  
0    -122.23  
1    -122.22  
2    -122.24  
3    -122.25  
4    -122.25  
0    4.526
1    3.585
2    3.521
3    3.413
4    3.422
Name: MedHouseVal, dtype: float64


In [24]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split

split_parts = train_test_split(x, y, test_size=0.33, random_state=2)
x_train, x_test, y_train, y_test = split_parts


In [25]:
# Train the baseline model: a decision tree with default (unconstrained) settings
from sklearn.tree import DecisionTreeRegressor

# Seed the estimator. The tree permutes the features at every split and breaks
# ties between equally good splits at random, so without random_state the
# fitted tree, and the R2 scores computed from it, can differ from run to run.
regressor = DecisionTreeRegressor(random_state=2)  # same seed as the train/test split
regressor.fit(x_train, y_train)


In [26]:
# Predict on both partitions so the fit on seen and unseen rows can be compared
y_train_pred, y_test_pred = (
    regressor.predict(features) for features in (x_train, x_test)
)



In [27]:
# Echo the predictions computed for the training rows
y_train_pred


array([1.946, 0.75 , 1.125, ..., 1.926, 1.535, 0.811])

In [28]:
# Echo the predictions computed for the held-out test rows
y_test_pred


array([3.356, 2.206, 0.883, ..., 2.646, 2.518, 2.185])

In [29]:
# Score the predictions with R2 on the training rows and on the held-out rows
from sklearn.metrics import r2_score

train_score, test_score = (
    r2_score(y_train, y_train_pred),
    r2_score(y_test, y_test_pred),
)

print("Training R2 Score:", train_score)
print("Testing R2 Score:", test_score)

Training R2 Score: 1.0
Testing R2 Score: 0.6023089159422077


**Hyperparameter Tuning**

In [34]:
# Search space for tuning the decision tree's hyperparameters
parameter = {
    'criterion': ['squared_error', 'absolute_error'],
    'splitter': ['best', 'random'],
    'max_depth': [2, 4, 6, 8, 10, 12],
    'max_features': ['sqrt', 'log2'],
}

# Base estimator handed to the search. It must be seeded: the 'random' splitter
# and the per-split feature subsampling of 'sqrt'/'log2' are stochastic, so an
# unseeded tree makes the selected parameters and the reported scores change
# between runs.
# NOTE: this rebinds `regressor`, replacing the baseline tree fitted earlier.
regressor = DecisionTreeRegressor(random_state=2)


In [35]:
# Exhaustive search over the grid with 5-fold cross-validation, ranked by R2
from sklearn.model_selection import train_test_split, GridSearchCV

regressorc = GridSearchCV(
    estimator=regressor,
    param_grid=parameter,
    scoring='r2',
    cv=5,
)
regressorc.fit(x_train, y_train)


In [36]:
# Report the winning parameter combination and its cross-validation score
search_summary = (
    ("Best Parameters:", regressorc.best_params_),
    ("Best Cross-Validation R² Score:", regressorc.best_score_),
)
for label, value in search_summary:
    print(label, value)


Best Parameters: {'criterion': 'squared_error', 'max_depth': 10, 'max_features': 'log2', 'splitter': 'best'}
Best Cross-Validation R² Score: 0.6525275349721141


In [37]:
# Re-evaluate using the fitted search object; this overwrites the prediction
# and score variables produced for the baseline tree.
y_train_pred, y_test_pred = (
    regressorc.predict(features) for features in (x_train, x_test)
)

train_score, test_score = (
    r2_score(y_train, y_train_pred),
    r2_score(y_test, y_test_pred),
)

print("Training R2 Score:", train_score)
print("Testing R2 Score:", test_score)

Training R2 Score: 0.78093151046124
Testing R2 Score: 0.6401736705789306
