<a href="https://colab.research.google.com/github/PaletteofDesign/ensemble_trees/blob/main/regression_tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount drive: expose Google Drive inside the notebook filesystem at /content/drive.
# google.colab is supplied by the Colab runtime, so this cell is Colab-specific.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Import 
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor

In [3]:
# Create DataFrame: read the house-sales CSV and preview its first rows.
# NOTE(review): absolute Colab path outside the mounted Drive folder — the file
# must already exist at this location before the cell is run; confirm its source.
f = '/content/kc_house_data.csv'
df = pd.read_csv(f)
# head() shows the first five rows as a quick sanity check of the load
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [4]:
# Review: list each column with its non-null count and dtype to spot
# missing values and non-numeric columns before modelling
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21613 non-null  int64  
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21613 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long  

# Regression Tree

In [5]:
# Altering object data to numerical data.
# 'date' is stored as an object (string) column, so it is one-hot encoded;
# drop_first=True omits the dummy for the first level.
# The guard keeps the cell idempotent: df is overwritten in place, so once
# 'date' has been replaced by its dummy columns a second
# pd.get_dummies(..., columns=['date']) would raise KeyError on re-run.
# NOTE(review): this yields one indicator column per distinct date string;
# parsing the date into numeric parts may be a more compact encoding — confirm.
if 'date' in df.columns:
    df = pd.get_dummies(df, columns=['date'], drop_first=True)

In [6]:
# Split the frame into predictors (every column except price) and the target (price)
X = df.drop('price', axis=1)
y = df['price']

In [7]:
# Hold out a test partition; 0.25 is train_test_split's default test share,
# written out here, and the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [8]:
# Creating DTR (DecisionTreeRegressor) with no depth or leaf limits passed;
# random_state is fixed so repeated runs build the same tree
dec_tree = DecisionTreeRegressor(random_state=42)

In [9]:
# Fitting the model to the training data only; the test split stays unseen
dec_tree.fit(X_train, y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=42, splitter='best')

In [10]:
# Predict prices for both partitions with the fitted tree
train_preds, test_preds = dec_tree.predict(X_train), dec_tree.predict(X_test)

In [11]:
# R2 of the tree on the data it was fit on, then on the held-out data
train_score, test_score = (
    dec_tree.score(X_train, y_train),
    dec_tree.score(X_test, y_test),
)
print(train_score, test_score, sep='\n')

1.0
0.7493659719668276


In [12]:
# Reviewing tree depth: how deep the tree grew when no max_depth was set
dec_tree.get_depth()

43

In [13]:
# Reviewing number of leaves in the fitted tree
dec_tree.get_n_leaves()

15579

In [14]:
# Depth-limited tree (max_depth=7) as the tuned model; report its final R2 scores
dec_tree_2 = DecisionTreeRegressor(max_depth=7, random_state=42).fit(X_train, y_train)
train_2_score, test_2_score = (
    dec_tree_2.score(X_train, y_train),
    dec_tree_2.score(X_test, y_test),
)
print('Final:', train_2_score, test_2_score, sep='\n')

Final:
0.8344898647016635
0.7626731909560042


# Bagged Tree

In [15]:
# Create a bagging ensemble with library-default settings; only the seed is fixed
bagreg = BaggingRegressor(random_state=42)

In [16]:
# Fit the bagging ensemble on the training split
bagreg.fit(X_train, y_train)

BaggingRegressor(base_estimator=None, bootstrap=True, bootstrap_features=False,
                 max_features=1.0, max_samples=1.0, n_estimators=10,
                 n_jobs=None, oob_score=False, random_state=42, verbose=0,
                 warm_start=False)

In [17]:
# Predicted prices for the held-out rows (displayed only, not stored in a variable)
bagreg.predict(X_test)

array([ 376015.,  823900., 1096790., ...,  272925., 2020750.,  301910.])

In [18]:
# R2 of the bagging ensemble on the training data, then on the held-out data
bagreg_train_score, bagreg_test_score = (
    bagreg.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)
print(bagreg_train_score, bagreg_test_score, sep='\n')

0.9768563409308696
0.8549968134630627


# Random Forest: predicting price from the dataset's features


In [19]:
# Create a random forest with library-default settings; only the seed is fixed
rf = RandomForestRegressor(random_state=42)

In [20]:
# Fit the random forest on the training split
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

In [21]:
# Predicted prices for the held-out rows (displayed only, not stored in a variable)
rf.predict(X_test)

array([ 375907.  ,  843828.1 , 1099358.37, ...,  279652.65, 1972988.2 ,
        314426.64])

In [22]:
# R2 of the default random forest on the training data, then on the held-out data
rf_train_score, rf_test_score = (
    rf.score(X_train, y_train),
    rf.score(X_test, y_test),
)
print(rf_train_score, rf_test_score, sep='\n')

0.9829313272396315
0.8652059235916127


In [31]:
# Random forest with tree depth capped at 6, scored on both partitions
rf_6 = RandomForestRegressor(max_depth=6, random_state=42).fit(X_train, y_train)
rf_6_train_score, rf_6_test_score = (
    rf_6.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)
print(rf_6_train_score, rf_6_test_score, sep='\n')

0.8372849797656579
0.7944038415367926
