In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [3]:
from sklearn.metrics import mean_squared_error, r2_score

In [4]:
data = pd.read_csv('hauses.csv')


In [5]:
data.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [6]:
data = data.rename(columns={'MEDV':'PRICE'})

data['PRICE'] = data['PRICE'] * 1000

data.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24000.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21600.0
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34700.0
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33400.0
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36200.0


In [7]:
X = data.drop(columns=['PRICE'])
y = data['PRICE']

In [8]:
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33


In [9]:
y.head()

0    24000.0
1    21600.0
2    34700.0
3    33400.0
4    36200.0
Name: PRICE, dtype: float64

In [10]:
#prepare datasets for modle training/testing
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [11]:
model_tree_regressor = DecisionTreeRegressor()
model_linear_regressor = LinearRegression()
model_forest_regressor = RandomForestRegressor()

In [12]:
model_tree_regressor.fit(X_train,y_train)

In [13]:
model_linear_regressor.fit(X_train,y_train)

In [14]:
model_forest_regressor.fit(X_train,y_train)

In [15]:
# Make predictions
predict_tree_regressor = model_tree_regressor.predict(X_test)

In [16]:
# Make predictions
predict_linear_regressor = model_linear_regressor.predict(X_test)

In [17]:
# Make predictions
predict_forest_regressor = model_forest_regressor.predict(X_test)

In [18]:
compare_tree_regressor = pd.DataFrame({'real': y_test, 'predicted' : predict_tree_regressor})

compare_tree_regressor

Unnamed: 0,real,predicted
427,10900.0,11700.0
186,50000.0,41700.0
219,23000.0,23800.0
120,22000.0,20300.0
289,24800.0,24100.0
...,...,...
286,20100.0,29600.0
293,23900.0,22800.0
270,21100.0,20000.0
33,13100.0,15600.0


In [19]:
compare_linear_regressor = pd.DataFrame({'real': y_test, 'predicted' : predict_linear_regressor})

compare_linear_regressor

Unnamed: 0,real,predicted
427,10900.0,12576.514431
186,50000.0,35439.248981
219,23000.0,30344.721755
120,22000.0,21646.683055
289,24800.0,26945.914578
...,...,...
286,20100.0,20025.652804
293,23900.0,25762.119774
270,21100.0,22093.098677
33,13100.0,14018.515806


In [20]:
compare_forest_regressor = pd.DataFrame({'real': y_test, 'predicted' : predict_forest_regressor})

compare_forest_regressor

Unnamed: 0,real,predicted
427,10900.0,16204.0
186,50000.0,40861.0
219,23000.0,21880.0
120,22000.0,21429.0
289,24800.0,24797.0
...,...,...
286,20100.0,24464.0
293,23900.0,22976.0
270,21100.0,21190.0
33,13100.0,14707.0


In [21]:
# Calculate performance metrics
mse_tree_regressor = mean_squared_error(y_test, predict_tree_regressor)
r2_tree_regressor = r2_score(y_test, predict_tree_regressor)
    
print(f"{type(model_tree_regressor).__name__} - Mean Squared Error: {mse_tree_regressor:.2f}, R^2 Score: {r2_tree_regressor:.2f}")


DecisionTreeRegressor - Mean Squared Error: 18835784.31, R^2 Score: 0.77


In [22]:
# Calculate performance metrics
mse_linear_regressor = mean_squared_error(y_test, predict_linear_regressor)
r2_linear_regressor = r2_score(y_test, predict_linear_regressor)
    
print(f"{type(model_linear_regressor).__name__} - Mean Squared Error: {mse_linear_regressor:.2f}, R^2 Score: {r2_linear_regressor:.2f}")

LinearRegression - Mean Squared Error: 22254184.93, R^2 Score: 0.73


In [23]:
# Calculate performance metrics
mse_forest_regressor = mean_squared_error(y_test, predict_forest_regressor)
r2_forest_regressor = r2_score(y_test, predict_forest_regressor)
    
print(f"{type(model_forest_regressor).__name__} - Mean Squared Error: {mse_forest_regressor:.2f}, R^2 Score: {r2_forest_regressor:.2f}")

RandomForestRegressor - Mean Squared Error: 12365198.60, R^2 Score: 0.85
