In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [25]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [26]:
from sklearn.metrics import mean_squared_error, r2_score

In [27]:
data = pd.read_csv('hauses.csv')


In [28]:
data.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [29]:
data = data.rename(columns={'MEDV':'PRICE'})

data['PRICE'] = data['PRICE'] * 1000

data.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24000.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21600.0
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34700.0
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33400.0
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36200.0


In [30]:
X = data.drop(columns=['PRICE'])
y = data['PRICE']

In [31]:
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33


In [32]:
y.head()

0    24000.0
1    21600.0
2    34700.0
3    33400.0
4    36200.0
Name: PRICE, dtype: float64

In [33]:
#prepare datasets for modle training/testing
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [34]:
model_tree_regressor = DecisionTreeRegressor()
model_linear_regressor = LinearRegression()
model_forest_regressor = RandomForestRegressor()

In [35]:
model_tree_regressor.fit(X_train,y_train)

In [36]:
model_linear_regressor.fit(X_train,y_train)

In [37]:
model_forest_regressor.fit(X_train,y_train)

In [38]:
# Make predictions
predict_tree_regressor = model_tree_regressor.predict(X_test)

In [39]:
# Make predictions
predict_linear_regressor = model_linear_regressor.predict(X_test)

In [40]:
# Make predictions
predict_forest_regressor = model_forest_regressor.predict(X_test)

In [41]:
compare_tree_regressor = pd.DataFrame({'real': y_test, 'predicted' : predict_tree_regressor})

compare_tree_regressor

Unnamed: 0,real,predicted
74,24100.0,21400.0
318,23100.0,21000.0
147,14600.0,11800.0
226,37600.0,44800.0
114,18500.0,22700.0
...,...,...
322,20400.0,20900.0
370,50000.0,50000.0
251,24800.0,24600.0
381,10900.0,5000.0


In [42]:
compare_linear_regressor = pd.DataFrame({'real': y_test, 'predicted' : predict_linear_regressor})

compare_linear_regressor

Unnamed: 0,real,predicted
74,24100.0,25634.732541
318,23100.0,24364.168412
147,14600.0,8508.697033
226,37600.0,37221.557631
114,18500.0,25093.944807
...,...,...
322,20400.0,23103.001353
370,50000.0,33461.165092
251,24800.0,25073.629147
381,10900.0,18502.676479


In [43]:
compare_forest_regressor = pd.DataFrame({'real': y_test, 'predicted' : predict_forest_regressor})

compare_forest_regressor

Unnamed: 0,real,predicted
74,24100.0,23697.0
318,23100.0,22260.0
147,14600.0,14155.0
226,37600.0,47061.0
114,18500.0,21361.0
...,...,...
322,20400.0,21732.0
370,50000.0,38782.0
251,24800.0,27245.0
381,10900.0,11164.0


In [44]:
# Calculate performance metrics
mse_tree_regressor = mean_squared_error(y_test, predict_tree_regressor)
r2_tree_regressor = r2_score(y_test, predict_tree_regressor)
    
print(f"{type(model_tree_regressor).__name__} - Mean Squared Error: {mse_tree_regressor:.2f}, R^2 Score: {r2_tree_regressor:.2f}")


DecisionTreeRegressor - Mean Squared Error: 17529607.84, R^2 Score: 0.81


In [45]:
# Calculate performance metrics
mse_linear_regressor = mean_squared_error(y_test, predict_linear_regressor)
r2_linear_regressor = r2_score(y_test, predict_linear_regressor)
    
print(f"{type(model_linear_regressor).__name__} - Mean Squared Error: {mse_linear_regressor:.2f}, R^2 Score: {r2_linear_regressor:.2f}")

LinearRegression - Mean Squared Error: 24930451.93, R^2 Score: 0.74


In [46]:
# Calculate performance metrics
mse_forest_regressor = mean_squared_error(y_test, predict_forest_regressor)
r2_forest_regressor = r2_score(y_test, predict_forest_regressor)
    
print(f"{type(model_forest_regressor).__name__} - Mean Squared Error: {mse_forest_regressor:.2f}, R^2 Score: {r2_forest_regressor:.2f}")

RandomForestRegressor - Mean Squared Error: 10016793.87, R^2 Score: 0.89
