# Step 1: Set up the environment and import libraries

In [26]:
import pandas as pd
import tensorflow as tf
import numpy as np
from tabulate import tabulate
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Step 2: Load data and Explore

In [2]:
melbourne_data =pd.read_csv("house_price_regression_dataset.csv")

In [3]:
pd.options.display.float_format = "{:,.2f}".format
print(melbourne_data.head())

   Square_Footage  Num_Bedrooms  Num_Bathrooms  Year_Built  Lot_Size  \
0            1360             2              1        1981      0.60   
1            4272             3              3        2016      4.75   
2            3592             1              2        2016      3.63   
3             966             1              2        1977      2.73   
4            4926             2              1        1993      4.70   

   Garage_Size  Neighborhood_Quality  House_Price  
0            0                     5   262,382.85  
1            1                     6   985,260.85  
2            0                     9   777,977.39  
3            1                     8   229,698.92  
4            0                     8 1,041,740.86  


**More Options :**

```


#Ensure House_Price remains a float  
df["House_Price"] = df["House_Price"].replace(',', '', regex=True).astype(float)

#Apply formatting for better readability  
#Keeps float, just changes display    
pd.options.display.float_format = "{:,.2f}".format

#Reset display format (or use scientific notation explicitly)    
#Default format (scientific notation for large numbers)    
pd.reset_option("display.float_format")

print(melbourne_data.head())


```

In [4]:
melbourne_data.head()

Unnamed: 0,Square_Footage,Num_Bedrooms,Num_Bathrooms,Year_Built,Lot_Size,Garage_Size,Neighborhood_Quality,House_Price
0,1360,2,1,1981,0.6,0,5,262382.85
1,4272,3,3,2016,4.75,1,6,985260.85
2,3592,1,2,2016,3.63,0,9,777977.39
3,966,1,2,1977,2.73,1,8,229698.92
4,4926,2,1,1993,4.7,0,8,1041740.86


In [5]:
melbourne_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Square_Footage        1000 non-null   int64  
 1   Num_Bedrooms          1000 non-null   int64  
 2   Num_Bathrooms         1000 non-null   int64  
 3   Year_Built            1000 non-null   int64  
 4   Lot_Size              1000 non-null   float64
 5   Garage_Size           1000 non-null   int64  
 6   Neighborhood_Quality  1000 non-null   int64  
 7   House_Price           1000 non-null   float64
dtypes: float64(2), int64(6)
memory usage: 62.6 KB


In [6]:
melbourne_data.columns

Index(['Square_Footage', 'Num_Bedrooms', 'Num_Bathrooms', 'Year_Built',
       'Lot_Size', 'Garage_Size', 'Neighborhood_Quality', 'House_Price'],
      dtype='object')

**Checking For Missing Values :**

In [7]:
# Count missing values per column
filtered_melbourne_data=melbourne_data.isnull().sum()
print(filtered_melbourne_data)

Square_Footage          0
Num_Bedrooms            0
Num_Bathrooms           0
Year_Built              0
Lot_Size                0
Garage_Size             0
Neighborhood_Quality    0
House_Price             0
dtype: int64


# Step 3: Preprocess Data

SELECT FEATURES AND TARGET

In [8]:
# Choose target and features
y = melbourne_data.House_Price

# selecting features
melbourne_features = ['Square_Footage', 'Num_Bedrooms', 'Year_Built','Lot_Size', 'Garage_Size']

x = melbourne_data[melbourne_features]

Split into Training & Testing Sets

In [9]:
# Split into Training & Testing Sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

print("# VALIDATION FEATURES DATA")
print("\n",x_test[:10])

print("\n# VALIDATION TARGET DATA\n")
print(y_test[:10])


# VALIDATION FEATURES DATA

      Square_Footage  Num_Bedrooms  Year_Built  Lot_Size  Garage_Size
507            2031             3        1993      4.73            2
818            1754             2        1998      0.92            2
452            1667             1        1954      1.37            1
368            4230             3        1951      3.08            1
242            3854             2        1974      3.27            1
929            1222             4        1974      2.64            0
262            1199             1        1953      2.46            0
810            4518             4        1971      1.38            2
318             504             3        1961      4.96            0
49             3652             1        1994      2.81            1

# VALIDATION TARGET DATA

507   501,692.85
818   398,823.27
452   302,975.30
368   874,856.27
242   811,129.50
929   291,123.87
262   211,038.85
810   926,094.14
318   149,743.50
49    793,995.28
Name: House_Pri

# Step 4: Train, Predict And Validate

**Train**

In [10]:
# Define model
melbourne_model =RandomForestRegressor(random_state=1)
# Fit model
melbourne_model.fit(x_train, y_train)

**Make Predictions**

**Method 1:**

In [11]:
# Predicte price on validation data
val_predictions = melbourne_model.predict(x_test)
print(val_predictions[:5])  # Not necessarily needed

[496213.71379394 399328.67767243 365835.67065847 895473.52711981
 821347.50791536]


**Method 2 :**
```
zxn=val_predictions[:5]
for i in range(len(zxn)):
    print(zxn[i])
    
OUTPUT >>>
475471.1766686792
405940.62394306617
344363.2736819858
926488.454118578
838719.430503504

```

# Step 5: Compare Actual vs Predicted Prices

**METHOD 1 :**

In [12]:
df_results = pd.DataFrame({"Actual": y_test[0:10], "Predicted": val_predictions[0:10]})

df_results = tabulate(df_results, headers='keys', tablefmt='psql')

print(df_results)

+-----+----------+-------------+
|     |   Actual |   Predicted |
|-----+----------+-------------|
| 507 |   501693 |      496214 |
| 818 |   398823 |      399329 |
| 452 |   302975 |      365836 |
| 368 |   874856 |      895474 |
| 242 |   811129 |      821348 |
| 929 |   291124 |      296399 |
| 262 |   211039 |      266190 |
| 810 |   926094 |      949044 |
| 318 |   149743 |      160931 |
|  49 |   793995 |      793429 |
+-----+----------+-------------+


**Method 2:**


```
df_results = pd.DataFrame({"Actual": y_test[:5], "Predicted": val_predictions[:5]})
print(df_results)

OUTPUT >>
      Actual       Predicted
507   501,692.85   475,471.18
818   398,823.27   405,940.62
452   302,975.30   344,363.27
368   874,856.27   926,488.45
242   811,129.50   838,719.43
```



#Step 6:  Evaluate the Model

**VALIDATE**

**✔ Use Evaluation Metrics:**

In [13]:
mae = mean_absolute_error(y_test, val_predictions)
mse = mean_squared_error(y_test, val_predictions)
r2 = r2_score(y_test, val_predictions)

print(f"Mean Absolute Error: {mae}\n")
print(f"Mean Squared Error: {mse}\n")

# Closer to 1 means a better model --- [exception : overfitting]
print(f"R² Score: {r2}\n")

Mean Absolute Error: 16695.064066293333

Mean Squared Error: 449309927.3415312

R² Score: 0.9925225015560999



**TESTING ON NEW DATA**

In [16]:
# Your new house data as a NumPy array
new_house_x = np.array([[4615,4,2000,1.7211468359255475,1]])

# Define the same feature names as used during training
feature_names_x = ['Square_Footage',"Num_Bedrooms","Year_Built",'Lot_Size','Garage_Size']

# Convert the NumPy array to a DataFrame with the correct column names
new_house_df_x = pd.DataFrame(new_house_x, columns=feature_names_x)

# Use the DataFrame for prediction
predicted_price_x = melbourne_model.predict(new_house_df_x)
print(f"Predicted Price: ${predicted_price_x[0]:,.2f}")


Predicted Price: $992,068.09


# OPTIMIZING THE MODEL

**Controlling The Tree Depth**

**control overfitting vs underfitting using max_leaf_nodes.**


**Finding Best Tree Size**

In [17]:
def get_mae(max_leaf_nodes, x_train, x_test, y_train, y_test):
    model = RandomForestRegressor(max_leaf_nodes=max_leaf_nodes, random_state=1)
    model.fit(x_train, y_train)
    preds_val = model.predict(x_test)
    mae = mean_absolute_error(y_test, preds_val)
    return mae

# Different values of max_leaf_nodes to test
max_nodes = [128,129,130,131,132,133]

scores = {}  # Initialize an empty dictionary

# Compare MAE with different max_leaf_nodes values
for leaf_size in max_nodes:
    mae = get_mae(leaf_size, x_train, x_test, y_train, y_test)  # Compute MAE
    scores[leaf_size] = mae  # Store in dictionary
    print(f"Max leaf nodes: {leaf_size:3d}  \t Mean Absolute Error: {mae:.2f}")

# Find the best value of max_leaf_nodes
best_tree_size = min(scores, key=scores.get)
print(f"\nBest Tree Size: {best_tree_size}")


Max leaf nodes: 128  	 Mean Absolute Error: 17600.49
Max leaf nodes: 129  	 Mean Absolute Error: 17594.31
Max leaf nodes: 130  	 Mean Absolute Error: 17591.65
Max leaf nodes: 131  	 Mean Absolute Error: 17581.17
Max leaf nodes: 132  	 Mean Absolute Error: 17577.38
Max leaf nodes: 133  	 Mean Absolute Error: 17570.03

Best Tree Size: 133


# Train, Predict And Validate

**TRAIN**

In [18]:
# Specifying final model
final_model = RandomForestRegressor(max_leaf_nodes=best_tree_size, random_state=1)
# fit the final model
final_model.fit(x_train, y_train)

**PREDICT**

In [19]:
# Predicte price on validation data
final_val_predictions = final_model.predict(x_test)
print(final_val_predictions[:5])

[497517.95949713 405396.66410737 366613.24895213 896335.93823287
 818625.00012618]


**COMPARE WITH ACTUAL PRICE**

In [20]:
final_df_results = pd.DataFrame({"Actual": y_test[0:10], "Predicted": final_val_predictions[0:10]})

final_df_results = tabulate(final_df_results, headers='keys', tablefmt='psql')

print(final_df_results)

+-----+----------+-------------+
|     |   Actual |   Predicted |
|-----+----------+-------------|
| 507 |   501693 |      497518 |
| 818 |   398823 |      405397 |
| 452 |   302975 |      366613 |
| 368 |   874856 |      896336 |
| 242 |   811129 |      818625 |
| 929 |   291124 |      292873 |
| 262 |   211039 |      270675 |
| 810 |   926094 |      947356 |
| 318 |   149743 |      165286 |
|  49 |   793995 |      793967 |
+-----+----------+-------------+


**VALIDATE**

In [21]:
mae = mean_absolute_error(y_test, final_val_predictions)
mse = mean_squared_error(y_test, final_val_predictions)
r2 = r2_score(y_test, final_val_predictions)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")

# Closer to 1 means a better model --- [exception : overfitting]
print(f"R² Score: {r2}")

Mean Absolute Error: 17570.02581049342
Mean Squared Error: 486291600.3237427
R² Score: 0.9919070457529899


#Step 8: Test with New Data



```
np.array :
#----------------------------------------------------------------#
new_house = np.array([[1700, 3, 2020]])  
predicted_price = model.predict(new_house)
print(f"Predicted Price: ${predicted_price[0]:,.2f}")
# ---------------------------------------------------------------#
```



In [22]:
# Your new house data as a NumPy array
new_house = np.array([[4615,4,2000,1.7211468359255475,1]])

# Define the same feature names as used during training
feature_names = ['Square_Footage',"Num_Bedrooms","Year_Built",'Lot_Size','Garage_Size']

# Convert the NumPy array to a DataFrame with the correct column names
new_house_df = pd.DataFrame(new_house, columns=feature_names)

# Use the DataFrame for prediction
predicted_price = final_model.predict(new_house_df)
print(f"Predicted Price: ${predicted_price[0]:,.2f}")


Predicted Price: $994,766.49


#Step 9: Save the Model in Colab

**For a Pickle model (.pkl):**

In [24]:
import pickle

# Save model
with open("house_price_model.pkl", "wb") as f:
    pickle.dump(final_model, f)

#Step 10: Download the File to Your PC

In [25]:
from google.colab import files
files.download("house_price_model.pkl")  # For Pickle
# OR
#files.download("house_price_model.h5")   # For Keras model

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>