In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Load the dataset
file_path = 'MELBOURNE_HOUSES_DATASET.csv' 
data = pd.read_csv(file_path)

# Step 1: Data Cleaning
data.drop(columns=['Address', 'Type', 'Method', 'Seller', 'Regionname', 'Propertycount', 'CouncilArea'], inplace=True)

# Step 2: Handle Date column (Extract year)
data['Year'] = pd.to_datetime(data['Date']).dt.year
data.drop(columns=['Date'], inplace=True)

# Step 3: Handle missing values
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].median())

# Convert categorical data (like 'Suburb' and 'Postcode') to numeric using one-hot encoding
data = pd.get_dummies(data, columns=['Suburb', 'Postcode'], drop_first=True)

# Step 4: Prepare features and target
X = data.drop(columns=['Price'])  
y = data['Price']                 

# Step 5: Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [11]:

# Step 6: Model Training
model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Step 7: Model Prediction
y_pred = model.predict(X_test)

# Step 8: Evaluation
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Interpreted "accuracy" as a percentage
accuracy = r2 * 100  # Convert R^2 score to a percentage

print("Model Performance:")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2): {r2}")
print(f"Model Accuracy: {accuracy:.2f}%")

# Optional: Feature Importance
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_
}).sort_values(by='Importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)

Model Performance:
Mean Absolute Error (MAE): 206889.86581338831
Root Mean Squared Error (RMSE): 338330.73878406454
R-squared (R2): 0.6783124724283366
Model Accuracy: 67.83%

Feature Importance:
                     Feature  Importance
1                   Distance    0.384517
0                      Rooms    0.234339
2                       Year    0.037507
524            Postcode_3186    0.017643
47           Suburb_Brighton    0.014811
..                       ...         ...
461            Postcode_3114    0.000000
343  Suburb_Warrandyte South    0.000000
199     Suburb_Kilsyth South    0.000000
338      Suburb_Wandin North    0.000000
565            Postcode_3767    0.000000

[592 rows x 2 columns]


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Predicted vs. Actual Prices Scatter Plot
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')  
plt.title("Predicted vs Actual Prices")
plt.xlabel("Actual Prices")
plt.ylabel("Predicted Prices")
plt.show()

# 2. Residuals Histogram
residuals = y_test - y_pred
plt.figure(figsize=(8, 6))
sns.histplot(residuals, kde=True, bins=30)
plt.title("Residuals Distribution")
plt.xlabel("Residual")
plt.ylabel("Frequency")
plt.show()

# 3. Learning Curve
from sklearn.model_selection import learning_curve

train_sizes, train_scores, test_scores = learning_curve(
    model, X, y, train_sizes=np.linspace(0.1, 1.0, 10), cv=5, scoring="r2"
)

train_scores_mean = np.mean(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)

plt.figure(figsize=(10, 6))
plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label="Training Score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label="Cross-Validation Score")
plt.title("Learning Curve")
plt.xlabel("Training Set Size")
plt.ylabel("R-squared Score")
plt.legend(loc="best")
plt.grid()
plt.show()