# Sydney House Price Analysis
Author: Rabin Lamichhane

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
# Read the NSW property sales CSV (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="NSW_property_sales.csv")

In [None]:
# Quick overview of the loaded data: structure summary, column labels, and
# the first few rows.
print("Dataset Info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print("\nColumns:")
print(df.columns)
print("\nSample data:")
print(df.head())

In [None]:
# Keep only rows whose 'suburb' value exactly matches one of the hard-coded
# names below.
sydney_suburbs = [
    'Sydney', 'Parramatta', 'Blacktown', 'Strathfield', 'Liverpool', 'Penrith',
    'Auburn', 'Bankstown', 'Hornsby', 'Ryde', 'Canterbury', 'Hurstville',
]
# .copy() gives an independent frame, so later reassignments and cleaning
# steps never operate on a slice of df.
df_sydney = df.loc[df['suburb'].isin(sydney_suburbs)].copy()
print(f"\nNumber of Sydney suburb records: {df_sydney.shape[0]}")

In [None]:
# Drop every row that is missing a value in any of the columns used by the
# plots and the model below.
df_sydney = df_sydney.dropna(
    axis=0,
    how='any',
    subset=['sale_price', 'bedrooms', 'bathrooms'],
)

In [None]:
# EDA - mean sale price for each suburb, sorted highest first, as a bar chart
avg_price = (
    df_sydney
    .groupby('suburb')['sale_price']
    .mean()
    .sort_values(ascending=False)
)

plt.figure(figsize=(10, 5))
sns.barplot(x=avg_price.index, y=avg_price.values)
plt.xticks(rotation=45)  # rotate the suburb tick labels by 45 degrees
plt.ylabel('Average Sale Price ($)')
plt.title('Average Sale Price per Sydney Suburb')
plt.tight_layout()
plt.show()

In [None]:
# Histogram of sale prices (50 bins) with a kernel density estimate overlaid
plt.figure(figsize=(8, 5))
sns.histplot(data=df_sydney, x='sale_price', bins=50, kde=True)
plt.xlabel(xlabel='Sale Price ($)')
plt.title(label='Distribution of Sale Prices')
plt.tight_layout()
plt.show()

In [None]:
# Number of records for each distinct 'bedrooms' value
plt.figure(figsize=(8, 5))
sns.countplot(data=df_sydney, x='bedrooms')
plt.title(label='Number of Bedrooms Distribution')
plt.tight_layout()
plt.show()

In [None]:
# Assemble the model inputs: two predictor columns and the sale-price target.
features = ['bedrooms', 'bathrooms']
X = df_sydney.loc[:, features]
y = df_sydney.loc[:, 'sale_price']

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [None]:
# Fit a linear regression of sale price on the training split
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# Predict on the held-out test split and report the root mean squared error,
# which is expressed in the same units as sale_price.
y_pred = model.predict(X_test)
# Take the square root of the MSE explicitly: the `squared=False` shortcut was
# deprecated in scikit-learn 1.4 and removed in 1.6, so relying on it breaks
# on current releases. This form works on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(f"Root Mean Squared Error on test set: ${rmse:,.2f}")

In [None]:
# Scatter of predicted against actual test-set prices. The dashed red diagonal
# spans the actual-price range and marks where exact predictions would fall.
plt.figure(figsize=(8, 6))
plt.scatter(x=y_test, y=y_pred, alpha=0.3)
plt.plot(
    [y_test.min(), y_test.max()],
    [y_test.min(), y_test.max()],
    'r--',
)
plt.title(label='Actual vs Predicted Sale Prices')
plt.xlabel(xlabel='Actual Sale Price')
plt.ylabel(ylabel='Predicted Sale Price')
plt.tight_layout()
plt.show()