# Full Project: Amsterdam House Price Prediction
This project demonstrates a full pipeline, from loading the data through EDA and feature engineering to training a model that predicts house prices, based on the Kaggle dataset linked below.

Data source -> https://www.kaggle.com/datasets/thomasnibb/amsterdam-house-price-prediction

## Creation of the requirements.txt file

In [None]:
# Shell command that writes the installed package versions to requirements.txt.
# NOTE(review): kept commented out, presumably to be run manually
# (e.g. as `!pip freeze > requirements.txt` in a notebook) — confirm.
#pip freeze > requirements.txt

# Import / Data load

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import joblib
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Fix NumPy's global random seed to a constant so repeated runs are reproducible
np.random.seed(42)

In [None]:
# Read the housing-price CSV into a DataFrame and preview the first rows
csv_path = 'HousingPrices-Amsterdam-August-2021.csv'
df = pd.read_csv(csv_path)
df.head()

# Data exploration

In [None]:
# Basic EDA: column dtypes / non-null counts, missing values, summary statistics.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()
print('\nMissing values per column:')
print(df.isnull().sum())
# Last expression of the cell, so the notebook renders the summary table
df.describe()

In [None]:
# Histogram of the target column (30 bins) with a KDE curve overlaid
price_ax = sns.histplot(df['Price'], bins=30, kde=True)
price_ax.set_title('Price Distribution')
price_ax.set_xlabel('Price')
plt.show()

In [None]:
# Pairwise correlations between the int64/float64 columns, drawn as an
# annotated heatmap
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
correlations = df[numerical_cols].corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(correlations, annot=True, cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title('Feature Correlation Heatmap')
plt.show()

# Feature Engineering

In [None]:
# Build the target y and the feature matrix X.
# Only rows with a missing target are discarded; gaps in the feature columns
# are kept so the imputers in the preprocessing pipeline can fill them
# (a blanket dropna() would leave those imputers with nothing to do).
df = df.dropna(subset=['Price'])
y = df['Price']

# Address is excluded from the features; the remaining columns (e.g. Zip) are
# assumed to be informative.
X = df.drop(columns=['Price', 'Address'])
# NOTE(review): the source CSV appears to carry a saved row-index column
# ('Unnamed: 0'), which is not a property of a house and should not be a
# feature. errors='ignore' makes this a no-op if the column is absent — confirm
# against the actual file.
X = X.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Preprocessing pipeline
# Partition the feature columns by dtype. 'number' matches every numeric dtype
# (int32, float32, ... as well as int64/float64); with a hard-coded
# int64/float64 list, a numeric column of any other width would land in
# neither list and be silently dropped by the ColumnTransformer below.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include='number').columns.tolist()

# Numeric features: fill missing values with the column median
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps prediction from failing
# on categories that were not present when the encoder was fitted.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own pipeline. Columns in neither list
# are dropped (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_cols),
    ('cat', categorical_pipeline, categorical_cols),
])

# Model Creation / Training

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Gradient-boosted regressor with fixed hyperparameters and seed
xgb_regressor = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    random_state=42,
)

# Chain preprocessing and the regressor so both are fitted (and later saved)
# as a single estimator
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', xgb_regressor),
])

# Fit on the training split only
model.fit(X_train, y_train)

In [None]:
# Evaluation: score the fitted pipeline on the held-out test split
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
# RMSE is the square root of the MSE, which puts the error back in the
# target's own units
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
# Label repaired: the superscript two had been mis-decoded as "Â²"
print(f"R² Score: {r2:.2f}")

In [None]:
# Actual vs. predicted prices; the dashed red diagonal marks where predicted
# equals actual
price_lo, price_hi = y_test.min(), y_test.max()
scatter_fig, scatter_ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.4, ax=scatter_ax)
scatter_ax.plot([price_lo, price_hi], [price_lo, price_hi], 'r--')
scatter_ax.set_xlabel('Actual Price')
scatter_ax.set_ylabel('Predicted Price')
scatter_ax.set_title('Actual vs Predicted House Prices')
scatter_ax.grid(True)
plt.show()

## Model Saving

In [None]:
# Persist the fitted pipeline (preprocessing + regressor) to disk with joblib
model_path = 'amsterdam_house_price_model.pkl'
joblib.dump(model, model_path)