# Housing Price Prediction

## Imports

In [12]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

## Exploration

In [13]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

train_len = len(train)
y = train["SalePrice"]
all_data = pd.concat([train.drop("SalePrice", axis=1), test], axis=0)

## Data Preprocessing

In [14]:
all_data["LotFrontage"] = all_data["LotFrontage"].fillna(all_data["LotFrontage"].median())
all_data = all_data.drop(["Alley", "PoolQC", "Fence", "MiscFeature", "FireplaceQu"], axis=1)


for col in all_data.select_dtypes("object").columns:
    all_data[col] = all_data[col].fillna("None")

# One-hot encoding
all_data = pd.get_dummies(all_data)

# Split back
X = all_data[:train_len]
X_test_final = all_data[train_len:]

## Dataset Split

In [15]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = X_train.fillna(X_train.median(numeric_only=True))
X_val = X_val.fillna(X_train.median(numeric_only=True))

## Prediction on validation

In [16]:
model = GradientBoostingRegressor(n_estimators=300, learning_rate=0.05, max_depth=4)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Validation RMSE: {rmse:.2f}")

Validation RMSE: 26121.40


## Test Data Prediction

In [17]:
X = X.fillna(X.median(numeric_only=True))
X_test_final = X_test_final.fillna(X.median(numeric_only=True))

model.fit(X, y)
final_preds = model.predict(X_test_final)

submission = pd.DataFrame({
    "Id": test["Id"],
    "SalePrice": final_preds
})
submission.to_csv("submission.csv", index=False)