In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error
import datetime
from xgboost import XGBRegressor

# Load data
# Read the raw CSV and replace every missing value with 0 in one chained step.
df = pd.read_csv("data.csv").fillna(0)

# Convert and extract date features
# Unparseable dates become NaT (errors='coerce'); their month is encoded as 0.
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df['month_sold'] = df['date'].dt.month.fillna(0).astype(int)

# Feature engineering
# NOTE(review): ages are measured against the year in which the notebook is
# executed, so these columns shift by one every calendar year. Consider using
# the sale year instead if reproducible features are required.
current_year = datetime.datetime.now().year
was_renovated = df['yr_renovated'] > 0

df['house_age'] = current_year - df['yr_built']
df['renovated'] = was_renovated.astype(int)
# Vectorised replacement for the row-wise apply: 0 for never-renovated homes.
df['years_since_renovation'] = (current_year - df['yr_renovated']).where(was_renovated, 0)

# A zero divisor would produce inf, which XGBoost can refuse as input. Masking
# non-positive divisors yields NaN instead, which XGBoost treats as missing.
df['living_to_lot_ratio'] = df['sqft_living'] / df['sqft_lot'].where(df['sqft_lot'] > 0)

# expand=False returns a Series (not a one-column DataFrame); rows without a
# 5-digit run get the placeholder "00000".
df['zip_code'] = df['statezip'].str.extract(r'(\d{5})', expand=False).fillna("00000")

# Computed from `price`, which is the model target; it is not in the `features`
# list and must stay out of it to avoid target leakage.
df['price_per_sqft'] = df['price'] / df['sqft_living'].where(df['sqft_living'] > 0)

df['total_rooms'] = df['bedrooms'] + df['bathrooms']
df['is_basement'] = (df['sqft_basement'] > 0).astype(int)

# Drop outliers (optional)
# Keep only rows priced strictly below the 99th-percentile price (drops the top 1%).
price_cap = df['price'].quantile(0.99)
df = df.loc[df['price'] < price_cap]

# Features
# Model inputs, grouped by kind; the list order is the column order of X.
features = [
    # raw listing attributes
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'floors',
    'waterfront',
    'view',
    'condition',
    'sqft_basement',
    # engineered attributes
    'house_age',
    'renovated',
    'years_since_renovation',
    'living_to_lot_ratio',
    'month_sold',
    'zip_code',
    'total_rooms',
    'is_basement',
]
target = 'price'

# Prepare X and y
X, y = df[features], df[target]

# Encoding
# month_sold is one-hot encoded; zip_code is mapped to integer codes. Every
# other feature is passed through unchanged.
categorical_features = ['month_sold']
ordinal_features = ['zip_code']
# Preserve the order of `features` so the list is identical on every run
# (a set difference would order the names arbitrarily).
numerical_features = [
    name for name in features
    if name not in categorical_features and name not in ordinal_features
]

preprocessor = ColumnTransformer(
    transformers=[
        # Categories unseen during fit are encoded as an all-zero row.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        # Zip codes unseen during fit are encoded as -1.
        ('ord', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ordinal_features),
    ],
    remainder='passthrough',
    # Always emit a dense array: in a sparse matrix, zero-valued features would
    # be dropped and XGBoost's tree booster treats absent entries as missing.
    sparse_threshold=0,
)

# Model pipeline using XGBoost
# The regressor is seeded so repeated fits on the same data give the same model.
xgb_regressor = XGBRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
)
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb_regressor),
])

# Train/test split
# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train model
model.fit(X_train, y_train)

# Evaluate
# Score the held-out rows with the coefficient of determination.
y_pred = model.predict(X_test)
test_r2 = r2_score(y_test, y_pred)
print(f"R² Score: {test_r2:.4f}")

# Sample test case
# One hand-written listing, keyed by the same column names the model was fitted on.
sample_record = {
    'bedrooms': 3,
    'bathrooms': 2,
    'sqft_living': 1800,
    'floors': 1,
    'waterfront': 0,
    'view': 0,
    'condition': 3,
    'sqft_basement': 200,
    'house_age': 10,
    'renovated': 0,
    'years_since_renovation': 0,
    'living_to_lot_ratio': 0.3,
    'month_sold': 7,
    'zip_code': '98103',   # 5-digit zip code string, same format as training
    'total_rooms': 5,      # bedrooms + bathrooms
    'is_basement': 1,
}
test_sample = pd.DataFrame([sample_record])  # single-row frame
price = model.predict(test_sample)

print(f"Predicted price for test sample: {price[0]:.2f}")

R² Score: 0.6724
Predicted price for test sample: 612221.12
