# House Price Prediction Model Development

## 1. Load Dataset
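Please ensure `train.csv` from the House Prices dataset is in this directory. If the file is not found, the notebook falls back to a randomly generated synthetic dataset for demonstration purposes only.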

In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import joblib

# Selected features (6 of the recommended columns).
# Recommended columns: OverallQual, GrLivArea, TotalBsmtSF, GarageCars,
# BedroomAbvGr, FullBath, YearBuilt, Neighborhood -- with SalePrice as the target.
selected_features = ['OverallQual', 'GrLivArea', 'GarageCars', 'FullBath', 'YearBuilt', 'Neighborhood']
target = 'SalePrice'

# Load the dataset
data_path = 'train.csv'

if os.path.exists(data_path):
    df = pd.read_csv(data_path)
    print("Dataset loaded from train.csv")
else:
    print("train.csv not found. Generating synthetic dataset for demonstration...")
    # Synthetic stand-in so the rest of the notebook can run end to end.
    # NOTE: SalePrice is drawn independently of every feature, so metrics
    # computed on this fallback data say nothing about real model quality.
    np.random.seed(42)  # fixed seed keeps the fallback data reproducible
    n_samples = 1000
    df = pd.DataFrame({
        # randint's upper bound is exclusive: 11 is required so that the top
        # rating (10) of the 1-10 OverallQual scale can actually be generated.
        'OverallQual': np.random.randint(1, 11, n_samples),
        'GrLivArea': np.random.randint(500, 4000, n_samples),
        'GarageCars': np.random.randint(0, 4, n_samples),
        'FullBath': np.random.randint(1, 4, n_samples),
        'YearBuilt': np.random.randint(1950, 2023, n_samples),
        'Neighborhood': np.random.choice(['CollgCr', 'Veenker', 'Crawfor', 'NoRidge', 'Mitchel'], n_samples),
        'SalePrice': np.random.randint(100000, 500000, n_samples)
    })

# Fail early, naming every absent column, instead of surfacing a less
# specific pandas KeyError from the indexing below.
missing_columns = [col for col in selected_features + [target] if col not in df.columns]
if missing_columns:
    raise KeyError(f"Dataset is missing required columns: {missing_columns}")

# Keep only the modelling inputs (X) and the prediction target (y)
X = df[selected_features]
y = df[target]

print("Features selected:", selected_features)
print(X.head())

## 2. Data Preprocessing & Feature Selection
Selected Features:
- OverallQual (Numeric/Ordinal)
- GrLivArea (Numeric)
- GarageCars (Numeric)
- FullBath (Numeric)
- YearBuilt (Numeric)
- Neighborhood (Categorical)

In [None]:
# Column groups: each group is routed to its own preprocessing branch.
numerical_features = ['OverallQual', 'GrLivArea', 'GarageCars', 'FullBath', 'YearBuilt']
categorical_features = ['Neighborhood']

# Numeric branch: fill missing values with the column median, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode (handle_unknown='ignore' avoids errors on categories
# that were not present when the encoder was fitted).
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each branch to its own set of columns.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Random Forest regressor; random_state fixed for repeatable results.
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Single estimator that preprocesses the raw columns and then fits/predicts.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# Hold out 20% of the rows for evaluation; seeded so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print("Data split into train and test sets.")

## 3. Model Training (Random Forest)

In [None]:
# Fit the whole pipeline on the training split: the preprocessing steps are
# fitted on the training rows and the regressor is trained on their output.
clf.fit(X=X_train, y=y_train)
print("Model trained.")

## 4. Evaluation

In [None]:
# Predict on the held-out test split; the pipeline applies the fitted
# preprocessing to X_test before handing it to the regressor.
preds = clf.predict(X_test)

# Evaluate the model against the true test targets
mae = mean_absolute_error(y_test, preds)
mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse)  # same units as the target, unlike MSE
r2 = r2_score(y_test, preds)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
# The label was mojibake ("RÂ²"): the UTF-8 bytes of the superscript two had
# been decoded as Latin-1. The \u00b2 escape keeps the source ASCII-only so
# the label cannot be mangled again by a mis-detected file encoding.
print(f"R\u00b2: {r2}")

## 5. Save Model

In [None]:
# Persist the fitted pipeline (preprocessing + regressor together) so it can
# be reloaded later and applied directly to raw feature columns.
joblib.dump(value=clf, filename='house_price_model.pkl')
print("Model saved to house_price_model.pkl")