In [5]:
# House Price Prediction - Model Building
# Selected Features: OverallQual, GrLivArea, TotalBsmtSF, GarageCars, YearBuilt, Neighborhood

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import warnings
warnings.filterwarnings('ignore')

# 1. Load the dataset
print("Loading dataset...")
df = pd.read_csv('train.csv')

# 2. Feature Selection - Using 6 features from the recommended 9
selected_features = ['OverallQual', 'GrLivArea', 'TotalBsmtSF', 'GarageCars', 'YearBuilt', 'Neighborhood']
target = 'SalePrice'

# Create working dataframe: an independent copy holding only the selected
# features plus the target, so later edits never write back into `df`.
data = df[selected_features + [target]].copy()

print(f"\nDataset shape: {data.shape}")
print(f"\nSelected features: {selected_features}")
print("\nData info:")
# DataFrame.info() writes its summary to stdout itself and returns None;
# wrapping it in print() appended a stray "None" line to the cell output.
data.info()

# 3. Data Preprocessing

# a. Handling missing values
print("\n--- Handling Missing Values ---")
print("Missing values before:")
print(data.isnull().sum())

# Fill missing values.
# The filled column is assigned back instead of calling
# `data[col].fillna(..., inplace=True)`: that chained in-place form operates
# on a temporary Series, is deprecated in pandas 2.x and leaves `data`
# unchanged once Copy-on-Write is active (the global warnings filter in this
# notebook would hide the deprecation message).
data['TotalBsmtSF'] = data['TotalBsmtSF'].fillna(0)  # No basement = 0
data['GarageCars'] = data['GarageCars'].fillna(0)    # No garage = 0

print("\nMissing values after:")
print(data.isnull().sum())

# b. Encoding categorical variables (Neighborhood)
print("\n--- Encoding Categorical Variables ---")

# Fit the encoder on the raw neighborhood names and keep the integer codes.
le = LabelEncoder()
neighborhood_codes = le.fit_transform(data['Neighborhood'])

# Persist the fitted encoder so the same mapping can be applied later.
joblib.dump(le, 'neighborhood_encoder.pkl')
print(f"Neighborhood categories: {len(le.classes_)}")

# Append the encoded column, then discard the original text column.
data['Neighborhood_Encoded'] = neighborhood_codes
data = data.drop(columns='Neighborhood')

# Separate the target from the feature matrix.
y = data[target]
X = data.drop(columns=target)

print(f"\nFeatures shape: {X.shape}")
print(f"Target shape: {y.shape}")

# c. Train-Test Split (80/20, fixed seed for reproducibility)
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print(f"\nTrain set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

# d. Feature Scaling
print("\n--- Feature Scaling ---")
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler
joblib.dump(scaler, 'scaler.pkl')
print("Scaler saved successfully")

# 4. Model Training - Random Forest Regressor
print("\n--- Training Random Forest Regressor ---")

# Hyperparameters gathered in one place so they are easy to read and tweak.
rf_params = {
    'n_estimators': 100,
    'max_depth': 20,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'random_state': 42,
    'n_jobs': -1,
}
model = RandomForestRegressor(**rf_params)

# Fit on the scaled training features.
model.fit(X_train_scaled, y_train)
print("Model training completed!")

# 5. Model Evaluation
print("\n--- Model Evaluation ---")


def _regression_metrics(y_true, y_pred):
    """Return (MAE, MSE, RMSE, R²) for one set of predictions."""
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    return mae, mse, rmse, r2


# Predictions on both partitions
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)

# Same four metrics for the training set and the test set
train_mae, train_mse, train_rmse, train_r2 = _regression_metrics(y_train, y_train_pred)
test_mae, test_mse, test_rmse, test_r2 = _regression_metrics(y_test, y_test_pred)

# Print one report per partition, training first.
metric_reports = (
    ("\nTraining Set Metrics:", (train_mae, train_mse, train_rmse, train_r2)),
    ("\nTest Set Metrics:", (test_mae, test_mse, test_rmse, test_r2)),
)
for heading, (mae, mse, rmse, r2) in metric_reports:
    print(heading)
    print(f"MAE:  ${mae:,.2f}")
    print(f"MSE:  ${mse:,.2f}")
    print(f"RMSE: ${rmse:,.2f}")
    print(f"R²:   {r2:.4f}")

# Feature importance, most influential feature first
importance_table = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_,
})
feature_importance = importance_table.sort_values(by='Importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)

# 6. Save the trained model
print("\n--- Saving Model ---")
joblib.dump(model, 'house_price_model.pkl')
print("Model saved as 'house_price_model.pkl'")

# Save feature names for later use
feature_names = X.columns.tolist()
joblib.dump(feature_names, 'feature_names.pkl')
print("Feature names saved")

# Test reloading every artifact written by this notebook
print("\n--- Testing Model Reload ---")
loaded_model = joblib.load('house_price_model.pkl')
loaded_scaler = joblib.load('scaler.pkl')
loaded_encoder = joblib.load('neighborhood_encoder.pkl')
loaded_feature_names = joblib.load('feature_names.pkl')

# Verify the encoder and feature list survived the round trip; previously
# these artifacts were loaded (or not loaded at all) but never inspected.
assert list(loaded_encoder.classes_) == list(le.classes_), \
    "Reloaded neighborhood encoder does not match the fitted encoder"
assert loaded_feature_names == feature_names, \
    "Reloaded feature names do not match the training columns"

# Make a test prediction.
# Scale the raw (unscaled) test row with the *reloaded* scaler so the check
# exercises the same scaler -> model path a consumer of the .pkl files would
# use, instead of reusing rows already scaled by the in-memory scaler.
test_sample = loaded_scaler.transform(X_test.iloc[0:1])
prediction = loaded_model.predict(test_sample)
actual = y_test.iloc[0]

# The reloaded model must agree with the in-memory model on the same input.
assert np.allclose(prediction, model.predict(test_sample)), \
    "Reloaded model prediction differs from the in-memory model"

print(f"Test prediction: ${prediction[0]:,.2f}")
print(f"Actual price: ${actual:,.2f}")
print(f"Difference: ${abs(prediction[0] - actual):,.2f}")

print("\n✓ Model successfully saved and reloaded!")
print("\nFiles created:")
print("- house_price_model.pkl")
print("- scaler.pkl")
print("- neighborhood_encoder.pkl")
print("- feature_names.pkl")


Loading dataset...

Dataset shape: (1460, 7)

Selected features: ['OverallQual', 'GrLivArea', 'TotalBsmtSF', 'GarageCars', 'YearBuilt', 'Neighborhood']

Data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   OverallQual   1460 non-null   int64 
 1   GrLivArea     1460 non-null   int64 
 2   TotalBsmtSF   1460 non-null   int64 
 3   GarageCars    1460 non-null   int64 
 4   YearBuilt     1460 non-null   int64 
 5   Neighborhood  1460 non-null   object
 6   SalePrice     1460 non-null   int64 
dtypes: int64(6), object(1)
memory usage: 80.0+ KB
None

--- Handling Missing Values ---
Missing values before:
OverallQual     0
GrLivArea       0
TotalBsmtSF     0
GarageCars      0
YearBuilt       0
Neighborhood    0
SalePrice       0
dtype: int64

Missing values after:
OverallQual     0
GrLivArea       0
TotalBsmtSF     0
GarageCars      0
YearBu

### Uploading `train.csv`

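Run the cell below to upload the `train.csv` file; a file selection dialog will appear. Run it **before** the model-building cell, which reads `train.csv` from the working directory. If a file with that name already exists, Colab saves the new upload under a different name (for example `train (1).csv`), so delete the old copy first or adjust the path passed to `pd.read_csv`.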

In [4]:
from google.colab import files

# Open the Colab file picker; the result maps each saved filename to its bytes.
uploaded = files.upload()

# Report every uploaded file together with its size.
for fn, content in uploaded.items():
  print(f'User uploaded file "{fn}" with length {len(content)} bytes')


Saving train.csv to train (1).csv
User uploaded file "train (1).csv" with length 460676 bytes


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): none of the cells shown here read from or write to this
# path — confirm the mount is still needed before keeping this cell.
from google.colab import drive
drive.mount('/content/drive')