In [4]:
%pip install pandas scikit-learn joblib

Note: you may need to restart the kernel to use updated packages.
Collecting pandas
  Downloading pandas-2.3.3-cp310-cp310-win_amd64.whl (11.3 MB)
     -------------------------------------- 11.3/11.3 MB 575.2 kB/s eta 0:00:00
Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp310-cp310-win_amd64.whl (8.9 MB)
     ---------------------------------------- 8.9/8.9 MB 809.3 kB/s eta 0:00:00
Collecting joblib
  Downloading joblib-1.5.3-py3-none-any.whl (309 kB)
     -------------------------------------- 309.1/309.1 kB 1.1 MB/s eta 0:00:00
Collecting tzdata>=2022.7
  Downloading tzdata-2025.3-py2.py3-none-any.whl (348 kB)
     ------------------------------------ 348.5/348.5 kB 983.4 kB/s eta 0:00:00
Collecting numpy>=1.22.4
  Downloading numpy-2.2.6-cp310-cp310-win_amd64.whl (12.9 MB)
     -------------------------------------- 12.9/12.9 MB 742.5 kB/s eta 0:00:00
Collecting pytz>=2020.1
  Downloading pytz-2025.2-py2.py3-none-any.whl (509 kB)
     ----------------------------------


[notice] A new release of pip available: 22.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Training cell: load the housing CSV, clean and encode it, fit a random
# forest regressor, report R2 on a held-out split, and persist the fitted
# model together with the feature column order.

# 1. Load Data
df = pd.read_csv('../dataset/housing.csv')

# 2. Preprocessing
# Drop every row that contains a missing value in any column (the original
# note says the Kaggle dataset has gaps in 'total_bedrooms'). Rows are
# removed, not imputed. A plain assignment is used instead of inplace=True.
df = df.dropna()

# Convert 'ocean_proximity' (text) to indicator columns via one-hot encoding.
# drop_first=True omits the first category, which then acts as the baseline.
df = pd.get_dummies(df, columns=['ocean_proximity'], drop_first=True)

# 3. Define X and y
# Target is 'median_house_value'; every remaining column is a feature.
X = df.drop(columns='median_house_value')
y = df['median_house_value']

# 4. Train/Test Split
# 80/20 split with a fixed seed so the reported score is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 5. Train Model
print("Training Optimized Model...")
model = RandomForestRegressor(n_estimators=30, max_depth=20, random_state=42)
model.fit(X_train, y_train)

# 6. Evaluate
# The printed value is the R2 (coefficient of determination) on the test split.
y_pred = model.predict(X_test)
print(f"Model Accuracy (R2 Score): {r2_score(y_test, y_pred):.2f}")

# 7. Save Model and Column Names (Important for Deployment)
# joblib.dump does not create parent directories, so make sure 'models/'
# exists first; otherwise the cell fails only after training has finished.
Path('models').mkdir(parents=True, exist_ok=True)
joblib.dump(model, 'models/model.pkl', compress=3)
# Persist the feature names in training order so inference inputs can be
# aligned to the same columns.
joblib.dump(X.columns.tolist(), 'models/features.pkl')
print("Model saved successfully!")

Training Model... This might take a minute.
Model Accuracy (R2 Score): 0.83
Model saved successfully!
