In [1]:
# Task3_step3.py
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# 📂 Paths (relative to the working directory the notebook/script runs in)
CSV_PATH = "Project/housing_data.csv"
IMAGE_FEATURES_PATH = "image_features.csv"

print("📥 Loading tabular data...")
data = pd.read_csv(CSV_PATH)
print(f"✅ Tabular Data Loaded: {data.shape}")

print("📥 Loading image features...")
image_features = pd.read_csv(IMAGE_FEATURES_PATH)
print(f"✅ Image Features Loaded: {image_features.shape}")

# 🧹 Select relevant tabular columns ('id' is the join key, 'price' the target)
tabular_cols = ['id', 'latitude', 'longitude', 'room_type', 'minimum_nights',
                'number_of_reviews', 'reviews_per_month',
                'calculated_host_listings_count', 'availability_365', 'price']

# Missing 'reviews_per_month' values would otherwise reach the model as NaN,
# which RandomForestRegressor rejects on scikit-learn versions without native
# missing-value support. Fill them with 0 instead.
# NOTE(review): presumably the value is missing exactly when a listing has no
# reviews, making 0 the natural fill -- confirm against the raw data.
# fillna returns a new frame, so `data` itself is left untouched.
tabular_data = data[tabular_cols].fillna({'reviews_per_month': 0})

# 🔗 Merge tabular data with image features
# Inner join: only listings that have image features are kept.
merged = tabular_data.merge(image_features, on='id', how='inner')
print(f"✅ Merged Dataset Shape: {merged.shape}")

# Fail here with a clear message rather than later inside train_test_split.
if merged.empty:
    raise ValueError(
        "No rows left after merging tabular data with image features on 'id'; "
        f"check that {CSV_PATH!r} and {IMAGE_FEATURES_PATH!r} share listing ids."
    )

# 🎯 Target and Features
# 'price' is what we predict; 'id' was only needed for the merge. Neither may
# appear among the predictors.
non_feature_cols = ['price', 'id']
y = merged['price']
X = merged.drop(columns=non_feature_cols)

# 🔍 Column groups that receive dedicated preprocessing
categorical_features = ['room_type']
numeric_features = [
    'latitude',
    'longitude',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]

# 🔧 Preprocessing
# Numeric columns are standardised and the categorical column is one-hot
# encoded (categories unseen during fit are ignored at predict time).
# Every column not listed above -- i.e. the image-feature columns -- is
# passed through unchanged.
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(
    [numeric_step, categorical_step],
    remainder='passthrough',
)

# 🌲 Model (fixed seed so repeated runs build the same forest)
model = RandomForestRegressor(random_state=42, n_estimators=100)

# 🔗 Pipeline: preprocessing followed by the regressor, fitted as one unit
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# 🏆 Train/Test Split: hold out 20% of the rows, with a fixed seed so the
# same rows land in the test set on every run.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

print("🚀 Training model...")
# Fits the preprocessing steps and the forest on the training rows only.
pipeline.fit(X_train, y_train)

# 📊 Predictions on the held-out rows
y_pred = pipeline.predict(X_test)

# 📈 Evaluation: MAE and RMSE are both expressed in the units of 'price'
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("✅ Model trained!")
for metric_name, metric_value in (("MAE", mae), ("RMSE", rmse)):
    print(f"📉 {metric_name}: {metric_value:.2f}")


📥 Loading tabular data...
✅ Tabular Data Loaded: (48895, 16)
📥 Loading image features...
✅ Image Features Loaded: (62, 2049)
✅ Merged Dataset Shape: (62, 2058)
🚀 Training model...
✅ Model trained!
📉 MAE: 35.75
📉 RMSE: 43.09
