In [1]:
# ---------------------------------------------------------
# BASELINE MODEL (Tabular Only)
# ---------------------------------------------------------
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

def check_baseline(csv_path='train.csv', test_size=0.2, random_state=42):
    """Train and score a tabular-only Random Forest baseline on house prices.

    Reads the training CSV, turns ``date`` into integer epoch seconds
    (``date_int``), removes ``id``/``date``/``price`` from the feature matrix,
    fits a 200-tree ``RandomForestRegressor`` on ``log1p(price)`` and scores it
    on a held-out split. Metrics are computed on the original price scale by
    inverting the log transform.

    Args:
        csv_path: Path to the training CSV. It must contain ``date`` and
            ``price`` columns; every other column except ``id`` is passed to
            the model as a feature (assumed numeric - TODO confirm against
            the dataset schema).
        test_size: Fraction of rows held out for validation.
        random_state: Seed shared by the split and the forest.

    Returns:
        dict with keys ``'rmse'`` and ``'r2'`` (plain floats). The same values
        are printed.
    """
    print("\n" + "="*40)
    print(" BASELINE MODEL CHECK (Tabular Only)")
    print("="*40)

    # 1. Load Data
    train_df = pd.read_csv(csv_path)

    # 2. Simple Preprocessing
    # Convert date to whole seconds since the Unix epoch. Dividing a Timedelta
    # by one second (rather than casting to int64 and dividing by 10**9) stays
    # correct when pandas stores the column at a resolution other than
    # nanoseconds.
    dates = pd.to_datetime(train_df['date'])
    epoch = pd.Timestamp("1970-01-01", tz="UTC" if dates.dt.tz is not None else None)
    train_df['date'] = dates
    train_df['date_int'] = (dates - epoch) // pd.Timedelta(seconds=1)

    # Define X (Features) and y (Target)
    drop_cols = ['id', 'date', 'price']
    X = train_df.drop(columns=[c for c in drop_cols if c in train_df.columns])
    y = np.log1p(train_df['price'])  # Log-transform target for better accuracy

    # 3. Split Data
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # 4. Train Random Forest (Standard Baseline)
    rf = RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=random_state)
    rf.fit(X_train, y_train)

    # 5. Evaluate on the original price scale (undo the log1p on both sides)
    val_preds = np.expm1(rf.predict(X_val))
    y_val_actual = np.expm1(y_val)

    rmse = np.sqrt(mean_squared_error(y_val_actual, val_preds))
    r2 = r2_score(y_val_actual, val_preds)

    print(f" RMSE     : ${rmse:,.2f}")
    print(f" R² Score : {r2:.4f}")
    print("="*40 + "\n")

    return {'rmse': float(rmse), 'r2': float(r2)}

# Run the check
# In a notebook kernel (or when the file is run as a script) __name__ is
# "__main__", so executing this cell trains and scores the baseline right away.
if __name__ == "__main__":
    check_baseline()


 BASELINE MODEL CHECK (Tabular Only)
 RMSE     : $133,392.89
 R² Score : 0.8582



In [None]:
#-----------------------------------------------------
# HYBRID MODEL (Tabular + Visual)
#-----------------------------------------------------
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from PIL import Image
import os
from tqdm import tqdm
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor, VotingRegressor
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
import matplotlib.pyplot as plt

# --- Configuration ---
# Number of images sent through the CNN per forward pass during extraction.
BATCH_SIZE = 64
# Use the GPU when torch can see one; otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Checkpoint file read by run_hybrid_pipeline to restore the CNN weights.
MODEL_PATH = '/kaggle/working/new_model.pth'

# --- 1. Define Dataset & Model for Extraction (Same as before) ---
class RealEstateDataset(Dataset):
    """Image dataset backed by a CSV that has ``image_path`` and ``id`` columns.

    Each item is ``(image, id)``. When a row has no usable image (missing
    path, file not on disk, or a file that cannot be opened/decoded) an
    all-black 224x224 RGB placeholder is returned instead, so one bad file
    never aborts a whole extraction run.
    """

    # Size and colour of the stand-in used when no real image can be loaded.
    PLACEHOLDER_SIZE = (224, 224)
    PLACEHOLDER_COLOR = (0, 0, 0)

    def __init__(self, csv_file, transform=None):
        """
        Args:
            csv_file: Path to a CSV readable by ``pandas.read_csv``.
            transform: Optional callable applied to the PIL image before it
                is returned.
        """
        self.data = pd.read_csv(csv_file)
        self.transform = transform

    def __len__(self):
        """Number of rows in the CSV."""
        return len(self.data)

    def _placeholder(self):
        """Build the black stand-in image."""
        return Image.new('RGB', self.PLACEHOLDER_SIZE, color=self.PLACEHOLDER_COLOR)

    def _load_image(self, img_path):
        """Return the RGB image stored at ``img_path`` or the placeholder."""
        # 'nan' / 'None' are what str() yields for a missing CSV cell.
        if img_path in ('nan', 'None') or not os.path.exists(img_path):
            return self._placeholder()
        try:
            # The context manager closes the underlying file as soon as the
            # pixel data has been copied out by convert().
            with Image.open(img_path) as handle:
                return handle.convert('RGB')
        except Exception:
            # Corrupt or unsupported file: stay best-effort, but let
            # KeyboardInterrupt / SystemExit propagate.
            return self._placeholder()

    def __getitem__(self, idx):
        """Return ``(image, id)`` for row ``idx``; ``image`` is transformed if a transform was given."""
        row = self.data.iloc[idx]
        image = self._load_image(str(row['image_path']))
        if self.transform:
            image = self.transform(image)
        return image, row['id']

class MultimodalNet(nn.Module):
    """ResNet-18 image encoder whose final layer is replaced by a 64-unit linear layer.

    The backbone is created with pretrained weights; ``forward`` simply
    returns the backbone's output for a batch of images, i.e. a 64-dimensional
    embedding per image.
    """

    def __init__(self):
        super().__init__()
        # Newer torchvision releases expose weight enums, older ones only
        # accept the boolean ``pretrained`` flag. Only the import is guarded,
        # so a genuine failure while building the network (for example a
        # failed weight download) surfaces instead of being hidden behind a
        # second attempt through the legacy argument.
        try:
            from torchvision.models import ResNet18_Weights
        except ImportError:
            ResNet18_Weights = None

        if ResNet18_Weights is not None:
            self.cnn = models.resnet18(weights=ResNet18_Weights.DEFAULT)
        else:
            self.cnn = models.resnet18(pretrained=True)

        # Replace the classification head with a 64-dimensional embedding layer.
        self.cnn.fc = nn.Linear(self.cnn.fc.in_features, 64)

    def forward(self, image):
        """Return the backbone output for ``image`` (a batch tensor accepted by the ResNet)."""
        return self.cnn(image)

def extract_embeddings(csv_file, model):
    """Run every image listed in ``csv_file`` through ``model`` and tabulate the outputs.

    Images are resized to 224x224, converted to tensors and normalised with
    the channel statistics conventionally used for ImageNet-pretrained
    torchvision models. Batches are processed in CSV order without shuffling.

    Args:
        csv_file: Path to a CSV understood by ``RealEstateDataset`` (needs
            ``image_path`` and ``id`` columns).
        model: ``torch.nn.Module`` mapping an image batch to a 2-D embedding
            batch. It is expected to already live on ``DEVICE``. The model is
            put in eval mode for the duration of the call and its previous
            mode is restored afterwards.

    Returns:
        DataFrame with an int64 ``id`` column followed by ``visual_0`` ...
        ``visual_{d-1}`` float columns, one row per CSV row.

    Raises:
        ValueError: if the CSV contains no rows.
    """
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    dataset = RealEstateDataset(csv_file, transform=transform)
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)

    print(f"Extracting features from {csv_file}...")
    all_ids = []
    all_embeddings = []

    # Inference must not depend on the caller remembering model.eval():
    # batch-norm layers left in train mode would make the embeddings depend
    # on batch composition and would update running statistics.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for images, ids in tqdm(loader):
                images = images.to(DEVICE)
                embeddings = model(images)
                all_embeddings.append(embeddings.cpu().numpy())
                all_ids.extend(ids.tolist())
    finally:
        model.train(was_training)

    if not all_embeddings:
        raise ValueError(f"No rows found in {csv_file}; nothing to embed.")

    all_embeddings = np.vstack(all_embeddings)
    visual_cols = [f'visual_{i}' for i in range(all_embeddings.shape[1])]

    # Build the id column separately so ids are never pushed through a float
    # matrix and cast back (which silently loses precision for large ids).
    df = pd.DataFrame(all_embeddings.astype(np.float64), columns=visual_cols)
    df.insert(0, 'id', np.asarray(all_ids, dtype=np.int64))
    return df

# --- 2. Advanced Feature Engineering ---
def add_spatial_clusters(train_df, test_df, n_clusters=20):
    """Tag every row with the K-Means cluster of its (lat, long) position.

    One K-Means model is fitted on the coordinates of both frames stacked
    together, then used to assign an integer ``cluster`` column to each frame.

    Args:
        train_df: Frame with ``lat`` and ``long`` columns.
        test_df: Frame with ``lat`` and ``long`` columns.
        n_clusters: Number of K-Means centroids.

    Returns:
        ``(train_df, test_df)`` - the same objects that were passed in, each
        now carrying a ``cluster`` column (both are modified in place).
    """
    print("Adding Geospatial Clusters...")
    geo_cols = ['lat', 'long']

    # Fit on train and test coordinates together so both share one set of centroids.
    stacked = pd.concat([train_df[geo_cols], test_df[geo_cols]])
    clusterer = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(stacked)

    # Cluster ids are kept as plain integers rather than one-hot encoded;
    # tree-based models can split on them directly.
    for frame in (train_df, test_df):
        frame['cluster'] = clusterer.predict(frame[geo_cols])

    return train_df, test_df

def add_interactions(df):
    """Append two engineered columns to ``df`` in place and return it.

    * ``sqft_grade`` - living area multiplied by grade, so large and
      high-grade houses stand out.
    * ``density`` - living area divided by (lot area + 1); the +1 avoids a
      division by zero for a zero-sized lot.

    Args:
        df: Frame with ``sqft_living``, ``grade`` and ``sqft_lot`` columns.

    Returns:
        The same ``df`` object with the two columns added.
    """
    living_area = df['sqft_living']
    df['sqft_grade'] = living_area.mul(df['grade'])
    df['density'] = living_area.div(df['sqft_lot'].add(1))
    return df

# --- 3. Main Execution Flow ---
def run_hybrid_pipeline():
    """Train and evaluate the tabular + visual ensemble, then write predictions.

    Steps:
      A. Load the ``cnn.``-prefixed weights from the checkpoint at
         ``MODEL_PATH`` and extract one embedding per ``id`` for the train
         and test image manifests.
      B. Add K-Means location clusters and left-join the embeddings on ``id``.
      C. Build the feature matrices: epoch-second date, interaction features,
         missing values filled with 0, ``id``/``date``/``price`` removed.
      D. Fit a weighted ``VotingRegressor`` (HistGradientBoosting, XGBoost,
         Random Forest; weights 2:2:1) on ``log1p(price)`` and print RMSE and
         R² for a 20 % hold-out, measured on the original price scale.
      E. Write test-set predictions to ``final_predictions.csv``.
      F. Save an XGBoost feature-importance plot to
         ``hybrid_feature_importance_new.png``.

    Returns:
        None. Results are printed and written to disk.
    """
    # A. Feature Extraction
    print("--- Step 1: Visual Features ---")
    model = MultimodalNet().to(DEVICE)
    # NOTE(security): torch.load can unpickle arbitrary objects (depending on
    # the torch version's ``weights_only`` default); only point MODEL_PATH at
    # checkpoints you produced or trust.
    state_dict = torch.load(MODEL_PATH, map_location=DEVICE)
    # Keep only the image-branch weights and strip exactly the leading
    # 'cnn.' - a substring match/replace would also hit keys that merely
    # contain 'cnn.' somewhere in the middle.
    prefix = 'cnn.'
    cnn_state_dict = {
        key[len(prefix):]: value
        for key, value in state_dict.items()
        if key.startswith(prefix)
    }
    model.cnn.load_state_dict(cnn_state_dict)
    model.eval()

    # One embedding row per id; duplicates in the manifests are dropped so the
    # left joins below cannot multiply rows.
    train_visual = extract_embeddings('/kaggle/working/processed_train_new.csv', model).drop_duplicates(subset=['id'])
    test_visual = extract_embeddings('/kaggle/working/processed_test_new.csv', model).drop_duplicates(subset=['id'])

    # B. Merge
    print("--- Step 2: Merging & Engineering ---")
    train_df = pd.read_csv('/kaggle/input/new-dataset/train.csv')
    test_df = pd.read_csv('/kaggle/input/new-dataset/test.csv')

    # Add Geospatial Clusters (NEW)
    train_df, test_df = add_spatial_clusters(train_df, test_df)

    train_full = pd.merge(train_df, train_visual, on='id', how='left')
    test_full = pd.merge(test_df, test_visual, on='id', how='left')

    # C. Preprocess
    def preprocess(df):
        """Return the model-ready feature frame for ``df`` (input is not modified)."""
        df = df.copy()
        # Whole seconds since the Unix epoch, independent of the resolution
        # pandas chose for the datetime column.
        dates = pd.to_datetime(df['date'])
        epoch = pd.Timestamp("1970-01-01", tz="UTC" if dates.dt.tz is not None else None)
        df['date'] = dates
        df['date_int'] = (dates - epoch) // pd.Timedelta(seconds=1)

        # Add Interaction Features (NEW)
        df = add_interactions(df)

        drop_cols = ['id', 'date', 'price']
        cols = [c for c in df.columns if c not in drop_cols]
        # Rows without a matching embedding get NaN from the left join; 0 is
        # used as the neutral fill value.
        df[cols] = df[cols].fillna(0)
        return df[cols]

    X = preprocess(train_full)
    y = np.log1p(train_full['price'])
    X_test = preprocess(test_full)

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # D. Ensemble Training (NEW)
    print("\n--- Step 3: Training Ensemble (Voting Regressor) ---")

    # Model 1: HistGradientBoosting (Fast & Accurate)
    hgb = HistGradientBoostingRegressor(learning_rate=0.05, max_iter=1000, random_state=42)

    # Model 2: XGBoost (The King of Tabular)
    xgb_model = xgb.XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=-1, random_state=42)

    # Model 3: Random Forest (Robustness)
    rf = RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=42)

    # Voting Regressor (Averages the predictions)
    # We give slightly more weight to Gradient Boosting methods
    ensemble = VotingRegressor(
        estimators=[('hgb', hgb), ('xgb', xgb_model), ('rf', rf)],
        weights=[2, 2, 1]
    )

    ensemble.fit(X_train, y_train)

    # Evaluate on the original price scale (undo the log1p on both sides)
    val_preds = np.expm1(ensemble.predict(X_val))
    y_val_actual = np.expm1(y_val)

    rmse = np.sqrt(mean_squared_error(y_val_actual, val_preds))
    r2 = r2_score(y_val_actual, val_preds)

    print("\n" + "="*40)
    print(" FINAL ENSEMBLE RESULTS")
    print("="*40)
    print(f" RMSE     : ${rmse:,.2f}")
    print(f" R² Score : {r2:.4f}")
    print("="*40 + "\n")

    # E. Save Submission
    test_preds = np.expm1(ensemble.predict(X_test))

    # Ids come from the same merged frame the features were built from, so
    # each prediction is guaranteed to sit next to its own id.
    submission = pd.DataFrame({
        'id': test_full['id'].to_numpy(),
        'predicted_price': test_preds
    })
    submission.to_csv('final_predictions.csv', index=False)
    print("Ensemble Predictions saved to 'final_predictions.csv'.")

    # F. Feature Importance (Using XGBoost for visualization)
    print("Generating Feature Importance Plot...")
    # VotingRegressor fits clones of its estimators; reuse the fitted XGBoost
    # clone instead of training the same 1000-tree model a second time.
    fitted_xgb = ensemble.named_estimators_['xgb']
    xgb.plot_importance(fitted_xgb, max_num_features=15, height=0.5)
    plt.title('Top 15 Features (XGBoost)')
    plt.tight_layout()
    plt.savefig('hybrid_feature_importance_new.png')
    print("Plot saved.")

# Executing this cell (or running the file as a script) starts the full
# hybrid pipeline; the guard keeps a plain import of this code from doing so.
if __name__ == "__main__":
    run_hybrid_pipeline()