# In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from PIL import Image
import os
from sklearn.metrics import mean_squared_error, r2_score

def evaluate_final_performance(model, val_loader, device, model_path='new_model.pth'):
    """
    Load saved weights into ``model`` and report RMSE / R² on the validation set.

    Model outputs and targets are passed through ``np.expm1`` before the
    metrics are computed, so the metrics are expressed on the un-logged scale
    (presumably the targets were produced with ``log1p(price)`` — confirm
    against the preprocessing step).

    Args:
        model: Network that is called as ``model(images, tabular)``. It is not
            moved by this function, so it must already live on ``device``.
        val_loader: Iterable yielding ``(images, tabular, targets)`` batches.
        device: Device the input batches are moved to, and the device the
            checkpoint tensors are mapped onto while loading.
        model_path: Path of the ``state_dict`` checkpoint to load.

    Returns:
        Tuple ``(rmse, r2)``.

    Raises:
        ValueError: If ``val_loader`` yields no samples.
    """
    print("Loading best model weights for evaluation...")
    # map_location lets a checkpoint written on a GPU machine be loaded on a
    # CPU-only one (and vice versa) instead of failing during deserialization.
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)
    model.eval()

    actuals = []
    predictions = []

    with torch.no_grad():
        for images, tabular, targets in val_loader:
            images = images.to(device)
            tabular = tabular.to(device)

            outputs = model(images, tabular)

            # Inverse of the log transform: exp(x) - 1.
            predictions.extend(np.expm1(outputs.cpu().numpy().flatten()))
            actuals.extend(np.expm1(targets.cpu().numpy().flatten()))

    if not actuals:
        # Fail with a clear message rather than an opaque error from the metric code.
        raise ValueError("val_loader yielded no samples; cannot compute metrics")

    # Calculate Metrics
    rmse = np.sqrt(mean_squared_error(actuals, predictions))
    r2 = r2_score(actuals, predictions)

    print("\n" + "="*40)
    print(" FINAL BEST MODEL RESULTS (Validation Set)")
    print("="*40)
    print(f" RMSE     : ${rmse:,.2f}")
    print(f" R² Score : {r2:.4f}")
    print("="*40 + "\n")

    return rmse, r2

# --- Configuration ---
# Module-level training settings read by train_pipeline().
BATCH_SIZE = 64  # Samples per mini-batch for both the train and validation DataLoaders
LEARNING_RATE = 1e-3  # Initial AdamW learning rate; the plateau scheduler may reduce it
EPOCHS = 30 # Increased epochs for better convergence
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Use the GPU when CUDA is available

# --- 1. Dataset Class ---
class RealEstateDataset(Dataset):
    """
    Dataset that pairs each CSV row with its image and tabular features.

    Every column of the CSV that is not in the exclusion list below is treated
    as a numeric tabular feature. The image is read from the row's
    ``image_path``; a black 224x224 RGB placeholder is substituted when the
    path is missing, does not exist on disk, or cannot be decoded.

    Args:
        csv_file: Path of the CSV file to read with ``pandas.read_csv``.
        transform: Optional callable applied to the PIL image.
        is_test: When True, items carry the row's ``id`` instead of a target.
    """

    def __init__(self, csv_file, transform=None, is_test=False):
        self.data = pd.read_csv(csv_file)
        self.transform = transform
        self.is_test = is_test

        # Exclude non-feature columns
        exclude = ['id', 'date', 'price', 'log_price', 'image_path', 'date_int']
        self.feature_cols = [c for c in self.data.columns if c not in exclude]

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.data)

    @staticmethod
    def _placeholder_image():
        """Return the black 224x224 RGB image used when no real image is usable."""
        return Image.new('RGB', (224, 224), color=(0, 0, 0))

    def _load_image(self, img_path):
        """Open ``img_path`` as RGB, falling back to the placeholder on any failure."""
        # str(NaN) / str(None) produce these sentinels for missing paths.
        if img_path in ('nan', 'None') or not os.path.exists(img_path):
            return self._placeholder_image()
        try:
            # The context manager closes the file handle; convert() returns an
            # independent in-memory image, so it stays valid afterwards.
            with Image.open(img_path) as img:
                return img.convert('RGB')
        except Exception:
            # Deliberate best-effort: a corrupt file must not abort training.
            # Unlike a bare ``except:``, this lets KeyboardInterrupt and
            # SystemExit propagate.
            return self._placeholder_image()

    def __getitem__(self, idx):
        """
        Return ``(image, tabular, target)``, or ``(image, tabular, id)`` in test mode.

        ``tabular`` is a float32 tensor of the feature columns; ``target`` is
        the row's ``log_price`` as a float32 scalar tensor.
        """
        row = self.data.iloc[idx]

        image = self._load_image(str(row['image_path']))
        if self.transform:
            image = self.transform(image)

        # Load Tabular
        tabular = torch.tensor(row[self.feature_cols].values.astype(np.float32))

        if self.is_test:
            return image, tabular, row['id']

        target = torch.tensor(row['log_price'], dtype=torch.float32)
        return image, tabular, target

# --- 2. Improved Model Architecture ---
class MultimodalNet(nn.Module):
    """
    Two-branch regressor that fuses a ResNet-18 image embedding with an MLP
    embedding of tabular features.

    The image branch is a pretrained ResNet-18 whose parameters are frozen,
    with its final ``fc`` layer replaced by a trainable projection to 64
    dimensions. The tabular branch is a three-layer MLP ending in 64
    dimensions. Both embeddings are concatenated (128 dimensions) and mapped
    to a single output value.

    Args:
        num_tabular_features: Width of the tabular input vector.
    """

    def __init__(self, num_tabular_features):
        super().__init__()

        # --- Image Branch ---
        # Only the import is guarded: a torchvision too old to expose the
        # weights enum falls back to the legacy ``pretrained=True`` API, while
        # genuine failures (e.g. the weight download) surface unchanged.
        try:
            from torchvision.models import ResNet18_Weights
        except ImportError:
            self.cnn = models.resnet18(pretrained=True)
        else:
            self.cnn = models.resnet18(weights=ResNet18_Weights.DEFAULT)

        # FREEZE CNN Backbone
        # This is crucial so the CNN doesn't destabilize the tabular learning initially
        # NOTE(review): this only disables gradients. The backbone's BatchNorm
        # layers still update their running statistics while the module is in
        # train mode — confirm whether that is intended.
        for param in self.cnn.parameters():
            param.requires_grad = False

        # Replace last fc layer to project to small dim. The new layer is
        # created after the freeze loop, so it remains trainable.
        num_ftrs = self.cnn.fc.in_features
        self.cnn.fc = nn.Linear(num_ftrs, 64)  # Project 512 -> 64

        # --- Tabular Branch (Robust MLP) ---
        self.tabular_branch = nn.Sequential(
            nn.Linear(num_tabular_features, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.2),

            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
        )

        # --- Fusion ---
        # 64 (Image) + 64 (Tabular) = 128
        self.fusion = nn.Sequential(
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        )

    def forward(self, image, tabular):
        """
        Return one prediction per sample, shaped ``(batch, 1)``.

        Args:
            image: Image batch fed to the ResNet-18 branch.
            tabular: Tabular batch of shape ``(batch, num_tabular_features)``.
        """
        x_img = self.cnn(image)
        x_tab = self.tabular_branch(tabular)

        # Concatenate the two 64-d embeddings along the feature axis.
        x_combined = torch.cat((x_img, x_tab), dim=1)
        return self.fusion(x_combined)

# --- 3. Training Loop ---
def train_pipeline(csv_path='/kaggle/working/processed_train_new.csv'):
    """
    Train ``MultimodalNet`` on an 80/20 split of the CSV and evaluate the best checkpoint.

    The weights with the lowest validation loss are written to
    ``new_model.pth``; after the last epoch that checkpoint is reloaded and
    scored by ``evaluate_final_performance``.

    Args:
        csv_path: Path of the processed training CSV.

    Returns:
        Tuple ``(rmse, r2)`` computed on the validation subset.

    Raises:
        RuntimeError: If no epoch produced a validation loss below infinity,
            so no checkpoint was written by this run.
    """
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    train_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),  # Data Augmentation
        transforms.ToTensor(),
        normalize,
    ])
    # Validation must be deterministic: same preprocessing, no random flip.
    eval_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        normalize,
    ])

    # Two views of the same CSV so each subset gets its own transform. With a
    # single shared dataset the validation images would be augmented as well.
    train_dataset = RealEstateDataset(csv_path, transform=train_transform)
    val_dataset = RealEstateDataset(csv_path, transform=eval_transform)

    # Split Train/Val (80/20) using one shared random permutation of row indices.
    train_size = int(0.8 * len(train_dataset))
    indices = torch.randperm(len(train_dataset)).tolist()
    train_subset = torch.utils.data.Subset(train_dataset, indices[:train_size])
    val_subset = torch.utils.data.Subset(val_dataset, indices[train_size:])

    # BatchNorm1d cannot run in train mode on a single-sample batch, so drop
    # the trailing batch only when it would contain exactly one sample.
    drop_last = len(train_subset) % BATCH_SIZE == 1
    train_loader = DataLoader(train_subset, batch_size=BATCH_SIZE, shuffle=True, drop_last=drop_last)
    val_loader = DataLoader(val_subset, batch_size=BATCH_SIZE, shuffle=False)

    # Initialize Model
    num_tab_features = len(train_dataset.feature_cols)
    model = MultimodalNet(num_tab_features).to(DEVICE)

    # Optimizer & Scheduler. Only parameters that require gradients are
    # optimized; the frozen backbone weights never receive gradients.
    criterion = nn.MSELoss()
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.AdamW(trainable_params, lr=LEARNING_RATE, weight_decay=1e-2)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

    print("Starting Improved Training...")
    best_val_loss = float('inf')

    for epoch in range(EPOCHS):
        # TRAIN
        model.train()
        running_loss = 0.0
        for images, tabular, targets in train_loader:
            images, tabular, targets = images.to(DEVICE), tabular.to(DEVICE), targets.to(DEVICE)

            optimizer.zero_grad()
            outputs = model(images, tabular)
            # squeeze(1) removes only the output-feature axis, so a batch of
            # one sample keeps shape (1,) instead of collapsing to a scalar.
            loss = criterion(outputs.squeeze(1), targets)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        train_loss = running_loss / len(train_loader)

        # VALIDATE
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for images, tabular, targets in val_loader:
                images, tabular, targets = images.to(DEVICE), tabular.to(DEVICE), targets.to(DEVICE)
                outputs = model(images, tabular)
                loss = criterion(outputs.squeeze(1), targets)
                val_loss += loss.item()

        val_loss /= len(val_loader)

        # Step Scheduler on the validation loss, then read back the LR for logging.
        scheduler.step(val_loss)
        curr_lr = optimizer.param_groups[0]['lr']

        print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | LR: {curr_lr:.2e}")

        # Save Best
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'new_model.pth')

    print(f"Training Complete. Best Val Loss: {best_val_loss:.4f}")

    if best_val_loss == float('inf'):
        # Nothing was saved in this run (e.g. every validation loss was NaN);
        # evaluating would silently pick up a stale file or fail on a missing one.
        raise RuntimeError("No checkpoint was saved: validation loss never improved")

    # Reload the best weights and report RMSE / R² on the validation subset.
    return evaluate_final_performance(model, val_loader, DEVICE)

# Run the training pipeline only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    train_pipeline()