# In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from PIL import Image, UnidentifiedImageError
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from sklearn.metrics import r2_score

# Create the outputs folder up front (no error if it is already there)
OUTPUT_DIR = '/content/satellite-property-valuation/outputs'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- 1. MODEL ARCHITECTURE ---
class MultiModalNet(nn.Module):
    """Two-branch regressor fusing an image embedding with tabular features.

    A ResNet18 initialised from ImageNet weights (its final layer swapped for
    a 64-unit projection) encodes the image, a small MLP encodes the tabular
    vector into 16 units, and the concatenated 80-wide vector is mapped to a
    single scalar.

    Args:
        num_tabular_features: Width of the tabular input vector.
    """

    def __init__(self, num_tabular_features):
        super().__init__()

        # Image branch: keep the pre-trained backbone and replace only its
        # classification layer with a projection to 64 features.
        backbone = models.resnet18(weights='IMAGENET1K_V1')
        backbone.fc = nn.Linear(backbone.fc.in_features, 64)
        self.cnn = backbone

        # Tabular branch: num_tabular_features -> 32 -> 16
        self.tabular_mlp = nn.Sequential(
            nn.Linear(num_tabular_features, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
        )

        # Fusion head: (64 image + 16 tabular) -> 32 -> 1
        self.regressor = nn.Sequential(
            nn.Linear(64 + 16, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        )

    def forward(self, image, tabular_data):
        """Return one scalar prediction per sample, shape ``(batch, 1)``."""
        branch_outputs = [self.cnn(image), self.tabular_mlp(tabular_data)]
        fused = torch.cat(branch_outputs, dim=1)
        return self.regressor(fused)

# --- 2. DATASET CLASS ---
class MultiModalDataset(Dataset):
    """Dataset pairing each CSV row with the image ``<img_dir>/<id>.jpg``.

    Every item is a tuple ``(image, tabular_features, target)``:

    * ``image`` - the transformed RGB image, or an all-zero ``(3, 224, 224)``
      tensor when the file is missing or cannot be decoded.
    * ``tabular_features`` - float32 tensor holding the ``TAB_COLS`` values.
    * ``target`` - float32 tensor of shape ``(1,)`` with the row's ``price``,
      or ``0.0`` when the CSV has no ``price`` column.

    Args:
        csv_file: Path of the CSV to load; it must contain ``id`` and every
            column listed in ``TAB_COLS``.
        img_dir: Directory holding the ``<id>.jpg`` files.
        transform: Optional callable applied to the PIL image. Defaults to
            resize to 224x224, ``ToTensor`` and ImageNet mean/std
            normalisation.
    """

    # Columns fed to the tabular branch, in model input order.
    TAB_COLS = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'grade', 'condition']

    def __init__(self, csv_file, img_dir, transform=None):
        self.data = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.transform = transform or transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])

    def __len__(self):
        """Return the number of rows in the loaded CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, tabular_features, target)`` for row ``idx``."""
        row = self.data.iloc[idx]

        tabular_features = torch.tensor(
            row[self.TAB_COLS].values.astype(np.float32), dtype=torch.float32
        )

        if 'price' in self.data.columns:
            target = torch.tensor([row['price']], dtype=torch.float32)
        else:
            # No label available: emit a placeholder so batches keep the same
            # three-element structure.
            target = torch.tensor([0.0], dtype=torch.float32)

        img_path = os.path.join(self.img_dir, f"{int(row['id'])}.jpg")
        try:
            # The context manager closes the file handle; convert() returns a
            # fully loaded copy, so the image stays usable afterwards.
            with Image.open(img_path) as raw_image:
                pil_image = raw_image.convert('RGB')
        except (OSError, UnidentifiedImageError, ValueError):
            # Missing or undecodable image: fall back to a blank input. Only
            # image-loading failures are caught, so interrupts and errors in
            # the transform are no longer silently turned into zero images.
            image = torch.zeros(3, 224, 224)
        else:
            image = self.transform(pil_image)

        return image, tabular_features, target


# --- 3. TRAINING CONFIGURATION ---
# Use the GPU when PyTorch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Training data: cleaned CSV rows paired with their satellite images,
# served in shuffled batches of 32
train_ds = MultiModalDataset(
    csv_file='/content/satellite-property-valuation/data/processed/train_cleaned.csv',
    img_dir='/content/satellite-property-valuation/data/satellite_images/',
)
train_loader = DataLoader(dataset=train_ds, batch_size=32, shuffle=True)

# Network with six tabular inputs, mean-squared-error objective, Adam at 1e-4
model = MultiModalNet(num_tabular_features=6)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-4)

# --- 4. TRAINING LOOP ---
epochs = 14
for epoch in range(epochs):
    model.train()
    # The loss is accumulated weighted by batch size so the epoch average is a
    # true per-sample MSE even when the last batch is smaller than the rest.
    running_loss = 0.0
    seen_samples = 0
    all_preds = []
    all_targets = []

    # The description is constant for the whole epoch, so set it once here.
    loop = tqdm(train_loader, leave=True, desc=f"Epoch [{epoch+1}/{epochs}]")
    for images, tabs, targets in loop:
        images, tabs, targets = images.to(device), tabs.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(images, tabs)
        loss = criterion(outputs, targets)

        loss.backward()
        optimizer.step()

        # Read the scalar once; each .item() call copies it back to the host.
        batch_loss = loss.item()
        batch_size = targets.size(0)
        running_loss += batch_loss * batch_size
        seen_samples += batch_size
        loop.set_postfix(loss=batch_loss)

        all_preds.append(outputs.detach().cpu().numpy())
        all_targets.append(targets.detach().cpu().numpy())

    # Epoch metrics. These use the predictions made during training (weights
    # change between batches), so they describe the training pass itself, not
    # a held-out evaluation.
    avg_loss = running_loss / seen_samples
    all_preds = np.vstack(all_preds)
    all_targets = np.vstack(all_targets)
    rmse = np.sqrt(avg_loss)  # consistent with R2: both cover every sample equally
    r2 = r2_score(all_targets, all_preds)

    print(f"Epoch {epoch+1} Summary - Loss: {avg_loss:.4f}, RMSE: {rmse:.4f}, R2: {r2:.4f}")


# Save the trained model
torch.save(model.state_dict(), '/content/satellite-property-valuation/outputs/multimodal_model.pth')

# --- 5. BASELINE MODEL ---
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Tabular-only reference model: a random forest fitted on the same six columns
# of the cleaned training table.
# NOTE: the forest is scored on the very rows it was fitted on, so the RMSE and
# R2 printed below are in-sample figures.
features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'grade', 'condition']
train_df = pd.read_csv('/content/satellite-property-valuation/data/processed/train_cleaned.csv')
X, y = train_df[features], train_df['price']

baseline_model = RandomForestRegressor(n_estimators=100, random_state=42)
baseline_model.fit(X, y)
baseline_preds = baseline_model.predict(X)

baseline_mse = mean_squared_error(y, baseline_preds)
baseline_rmse = np.sqrt(baseline_mse)
baseline_r2 = r2_score(y, baseline_preds)

for report_line in (
    "--- Tabular-Only Baseline ---",
    f"RMSE: {baseline_rmse:.2f}",
    f"R2 Score: {baseline_r2:.4f}",
):
    print(report_line)

# --- 6. GRAD-CAM ---
from pytorch_grad_cam import GradCAM
from pytorch_grad_cam.utils.image import show_cam_on_image
import matplotlib.pyplot as plt

def generate_gradcam(model, input_image, input_tabular, target_layer):
    """Overlay a Grad-CAM heatmap for one sample on its rescaled image.

    GradCAM drives the model with the image tensor only, so the two-input
    model is wrapped in a single-input module that supplies ``input_tabular``
    on every forward pass.

    Args:
        model: Network called as ``model(image_batch, tabular_batch)``.
        input_image: One image tensor laid out ``(C, H, W)``.
        input_tabular: Tabular features of the same sample; any shape that
            flattens to a single row (e.g. ``(n,)`` or ``(1, n)``).
        target_layer: Layer inside ``model`` whose activations are explained.

    Returns:
        The array returned by ``show_cam_on_image``: the heatmap blended over
        the min-max rescaled input image, in RGB channel order.
    """

    class _ImageOnlyModel(nn.Module):
        """Single-input view of the model with the tabular row held fixed."""

        def __init__(self, wrapped, tabular_row):
            super().__init__()
            self.wrapped = wrapped
            self.tabular_row = tabular_row

        def forward(self, image):
            # Repeat the one tabular row for however many images are passed.
            tabular = self.tabular_row.expand(image.shape[0], -1)
            return self.wrapped(image, tabular)

    # Run on whatever device the model lives on.
    model_device = next(model.parameters()).device
    image_batch = input_image.unsqueeze(0).to(model_device)
    tabular_row = input_tabular.reshape(1, -1).to(model_device)

    # The context manager removes the hooks GradCAM registers on target_layer,
    # so repeated calls do not pile hooks up on the model.
    with GradCAM(model=_ImageOnlyModel(model, tabular_row),
                 target_layers=[target_layer]) as cam:
        grayscale_cam = cam(input_tensor=image_batch, targets=None)[0, :]

    # Rescale the (typically normalised) image to [0, 1] for display.
    img_np = input_image.detach().permute(1, 2, 0).cpu().numpy().astype(np.float32)
    lowest, highest = img_np.min(), img_np.max()
    if highest > lowest:
        img_np = (img_np - lowest) / (highest - lowest)
    else:
        # Constant image (e.g. an all-zero placeholder): avoid dividing by zero.
        img_np = np.zeros_like(img_np)

    visualization = show_cam_on_image(img_np, grayscale_cam, use_rgb=True)
    return visualization

# --- 7. INFERENCE ---
test_ds = MultiModalDataset(csv_file='/content/satellite-property-valuation/data/processed/test_cleaned.csv',
                           img_dir='/content/satellite-property-valuation/data/satellite_images/')
# Batched inference: order is preserved (shuffle=False), and batching avoids
# one forward pass plus one device-to-host copy per sample.
test_loader = DataLoader(test_ds, batch_size=64, shuffle=False)

model.eval()
results = []

with torch.no_grad():
    # The dataset also yields a target (a 0.0 placeholder when the CSV has no
    # 'price' column); it is not needed for prediction.
    for images, tabs, _ in tqdm(test_loader):
        images, tabs = images.to(device), tabs.to(device)
        preds = model(images, tabs)
        # Flatten (batch, 1) -> (batch,) and collect as plain Python floats.
        results.extend(preds.reshape(-1).cpu().tolist())

# Reuse the table the dataset already loaded, so the ids are exactly the rows
# that were iterated, in the same order.
test_df = test_ds.data
output_df = pd.DataFrame({
    'id': test_df['id'],
    'predicted_price': results
})
output_df.to_csv('/content/satellite-property-valuation/outputs/predictions.csv', index=False)
print("Final predictions saved to outputs/predictions.csv")