In [None]:
!pip install requests pillow tqdm



In [None]:
import os
from io import BytesIO

import numpy as np
import pandas as pd
import requests
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
from tqdm import tqdm

In [None]:
# Mapbox access token, read from the environment so the secret is never stored
# in the notebook or its version history. Set it before running this cell, e.g.
# `%env MAPBOX_TOKEN=pk....` in a scratch cell that is not saved, or export it
# in the shell that launches Jupyter.
# NOTE(review): a token was previously committed on this line; treat it as
# leaked and revoke/rotate it in the Mapbox account.
MAPBOX_TOKEN = os.environ.get("MAPBOX_TOKEN", "")
if not MAPBOX_TOKEN:
    print("WARNING: MAPBOX_TOKEN is not set; Mapbox image requests will be rejected.")

In [None]:
# Load the training table and give it a clean 0..N-1 integer index.
df = pd.read_csv("/train_Dataset.csv").reset_index(drop=True)

# Quick look at the data: first rows, then the number of rows.
print(df.head())
print(df.shape[0])

           id             date  bedrooms  bathrooms  sqft_living  sqft_lot  \
0  2591820310  20141006T000000         4       2.25         2070      8893   
1  7974200820  20140821T000000         5       3.00         2900      6730   
2  7701450110  20140815T000000         4       2.50         3770     10893   
3  9522300010  20150331T000000         3       3.50         4560     14608   
4  9510861140  20140714T000000         3       2.50         2550      5376   

   floors  waterfront  view  condition  grade  sqft_above  sqft_basement  \
0     2.0           0     0          4      8        2070              0   
1     1.0           0     0          5      8        1830           1070   
2     2.0           0     2          3     11        3770              0   
3     2.0           0     2          3     12        4560              0   
4     2.0           0     0          3      9        2550              0   

   yr_built  yr_renovated  zipcode      lat     long  sqft_living15  \
0  

In [None]:
# Run on the GPU when one is available, otherwise on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")

# Pretrained ResNet-18 whose final fully connected layer is replaced by an
# identity, so a forward pass returns the features that used to feed `fc`
# instead of class scores.
backbone = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
backbone.fc = torch.nn.Identity()

model = backbone.to(device)
# Switch to evaluation mode; as the cell's last expression, the returned
# module is also displayed as the cell output.
model.eval()

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


100%|██████████| 44.7M/44.7M [00:00<00:00, 194MB/s]


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [None]:
# Preprocessing applied to every fetched image before it is fed to the model.
preprocess_steps = [
    # Resize to a fixed 224x224 input.
    transforms.Resize((224, 224)),
    # Convert the PIL image to a tensor.
    transforms.ToTensor(),
    # Per-channel normalisation with the ImageNet mean/std that torchvision's
    # pretrained weights were trained with.
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
transform = transforms.Compose(preprocess_steps)

In [None]:
def fetch_satellite_image(lat, long, zoom=17, size=256, timeout=30):
    """Download a square Mapbox satellite image centred on a coordinate.

    Args:
        lat: Latitude of the image centre.
        long: Longitude of the image centre.
        zoom: Mapbox zoom level of the rendered image.
        size: Width and height of the requested image, in pixels.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            single stalled connection cannot hang a long batch run.

    Returns:
        The downloaded image as an RGB ``PIL.Image.Image``.

    Raises:
        requests.HTTPError: If Mapbox answers with an error status (for
            example an invalid token or an exceeded rate limit).
        requests.RequestException: On connection failures or timeouts.
    """
    url = (
        "https://api.mapbox.com/styles/v1/mapbox/satellite-v9/static/"
        f"{long},{lat},{zoom}/{size}x{size}"
    )
    # Send the token as a query parameter instead of formatting it into the
    # URL string by hand.
    response = requests.get(
        url,
        params={"access_token": MAPBOX_TOKEN},
        timeout=timeout,
    )
    # Fail with a clear HTTP error rather than letting PIL try to decode an
    # error payload as an image.
    response.raise_for_status()
    img = Image.open(BytesIO(response.content)).convert("RGB")
    return img

In [None]:
BATCH_SIZE = 32
# One feature vector per row of `df`, appended in row order.
embeddings = []

# Pull the coordinates out once and walk them by position. This avoids the
# per-row Series that `iterrows` builds and, more importantly, does not depend
# on the DataFrame's index labels to detect the final (possibly partial) batch.
coords = list(zip(df["lat"], df["long"]))

with tqdm(total=len(coords)) as progress:
    for start in range(0, len(coords), BATCH_SIZE):
        # Fetch and preprocess the images of one batch.
        batch_imgs = []
        for lat, long in coords[start:start + BATCH_SIZE]:
            img = fetch_satellite_image(lat, long)
            batch_imgs.append(transform(img))
            progress.update(1)

        batch_tensor = torch.stack(batch_imgs).to(device)

        # Inference only: no gradients are needed for feature extraction.
        with torch.no_grad():
            batch_emb = model(batch_tensor).cpu().numpy()

        # `extend` adds one row vector per image, keeping `embeddings` a flat
        # list regardless of how many images the batch contained.
        embeddings.extend(batch_emb)

100%|██████████| 5404/5404 [27:23<00:00,  3.29it/s]


In [None]:
# Stack the per-row feature vectors into a single 2-D array.
embeddings = np.array(embeddings)
print(embeddings.shape)   # (N, 512)

# `pd.concat(..., axis=1)` aligns on the index and pads with NaN when the two
# frames have different lengths, so verify the embeddings line up with `df`
# before writing anything to disk.
if embeddings.ndim != 2 or len(embeddings) != len(df):
    raise ValueError(
        f"Expected one embedding per row of df ({len(df)} rows), "
        f"got an array of shape {embeddings.shape}"
    )

# One column per embedding dimension: img_emb_0, img_emb_1, ...
emb_df = pd.DataFrame(
    embeddings,
    columns=[f"img_emb_{i}" for i in range(embeddings.shape[1])]
)

# Place the embedding columns to the right of the original tabular features.
final_df = pd.concat([df.reset_index(drop=True), emb_df], axis=1)

final_df.to_csv("data_with_embeddings.csv", index=False)

(5404, 512)


In [None]:
# Offer the CSV as a browser download when running in Google Colab. The import
# is kept in this cell because `google.colab` only exists inside Colab; guarding
# it lets the notebook run to completion in other environments.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping browser download of data_with_embeddings.csv")
else:
    files.download("data_with_embeddings.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>