In [75]:
import os
import pandas as pd
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

class ApartmentDatasetPyTorch(Dataset):
    """Map-style dataset that pairs apartment photos with their listing price.

    Images are discovered by walking ``images_dir``. The price of an image is
    looked up in the CSV by the names of the two directories that contain it:
    ``.../<source>/<id>/<file>`` is matched against the ``source`` and ``id``
    columns, and the ``price`` column of the first matching row is returned.
    """

    # File extensions treated as images; compared case-insensitively so that
    # ".jpg", ".JPG", ".jpeg" and ".JPEG" are all accepted.
    IMAGE_EXTENSIONS = (".jpg", ".jpeg")

    def __init__(self, data_dir, images_dir, transform=None):
        """
        Args:
            data_dir (string): Path to the CSV file with the listings. Despite
                the name this is a file, not a directory; it is passed
                straight to ``pandas.read_csv`` and must provide the
                ``source``, ``id`` and ``price`` columns.
            images_dir (string): Root directory that is walked recursively
                for image files.
            transform (callable, optional): Optional transform to be applied
                on a sample image.
        """
        self.data_dir = data_dir
        self.images_dir = images_dir
        self.transform = transform
        self.df = pd.read_csv(data_dir)
        # Directory names are strings, so compare ids as strings as well.
        self.df["id"] = self.df["id"].astype(str)

        image_paths = []
        for subdir, _dirs, files in os.walk(images_dir):
            for file in files:
                if not file.lower().endswith(self.IMAGE_EXTENSIONS):
                    continue
                img_path = os.path.join(subdir, file)
                # Zero-byte files are failed downloads; skip them up front.
                if os.path.getsize(img_path) > 0:
                    image_paths.append(img_path)
        # os.walk order depends on the filesystem; sort so that an index
        # refers to the same image on every run.
        self.image_paths = sorted(image_paths)

        # Maps sample index -> reason the image could not be loaded.
        # NOTE: with DataLoader(num_workers > 0) each worker process has its
        # own copy of the dataset, so entries written there are not visible
        # in the parent process.
        self.error_log = {}

    def __len__(self):
        """Return the number of image files found under ``images_dir``."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            ``(image, price)`` where ``image`` is the RGB image (after
            ``transform`` if one was given) and ``price`` is an ``int``;
            or ``None`` when the image cannot be read, in which case the
            reason is stored in ``self.error_log[idx]``. The default
            ``DataLoader`` collate function cannot batch ``None``, so pass a
            ``collate_fn`` that drops such entries if unreadable files are
            expected.

        Raises:
            KeyError: If no CSV row matches the image's source/id folders.
        """
        image_path = self.image_paths[idx]
        try:
            # Image.open is lazy; convert() forces the decode inside the try
            # block so corrupt files are caught here, and the context manager
            # closes the file handle. RGB conversion guarantees 3 channels
            # for grayscale/CMYK JPEGs.
            with Image.open(image_path) as img:
                image = img.convert("RGB")
        except Exception as exc:
            self.error_log[idx] = f"cant load: {exc}"
            return None

        if self.transform:
            image = self.transform(image)

        price = self.__get_price_from_image_path(image_path)
        return image, price

    def __get_price_from_image_path(self, image_path):
        """Return the price of the apartment an image belongs to.

        The apartment id is the name of the directory containing the file and
        the source is the name of that directory's parent, so the lookup does
        not depend on how deep ``images_dir`` itself is.

        Raises:
            KeyError: If the CSV has no row for this source/id pair.
        """
        apartment_dir = os.path.dirname(os.path.normpath(image_path))
        ap_native_id = os.path.basename(apartment_dir)
        source = os.path.basename(os.path.dirname(apartment_dir))

        filtered_rows = self.df[(self.df["source"] == source) & (self.df["id"] == ap_native_id)]
        if filtered_rows.empty:
            raise KeyError(
                f"no price found for source={source!r}, id={ap_native_id!r} (image: {image_path})"
            )
        # Several rows may share a source/id pair; use the first one.
        return int(filtered_rows["price"].iloc[0])


# Preprocessing pipeline: fixed-size tensors with values shifted to about [-1, 1].
transform = transforms.Compose(
    [
        transforms.Resize((128, 128)),  # every image becomes 128x128
        transforms.ToTensor(),  # PIL image -> PyTorch tensor
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),  # (x - 0.5) / 0.5 per channel
    ]
)

# Build the dataset from the listings CSV and the image tree.
dataset = ApartmentDatasetPyTorch(
    data_dir="../apartments.csv",
    images_dir="../images",
    transform=transform,
)

# Serve shuffled batches of up to 32 samples.
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Iterate over the batches; the training step would go inside this loop.
for image, price in dataloader:
    print(price)  # prints the batch's prices as a 1-D tensor, one entry per image

tensor([119000, 119000, 119000, 119000, 119000, 119000, 119000, 119000, 119000,
        119000, 119000])


In [113]:
import os
import pandas as pd
import tensorflow as tf
from PIL import Image
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models

import os
import pandas as pd
import tensorflow as tf
from PIL import Image
import numpy as np

class ApartmentDatasetTensorFlow:
    """Feeds apartment photos and their listing prices to ``tf.data``.

    Images are discovered by walking ``images_dir``. The price of an image is
    looked up in the CSV by the names of the two directories that contain it:
    ``.../<source>/<id>/<file>`` is matched against the ``source`` and ``id``
    columns, and the ``price`` column of the first matching row is used.
    """

    # File extensions treated as images; compared case-insensitively so that
    # ".jpg", ".JPG", ".jpeg" and ".JPEG" are all accepted.
    IMAGE_EXTENSIONS = (".jpg", ".jpeg")

    def __init__(self, data_dir, images_dir, image_size=(224, 224)):
        """
        Args:
            data_dir (string): Path to the CSV file with the listings. Despite
                the name this is a file, not a directory; it is passed
                straight to ``pandas.read_csv`` and must provide the
                ``source``, ``id`` and ``price`` columns.
            images_dir (string): Root directory that is walked recursively
                for image files.
            image_size (tuple, optional): ``(width, height)`` every image is
                resized to. Defaults to ``(224, 224)``.
        """
        self.data_dir = data_dir
        self.images_dir = images_dir
        self.image_size = tuple(image_size)
        self.df = pd.read_csv(data_dir)
        # Directory names are strings, so compare ids as strings as well.
        self.df["id"] = self.df["id"].astype(str)

        image_paths = []
        for subdir, _dirs, files in os.walk(images_dir):
            for file in files:
                if not file.lower().endswith(self.IMAGE_EXTENSIONS):
                    continue
                img_path = os.path.join(subdir, file)
                # Zero-byte files are failed downloads; skip them up front.
                if os.path.getsize(img_path) > 0:
                    image_paths.append(img_path)
        # os.walk order depends on the filesystem; sort so that the generator
        # yields samples in the same order on every run.
        self.image_paths = sorted(image_paths)

        # Maps image path -> reason the sample was skipped by generator().
        self.error_log = {}
        self.length = len(self.image_paths)

    def generator(self):
        """Yield ``(image, price)`` for every usable image.

        ``image`` is a float32 array of shape ``(height, width, 3)`` scaled to
        [0, 1]; ``price`` is an ``int``. Images that cannot be decoded, or
        that have no matching CSV row, are skipped and the reason is recorded
        in ``self.error_log`` under the image path, so a single bad sample
        does not abort the whole pipeline.
        """
        for img_path in self.image_paths:
            try:
                # The context manager closes the file handle. RGB conversion
                # guarantees exactly 3 channels (grayscale/CMYK JPEGs would
                # otherwise violate the output signature).
                with Image.open(img_path) as img:
                    resized = img.convert("RGB").resize(self.image_size)
                image = np.array(resized, dtype=np.float32) / 255.0
                price = self.__get_price_from_image_path(img_path)
            except Exception as e:
                self.error_log[img_path] = str(e)
                continue

            yield image, price

    def __get_price_from_image_path(self, image_path):
        """Return the price of the apartment an image belongs to.

        The apartment id is the name of the directory containing the file and
        the source is the name of that directory's parent.

        Raises:
            KeyError: If the CSV has no row for this source/id pair.
        """
        apartment_dir = os.path.dirname(os.path.normpath(image_path))
        ap_native_id = os.path.basename(apartment_dir)
        source = os.path.basename(os.path.dirname(apartment_dir))

        filtered_rows = self.df[(self.df["source"] == source) & (self.df["id"] == ap_native_id)]
        if filtered_rows.empty:
            raise KeyError(
                f"no price found for source={source!r}, id={ap_native_id!r} (image: {image_path})"
            )
        # Several rows may share a source/id pair; use the first one.
        return int(filtered_rows["price"].iloc[0])

    def get_tf_dataset(self):
        """Wrap ``generator`` in a ``tf.data.Dataset`` of (image, price) pairs."""
        width, height = self.image_size
        return tf.data.Dataset.from_generator(
            self.generator,
            output_signature=(
                # PIL sizes are (width, height); arrays are (height, width, channels).
                tf.TensorSpec(shape=(height, width, 3), dtype=tf.float32),
                tf.TensorSpec(shape=(), dtype=tf.float32)
            )
        )
# Index the image tree and load the listings CSV for the TensorFlow pipeline.
# NOTE: this rebinds the name `dataset` used by the PyTorch cell above.
dataset = ApartmentDatasetTensorFlow(data_dir="../apartments.csv", images_dir="../images")

In [117]:
# Build the tf.data pipeline and print the price label of its first sample.
tf_dataset = dataset.get_tf_dataset()
for sample_image, sample_price in tf_dataset.take(1):
    print(sample_price)


tf.Tensor(119000.0, shape=(), dtype=float32)
