In [1]:
import pandas as pd

# Read the listings table and keep only the first 5000 rows (positional slice).
df = pd.read_csv('Data/socal2.csv').iloc[:5000]
df.head()

Unnamed: 0,image_id,street,citi,n_citi,bed,bath,sqft,price
0,0,1317 Van Buren Avenue,"Salton City, CA",317,3,2.0,1560,201900
1,1,124 C Street W,"Brawley, CA",48,3,2.0,713,228500
2,2,2304 Clark Road,"Imperial, CA",152,3,1.0,800,273950
3,3,755 Brawley Avenue,"Brawley, CA",48,3,1.0,1082,350000
4,4,2207 R Carrillo Court,"Calexico, CA",55,4,3.0,2547,385100


In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

In [3]:
# Features: every column except the target (`price`) and the two id-like
# columns `n_citi` and `image_id`. Target: the raw `price` column.
X = df.drop(columns=['price', 'n_citi', 'image_id'])
y = df['price']

In [4]:
# Column labels split by dtype: int64/float64 columns are treated as numeric,
# object-dtype columns as categorical.
X_num = X.select_dtypes(["int64", "float64"]).columns
X_cat = X.select_dtypes("object").columns

In [5]:
# Numeric columns: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent value,
# map each category to an integer code, then standardize the codes.
#
# `handle_unknown='use_encoded_value'` makes the fitted encoder emit -1 for a
# category it did not see during fit instead of raising, so the preprocessor
# can be reused on new rows (e.g. a listing with a new street address).
# Codes for categories seen during fit are unchanged.
#
# NOTE(review): ordinal codes impose an arbitrary order on the categories;
# confirm this is acceptable for free-text columns such as street addresses.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
    ('scaler', StandardScaler()),
])



In [6]:
# Send the numeric and categorical column groups through their own pipelines.
tabular_preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, X_num),
    ('cat', categorical_transformer, X_cat),
])

# NOTE(review): the preprocessor is fit on every row of X, including rows that
# are later held out for validation - confirm that this is intended.
X_preprocessed = tabular_preprocessor.fit_transform(X)
X_preprocessed

array([[-0.34842841, -0.24763986, -0.3417661 , -1.26548362,  0.92202006],
       [-0.34842841, -0.24763986, -1.33017222, -1.36504042, -1.55839367],
       [-0.34842841, -1.22614411, -1.22864762, -0.29714487, -0.6366183 ],
       ...,
       [-0.34842841,  1.80721908,  1.33980795,  0.97582066, -0.36846546],
       [ 0.66680589,  0.82871482,  3.74722451,  1.30767664,  0.73766499],
       [ 0.66680589,  0.7308644 ,  1.37831728,  0.47633485,  1.42480663]],
      shape=(5000, 5))

In [7]:
from tensorflow.keras.layers import Input, Dense, Concatenate, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.applications import ResNet50

# --- Tabular branch: two ReLU Dense layers (128 -> 64) on the preprocessed features
num_tabular_features = X_preprocessed.shape[1]
tabular_input = Input(shape=(num_tabular_features,), name="tabular_input")
x_tab = tabular_input
for n_units in (128, 64):
    x_tab = Dense(n_units, activation='relu')(x_tab)

# --- Image branch: ImageNet ResNet50 backbone without its classifier head
image_input = Input(shape=(224, 224, 3), name="image_input")
resnet_base = ResNet50(weights='imagenet', include_top=False, input_tensor=image_input)
resnet_base.trainable = False   # IMPORTANT: keep the pretrained backbone frozen

# Pool the backbone's feature maps to a vector, then project to 256 units.
x_img = GlobalAveragePooling2D()(resnet_base.output)
x_img = Dense(256, activation='relu')(x_img)

# --- Fusion head: concatenate both branches and regress a single value
combined = Concatenate()([x_tab, x_img])
z = combined
for n_units in (128, 64):
    z = Dense(n_units, activation='relu')(z)
output = Dense(1, activation='linear', name='price')(z)

model = Model(inputs=[tabular_input, image_input], outputs=output)
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

model.summary()


  if not hasattr(np, "object"):


In [8]:
from tensorflow.keras.applications.resnet50 import preprocess_input
import tensorflow as tf
import numpy as np

def load_images(image_ids, folder_path="Data/socal2/socal_pics", target_size=(224, 224)):
    """
    Loads and preprocesses images for ResNet50.

    Each image is read from ``{folder_path}/{image_id}.jpg``, decoded as a
    3-channel JPEG, resized to ``target_size`` and passed through the
    ResNet50 ``preprocess_input`` function.

    Args:
        image_ids (iterable): Image IDs or filenames without extension.
        folder_path (str): Base folder where images are stored.
        target_size (tuple): ``(height, width)`` every image is resized to.
            Defaults to ``(224, 224)``.

    Returns:
        np.ndarray: float32 array of shape ``(N, height, width, 3)``. An empty
        ``image_ids`` yields an array of shape ``(0, height, width, 3)`` so
        the result always has the rank the model input expects.
    """
    ids = list(image_ids)  # accept any iterable, and learn N up front
    height, width = target_size

    # Preallocate the whole batch once instead of collecting per-image tensors
    # in a list and converting at the end: this avoids holding two full copies
    # of the data and gives the correct 4-D shape even when `ids` is empty.
    batch = np.empty((len(ids), height, width, 3), dtype=np.float32)

    for row, img_id in enumerate(ids):
        path = f"{folder_path}/{img_id}.jpg"  # adjust extension if needed
        raw = tf.io.read_file(path)
        img = tf.image.decode_jpeg(raw, channels=3)      # decode JPEG
        img = tf.image.resize(img, (height, width))      # resize to model input
        img = preprocess_input(img)                      # ResNet preprocessing
        batch[row] = np.asarray(img)

    return batch


In [9]:
# The training cell uses `validation_split=0.2`, and Keras takes that split
# from the END of the arrays before any shuffling. The rows here follow the
# CSV order (which appears to be grouped by city), so without reordering the
# validation set would be one contiguous tail of the file. Apply a single
# seeded permutation to all three arrays so they stay row-aligned while the
# held-out tail becomes a random sample.
#
# Shuffling the ids *before* loading avoids making a second copy of the
# (large) image array afterwards.
#
# NOTE(review): row i of these arrays now corresponds to
# df.iloc[shuffle_order[i]], not df.iloc[i]; use `shuffle_order` to map
# predictions back to the dataframe.
shuffle_order = np.random.default_rng(42).permutation(len(df))

X_tab = X_preprocessed[shuffle_order]
X_img = load_images(df['image_id'].iloc[shuffle_order].tolist())
y = df['price'].values[shuffle_order]

assert len(X_tab) == len(X_img) == len(y), "tabular, image and target rows are misaligned"


In [10]:
# Sanity check, one shape per line:
#   tabular (N, num_features), images (N, 224, 224, 3), target (N,)
print(X_tab.shape, X_img.shape, y.shape, sep="\n")


(5000, 5)
(5000, 224, 224, 3)
(5000,)


In [11]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Stop once validation loss has not improved for 5 epochs and roll the model
# back to the weights of its best epoch.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Write the model to disk only when validation loss improves.
model_checkpoint = ModelCheckpoint(
    filepath='models/best_model.keras', monitor='val_loss', save_best_only=True
)


In [12]:
# Train on both inputs (tabular features + images) against the price target,
# holding out 20% of the samples for validation.
history = model.fit(
    x=[X_tab, X_img],
    y=y,
    batch_size=16,
    epochs=50,
    validation_split=0.2,
    callbacks=[early_stop, model_checkpoint],
)


Epoch 1/50
[1m 34/250[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m7:58[0m 2s/step - loss: 566250075316.7059 - mae: 655081.1636

KeyboardInterrupt: 