In [1]:
import os
import pandas as pd
import tensorflow as tf

In [3]:
# Working directory for the notebook: the CSV and the relative image paths
# stored in the `image_path` column are resolved against it.
# Set the THESIS_DATA_DIR environment variable to run on another machine;
# when it is unset the original location is used, so existing runs are unchanged.
DATA_DIR = os.environ.get("THESIS_DATA_DIR", "C:\\Users\\yana1\\Downloads")
os.chdir(DATA_DIR)

In [5]:
# Read the cleaned sample and discard the leftover 'Unnamed: 0' column
# in a single chained expression, then preview the first rows.
data = (
    pd.read_csv("thesisdata_cleaned_fullsample1.csv")
    .drop(columns=["Unnamed: 0"])
)
data.head()

Unnamed: 0,price_numeric,Height,Width,Uniqueness,gender_guessed,gallery,painting,is_signed,age,years_selling,location,image_path,url
0,3158.0,27.6,27.6,Unique,female,independent,other,signed,0.0,1.0,France,images_thesis\2313827_1_m.jpg,https://www.artsper.com/us/contemporary-artwor...
1,2065.0,19.7,19.7,Unique,female,independent,other,signed,0.0,3.0,France,images_thesis\2328028_1_m.jpg,https://www.artsper.com/us/contemporary-artwor...
2,3522.0,23.6,23.6,Unique,female,independent,other,signed,0.0,3.0,France,images_thesis\2299335_1_m.jpg,https://www.artsper.com/us/contemporary-artwor...
3,644.0,15.7,11.8,Unique,female,gallery,oil,signed,4.0,5.0,other,images_thesis\1118562_1_m.jpg,https://www.artsper.com/us/contemporary-artwor...
4,838.0,15.7,15.7,Unique,unknown,independent,other,signed,4.0,5.0,France,images_thesis\1241121_1_m.jpg,https://www.artsper.com/us/contemporary-artwor...


In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

# ===============================
# 1) Copy & remove UNUSED columns (BUT KEEP image_path!)
# ===============================
# image_path stays in the frame because the CNN datasets built at the end of
# this cell need it; it is excluded from the tabular feature matrix below.
df = data.copy()
df = df.drop(columns=["url"])    # DO NOT drop image_path

target = "price_numeric"
numeric = ["Height", "Width", "age", "years_selling"]
categorical = ["Uniqueness", "gender_guessed", "gallery", "painting", "is_signed", "location"]

# ===============================
# 2) Initial Train/Test split FIRST
# ===============================
df_train, df_test = train_test_split(df, test_size=0.15, random_state=123)

# ===============================
# 3) Compute Tukey limits using TRAIN ONLY
# ===============================
Q1 = df_train[target].quantile(0.25)
Q3 = df_train[target].quantile(0.75)
IQR = Q3 - Q1

lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

# ===============================
# 4) Remove outliers from TRAIN ONLY
# ===============================
df_train = df_train[(df_train[target] >= lower) & (df_train[target] <= upper)]

# ===============================
# 5) Remove test outliers using TRAIN thresholds (NO leakage)
# ===============================
# NOTE(review): the thresholds come from train, but the test rows are still
# filtered on their own target value, so reported test metrics describe
# in-range prices only.
df_test = df_test[(df_test[target] >= lower) & (df_test[target] <= upper)]

# ===============================
# 6) Train/Validation split (AFTER cleaning)
# ===============================
# 0.1765 is approximately 0.15 / 0.85, i.e. validation ends up about the same
# size as the test split.
df_train, df_val = train_test_split(df_train, test_size=0.1765, random_state=123)

# ===============================
# 7) EXTRACT raw X/y FOR MLP (image_path EXCLUDED)
# ===============================
X_train_raw = df_train[numeric + categorical]   # image_path NOT included
y_train = df_train[target]

X_val_raw = df_val[numeric + categorical]
y_val = df_val[target]

X_test_raw = df_test[numeric + categorical]
y_test = df_test[target]

# ===============================
# 8) One-hot encoding (column layout defined by TRAIN ONLY)
# ===============================
# drop_first=True is applied to the TRAIN split only. Encoding every split
# independently with drop_first=True lets each split pick its own "first"
# level to drop; if val/test happens to lack the train baseline level, a
# different level is dropped there and the reindex below would zero-fill it,
# silently encoding those rows as the baseline category.
# Val/test are therefore fully expanded and then projected onto the train
# columns: the baseline and unseen levels are discarded, missing levels are 0.
# dtype=float keeps the whole matrix numeric (no bool/float mix) for Keras.
X_train = pd.get_dummies(X_train_raw, columns=categorical, drop_first=True, dtype=float)
X_val   = pd.get_dummies(X_val_raw,   columns=categorical, dtype=float)
X_test  = pd.get_dummies(X_test_raw,  columns=categorical, dtype=float)

# Align columns
X_val  = X_val.reindex(columns=X_train.columns, fill_value=0.0)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0.0)

assert list(X_val.columns) == list(X_train.columns), "val columns not aligned with train"
assert list(X_test.columns) == list(X_train.columns), "test columns not aligned with train"

# ===============================
# 9) Scale numeric features using RobustScaler (FIT ON TRAIN ONLY)
# ===============================
scaler = RobustScaler()
X_train[numeric] = scaler.fit_transform(X_train[numeric])
X_val[numeric]   = scaler.transform(X_val[numeric])
X_test[numeric]  = scaler.transform(X_test[numeric])

# ===============================
# Final shapes for MLP
# ===============================
print("Train shape:", X_train.shape)
print("Val shape:",   X_val.shape)
print("Test shape:",  X_test.shape)

# ===============================
# SAVE CNN DATASETS (WITH image_path)
# ===============================
# Same rows as the MLP splits, reduced to the image location and the target.
df_train_cnn = df_train[['image_path', target]].copy()
df_val_cnn   = df_val[['image_path', target]].copy()
df_test_cnn  = df_test[['image_path', target]].copy()

print("CNN Train:", df_train_cnn.shape)
print("CNN Val:",   df_val_cnn.shape)
print("CNN Test:",  df_test_cnn.shape)


Train shape: (13404, 15)
Val shape: (2873, 15)
Test shape: (2855, 15)
CNN Train: (13404, 2)
CNN Val: (2873, 2)
CNN Test: (2855, 2)


In [25]:
import tensorflow as tf

# Seed the Python, NumPy and TensorFlow random generators before any weights
# are created, so that weight initialisation, dropout masks and batch
# shuffling are repeatable on Restart & Run All. 123 matches the random_state
# used for the data splits.
# NOTE(review): some GPU kernels may still be non-deterministic.
tf.keras.utils.set_random_seed(123)

# Fully connected regressor on the tabular features: three Dense+BatchNorm
# blocks of decreasing width (dropout only after the first), followed by a
# single linear output unit for the price.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),

    # Block 1
    tf.keras.layers.Dense(256, activation="relu"),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.2),

    # Block 2
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.BatchNormalization(),

    # Block 3
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.BatchNormalization(),

    # Output layer
    tf.keras.layers.Dense(1)
])

In [27]:
# Callbacks, both watching the validation loss:
#  - stop after 20 epochs without improvement and roll back to the best weights
#  - halve the learning rate after 5 stagnant epochs, never going below 1e-5
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=20, restore_best_weights=True,
)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss", factor=0.5, patience=5, min_lr=1e-5,
)

# Huber() is used with its default settings on the raw price target; in the
# training log the loss equals mae - 0.5, i.e. it behaves like MAE here.
model.compile(
    loss=tf.keras.losses.Huber(),
    optimizer=tf.keras.optimizers.AdamW(weight_decay=1e-4, learning_rate=1e-3),
    metrics=["mae"],
)

# Up to 300 epochs; early stopping normally ends training sooner.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=300,
    validation_data=(X_val, y_val),
    callbacks=[early_stop, reduce_lr],
    verbose=1,
)


Epoch 1/300
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 1845.1415 - mae: 1845.6415 - val_loss: 1816.2614 - val_mae: 1816.7614 - learning_rate: 0.0010
Epoch 2/300
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1841.1925 - mae: 1841.6925 - val_loss: 1811.3721 - val_mae: 1811.8721 - learning_rate: 0.0010
Epoch 3/300
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1833.6604 - mae: 1834.1604 - val_loss: 1802.1636 - val_mae: 1802.6636 - learning_rate: 0.0010
Epoch 4/300
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1822.4445 - mae: 1822.9445 - val_loss: 1789.1063 - val_mae: 1789.6063 - learning_rate: 0.0010
Epoch 5/300
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1807.6182 - mae: 1808.1182 - val_loss: 1771.6996 - val_mae: 1772.1996 - learning_rate: 0.0010
Epoch 6/300
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[

In [29]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Score the tabular test split with the trained network; the single-unit
# output is flattened to a 1-D vector.
y_pred = model.predict(X_test).ravel()

# Error metrics; the ratio expresses RMSE relative to the mean test price.
mae  = mean_absolute_error(y_test, y_pred)
mse  = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmse_ratio = rmse / y_test.mean()

for _caption, _score in (
    ("Test MSE:", mse),
    ("Test MAE:", mae),
    ("Test RMSE:", rmse),
    ("RMSE Ratio:", rmse_ratio),
):
    print(_caption, _score)


[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Test MSE: 939643.9152067989
Test MAE: 605.347353463248
Test RMSE: 969.3523173783611
RMSE Ratio: 0.5187200946226167


In [33]:
#### IMAGE LOADER

import tensorflow as tf
from tensorflow.keras.applications.efficientnet import preprocess_input

# Side length in pixels: images are resized to IMG_SIZE x IMG_SIZE and the CNN
# input shape is (IMG_SIZE, IMG_SIZE, 3).
IMG_SIZE = 224
# Number of (image, price) pairs per batch in the tf.data pipelines.
BATCH_SIZE = 32
# Sentinel passed to map()/prefetch() so tf.data picks the parallelism itself.
AUTOTUNE = tf.data.AUTOTUNE

def load_image(path, label):
    """Read one JPEG from disk and return it as a model-ready tensor.

    Args:
        path: Location of the image file, as stored in the `image_path` column.
        label: Target value paired with the image; passed through untouched.

    Returns:
        A tuple `(image, label)` where `image` is a 3-channel tensor resized to
        IMG_SIZE x IMG_SIZE and run through `preprocess_input`.
    """
    raw_bytes = tf.io.read_file(path)
    pixels = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(pixels, (IMG_SIZE, IMG_SIZE))
    # NOTE(review): the Keras docs describe EfficientNet's preprocess_input as a
    # pass-through (rescaling lives inside the model) - confirm for the
    # installed version before relying on it for normalisation.
    return preprocess_input(resized), label

def make_image_dataset(frame, training=False):
    """Build a batched, prefetched tf.data pipeline of (image, price) pairs.

    Args:
        frame: DataFrame with an `image_path` column and a `price_numeric`
            column (one of the df_*_cnn frames).
        training: When True the rows are reshuffled on every epoch. Leave it
            False for validation/test so that predictions stay in row order.

    Returns:
        A `tf.data.Dataset` yielding batches of BATCH_SIZE decoded images and
        their prices.
    """
    ds = tf.data.Dataset.from_tensor_slices(
        (frame["image_path"], frame["price_numeric"])
    )
    if training:
        # Shuffle the lightweight (path, price) pairs BEFORE decoding: the
        # buffer can then span the whole split (a uniform shuffle) without
        # keeping decoded images in memory, unlike a 1024-image buffer placed
        # after map().
        ds = ds.shuffle(buffer_size=max(len(frame), 1), reshuffle_each_iteration=True)
    return (
        ds.map(load_image, num_parallel_calls=AUTOTUNE)
        .batch(BATCH_SIZE)
        .prefetch(AUTOTUNE)
    )


train_ds = make_image_dataset(df_train_cnn, training=True)
val_ds = make_image_dataset(df_val_cnn)
test_ds = make_image_dataset(df_test_cnn)


In [35]:
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras import layers, models

# ImageNet-pretrained EfficientNetB0 used as a feature extractor: no
# classification top, and its weights are excluded from training.
base = EfficientNetB0(
    include_top=False,
    weights="imagenet",
    input_shape=(IMG_SIZE, IMG_SIZE, 3),
)
base.trainable = False   # Freeze EfficientNet base

inputs = layers.Input(shape=(IMG_SIZE, IMG_SIZE, 3))

# Regression head, applied in this order to the backbone output:
# global pooling, then two Dense+Dropout stages.
head_layers = [
    layers.GlobalAveragePooling2D(),
    layers.Dense(256, activation="relu"),
    layers.Dropout(0.3),
    layers.Dense(64, activation="relu"),
    layers.Dropout(0.2),
]

x = base(inputs, training=False)   # backbone is always called in inference mode
for head_layer in head_layers:
    x = head_layer(x)

# Single linear unit: the predicted price.
outputs = layers.Dense(1)(x)

model = models.Model(inputs, outputs)
model.summary()


In [37]:
# Train only the regression head (the backbone is frozen) by minimising MAE;
# the same quantity is also reported as a metric.
cnn_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
model.compile(optimizer=cnn_optimizer, loss="mae", metrics=["mae"])

# Fixed budget of 20 epochs, validated on val_ds after each one.
history = model.fit(
    train_ds,
    epochs=20,
    validation_data=val_ds,
    verbose=1,
)


Epoch 1/20
[1m419/419[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m365s[0m 844ms/step - loss: 1668.4501 - mae: 1668.4501 - val_loss: 1257.8141 - val_mae: 1257.8141
Epoch 2/20
[1m419/419[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m326s[0m 773ms/step - loss: 1159.5308 - mae: 1159.5308 - val_loss: 1079.5375 - val_mae: 1079.5375
Epoch 3/20
[1m419/419[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 765ms/step - loss: 1101.0612 - mae: 1101.0612 - val_loss: 1060.6377 - val_mae: 1060.6377
Epoch 4/20
[1m419/419[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 763ms/step - loss: 1082.2972 - mae: 1082.2972 - val_loss: 1047.2738 - val_mae: 1047.2738
Epoch 5/20
[1m419/419[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 765ms/step - loss: 1069.6781 - mae: 1069.6781 - val_loss: 1036.9413 - val_mae: 1036.9413
Epoch 6/20
[1m419/419[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 765ms/step - loss: 1059.9360 - mae: 1059.9360 - val_loss: 1027.9498 - val_mae: 10

In [39]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Ground-truth prices, in the row order of df_test_cnn (the test pipeline is
# built from this frame without shuffling, so predictions line up with it).
y_true = df_test_cnn["price_numeric"].to_numpy()

# Model output for every test image, flattened to a 1-D vector.
y_pred = model.predict(test_ds).ravel()

# Error metrics; the ratio expresses RMSE relative to the mean test price.
mae  = mean_absolute_error(y_true, y_pred)
mse  = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
rmse_ratio = rmse / y_true.mean()

for _caption, _score in (
    ("Test MSE:", mse),
    ("Test MAE:", mae),
    ("Test RMSE:", rmse),
    ("RMSE Ratio:", rmse_ratio),
):
    print(_caption, _score)


[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 705ms/step
Test MSE: 1961648.8768552984
Test MAE: 998.8659815372393
Test RMSE: 1400.588760791439
RMSE Ratio: 0.7494834659187525


In [45]:
from tensorflow.keras.models import load_model
# Write the current `model` to CNN.h5 in the working directory. `model` is the
# EfficientNet-based network here only if the cells were run top to bottom.
# NOTE(review): the .h5 suffix presumably selects Keras's legacy HDF5 format;
# newer Keras versions recommend the native .keras format - consider switching
# if nothing downstream depends on this file name.
model.save("CNN.h5") 




In [47]:
# Print the layer-by-layer summary of the current `model` once more; the same
# call already runs at the end of the CNN build cell.
model.summary()