# Loading the Data

In [1]:
import pandas as pd

df = pd.read_csv('Data/socal2.csv')[:7000]
df.head()

Unnamed: 0,image_id,street,citi,n_citi,bed,bath,sqft,price
0,0,1317 Van Buren Avenue,"Salton City, CA",317,3,2.0,1560,201900
1,1,124 C Street W,"Brawley, CA",48,3,2.0,713,228500
2,2,2304 Clark Road,"Imperial, CA",152,3,1.0,800,273950
3,3,755 Brawley Avenue,"Brawley, CA",48,3,1.0,1082,350000
4,4,2207 R Carrillo Court,"Calexico, CA",55,4,3.0,2547,385100


# Creating a Pipeline for Transforming Tabular Data

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder

In [9]:
X = df.drop(['price','n_citi','image_id','street'],axis=1)
y = df.price

In [10]:
y

0       201900
1       228500
2       273950
3       350000
4       385100
         ...  
6995    599000
6996    649000
6997    599999
6998    610000
6999    625000
Name: price, Length: 7000, dtype: int64

# Featue Engineering

In [11]:
X['TotalRooms'] = X['bed'] + X['bath']
X['Room_per_Sqft'] = X['TotalRooms']/X['sqft']

In [12]:
X_numerical = X.select_dtypes('number')

In [13]:
X_num = X.select_dtypes(include=["int64", "float64"]).columns
X_cat = X.select_dtypes(include=['object']).columns

In [14]:
numeric_transformer = Pipeline([('imputer',SimpleImputer(strategy='median')),
                                ('scaler',MinMaxScaler())])
categorical_transformer = Pipeline([('imputer',SimpleImputer(strategy='most_frequent')),
                                    ('encoder',OrdinalEncoder()),
                                ('scaler',MinMaxScaler())])



In [15]:
tabular_preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, X_num),
        ('cat', categorical_transformer, X_cat)
    ]
)


In [17]:
X.iloc[1]

citi             Brawley, CA
bed                        3
bath                     2.0
sqft                     713
TotalRooms               5.0
Room_per_Sqft       0.007013
Name: 1, dtype: object

In [20]:
tabular_preprocessor.fit_transform(X)

array([[0.22222222, 0.05555556, 0.14861256, 0.07894737, 0.08599634,
        0.75097276],
       [0.22222222, 0.05555556, 0.05027284, 0.07894737, 0.20460122,
        0.10894942],
       [0.22222222, 0.02777778, 0.06037385, 0.05263158, 0.14190727,
        0.35797665],
       ...,
       [0.33333333, 0.08333333, 0.29292929, 0.13157895, 0.06394798,
        0.72762646],
       [0.33333333, 0.08333333, 0.39881574, 0.13157895, 0.04485058,
        0.72762646],
       [0.22222222, 0.05555556, 0.12771392, 0.07894737, 0.09901907,
        0.20233463]], shape=(7000, 6))

In [23]:
tabular_preprocessor.transform(X.iloc[[1]])

array([[0.22222222, 0.05555556, 0.05027284, 0.07894737, 0.20460122,
        0.10894942]])

# Evaluting XG Boost Regressor Model

### Function for Deriving Image Embeddings

In [10]:
import os
import numpy as np
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import GlobalAveragePooling2D
from tensorflow.keras.models import Model

def extract_image_embeddings(df, image_dir, image_id_col='image_id',
                             image_size=(224, 224), batch_size=32):
    """
    Load images and extract embeddings using frozen ResNet50.

    Parameters:
        df: pandas.DataFrame
            DataFrame containing at least the column with image IDs.
        image_dir: str
            Folder containing images. Assumes filenames are '<image_id>.jpg'.
        image_id_col: str
            Column in df that contains image IDs.
        image_size: tuple
            Target size for images (height, width).
        batch_size: int
            Batch size for CNN predictions.

    Returns:
        img_embeddings: np.ndarray
            Extracted image embeddings (shape: num_samples x 2048)
    """

    # 1️⃣ Load and preprocess images
    image_paths = [os.path.join(image_dir, f"{img_id}.jpg") for img_id in df[image_id_col]]
    X_images = []
    for path in image_paths:
        img = load_img(path, target_size=image_size)
        arr = img_to_array(img) / 255.0  # normalize
        X_images.append(arr)
    X_images = np.array(X_images)

    # 2️⃣ Load frozen ResNet50 and extract embeddings
    base_model = ResNet50(weights='imagenet', include_top=False,
                          input_shape=(image_size[0], image_size[1], 3))
    base_model.trainable = False
    x = GlobalAveragePooling2D()(base_model.output)
    resnet_model = Model(inputs=base_model.input, outputs=x)

    img_embeddings = resnet_model.predict(X_images, batch_size=batch_size, verbose=1)
    return img_embeddings


  if not hasattr(np, "object"):


In [11]:
img_features = extract_image_embeddings(df, image_dir='Data\\socal2\\socal_pics')
print(img_features.shape)  

[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m629s[0m 3s/step
(7000, 2048)


In [12]:
X_tab_preprocessed = tabular_preprocessor.fit_transform(X)
X_combined = np.hstack([X_tab_preprocessed, img_features])

In [16]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
        X_combined, y, test_size=0.2, random_state=42
    )

In [17]:
from xgboost import XGBRegressor

model = XGBRegressor()
model.fit(
    X_train, y_train,
    )

y_pred = model.predict(X_test)
print("Evaluation metrics:")
print("R²:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))




Evaluation metrics:
R²: 0.6074444055557251
MAE: 162561.21875
RMSE: 259590.02754343243


# Saving the model and pipeline

In [27]:
import joblib
import os

os.makedirs('models',exist_ok=True)
os.makedirs('pipelines',exist_ok=True)

# joblib.dump(model,'models/model.joblib')
joblib.dump(tabular_preprocessor,'pipelines/preprocessor.joblib')

['pipelines/preprocessor.joblib']

In [28]:
import joblib
model = joblib.load('models/model.joblib')
pipeline = joblib.load('pipelines/preprocessor.joblib')