In [None]:
# Mount Google Drive at /content/drive; every dataset path used in this notebook
# lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import numpy as np
import pandas as pd
from PIL import Image
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import tensorflow as tf
from tensorflow.keras import layers, regularizers
import xgboost as xgb


This cell imports the libraries used for file handling, numerical operations, data manipulation, image processing, deep learning, and model building.

In [None]:
# Dataset locations on the mounted Google Drive. The CSV files share the
# hackathon folder; the image folders sit directly under MyDrive.
_drive_root = '/content/drive/MyDrive'
_hackathon_dir = f'{_drive_root}/hackathon'

csv_file_path = f'{_hackathon_dir}/train_data.csv'
test_csv_file_path = f'{_hackathon_dir}/test_data_no_target.csv'
submission_template_path = f'{_hackathon_dir}/submission_template.csv'

train_img_folder = f'{_drive_root}/images_train/'
test_img_folder = f'{_drive_root}/images_test/'

These paths specify locations for the training and test CSV files, a submission template, and folders containing the training and test images.

In [None]:
# Read the training table and the test table (which has no price column),
# casting the ID column of each to int right after loading.
train_data = pd.read_csv(csv_file_path).astype({'ID': int})
test_data = pd.read_csv(test_csv_file_path).astype({'ID': int})


Here, the training and test CSV files are loaded into DataFrames, and the ID column is cast to an integer data type for consistency. When we first started, the ID column was read as floats, which made the IDs unusable for lookups.

In [None]:
# Scale the training target by its maximum. max_price is kept so predictions
# can be converted back to actual prices later.
# NOTE(review): this cell rewrites train_data/test_data in place, so running it
# a second time would recompute the maxima from already-scaled values.
max_price = train_data['Price'].max()
train_data['Price'] = train_data['Price'].div(max_price)

# Scale each tabular feature by its training-set maximum and apply the same
# divisor to the test set.
for feature in ('Bedrooms', 'Bathrooms', 'Area', 'ZipCode'):
    max_val = train_data[feature].max()
    train_data[feature] = train_data[feature].div(max_val)
    test_data[feature] = test_data[feature].div(max_val)


This normalizes the Price field in the training set file and scales the other features in both training and test data based on their maximum values in the training set.

In [None]:
# Size every image is resized to before it is fed to the model
img_size = (32, 32)

# ID -> list of image file paths, filled in separately for each data split
train_image_paths, test_image_paths = dict(), dict()


# Create dictionaries with image paths for each house ID in training and test sets
def populate_image_paths(data_split, image_folder, image_paths,
                         rooms=('front', 'bedroom', 'kitchen', 'bathroom'),
                         extension='.jpg'):
    """Record, per house ID, the image files that actually exist on disk.

    Args:
        data_split: DataFrame with an 'ID' column; one lookup is made per row.
        image_folder: Directory expected to contain files named
            ``<ID>_<room><extension>``.
        image_paths: Dict updated in place, mapping int ID -> list of existing
            file paths in the order given by ``rooms``. IDs with no file on
            disk get no entry.
        rooms: Room-name suffixes to probe; defaults to the four rooms used in
            this notebook.
        extension: File extension (including the dot) of the image files.

    Returns:
        None. The result is delivered by mutating ``image_paths``.
    """
    # Only the ID column is needed, so iterate over it directly instead of
    # materialising every full row with iterrows().
    for raw_id in data_split['ID']:
        image_id = int(raw_id)
        for room in rooms:
            full_path = os.path.join(image_folder, f"{image_id}_{room}{extension}")
            if os.path.exists(full_path):
                image_paths.setdefault(image_id, []).append(full_path)


# Fill both dictionaries in place; populate_image_paths returns nothing, so the
# results are read from train_image_paths / test_image_paths afterwards.
populate_image_paths(train_data, train_img_folder, train_image_paths)
populate_image_paths(test_data, test_img_folder, test_image_paths)

This block sets the desired image size for CNN input and initializes dictionaries to store image paths for each ID in the training and test sets. This part was changed multiple times as we realised that having a large image size meant the program would be slower as the computational cost was higher.

In [None]:
# Generate images and labels for train set
def load_image_data(data_split, image_paths, target_size=None):
    """Load one resized, scaled image and its price label per house.

    Args:
        data_split: DataFrame with 'ID' and 'Price' columns.
        image_paths: Dict mapping int ID -> non-empty list of image file paths.
            Rows whose ID is missing from this dict are skipped. Only the first
            path of each list is loaded; any remaining paths are ignored.
        target_size: Size passed to ``Image.resize``. When None, the
            notebook-level ``img_size`` is used.

    Returns:
        Tuple ``(images, labels)`` of NumPy arrays. ``images`` holds the RGB
        pixel values divided by 255.0 and ``labels`` the matching 'Price'
        values, both in the row order of ``data_split``.
    """
    image_data, labels = [], []
    # Only two columns are needed, so walk them directly instead of building a
    # full row Series per house with iterrows().
    for raw_id, price in zip(data_split['ID'], data_split['Price']):
        image_id = int(raw_id)
        if image_id not in image_paths:
            continue
        # The context manager releases the file handle even if decoding fails.
        with Image.open(image_paths[image_id][0]) as raw_image:
            size = img_size if target_size is None else target_size
            resized = raw_image.convert('RGB').resize(size)
        image_data.append(np.array(resized) / 255.0)
        labels.append(price)
    return np.array(image_data), np.array(labels)


# Load train data: one image array and one 'Price' label for every training
# row whose ID has an entry in train_image_paths (other rows are skipped).
X_train_images, y_train = load_image_data(train_data, train_image_paths)


# Load only images for test data (no labels)
def load_image_data_no_labels(data_split, image_paths, target_size=None):
    """Load one resized, scaled image per house together with its ID.

    Args:
        data_split: DataFrame with an 'ID' column.
        image_paths: Dict mapping int ID -> non-empty list of image file paths.
            Rows whose ID is missing from this dict are skipped. Only the first
            path of each list is loaded; any remaining paths are ignored.
        target_size: Size passed to ``Image.resize``. When None, the
            notebook-level ``img_size`` is used.

    Returns:
        Tuple ``(images, ids)``: a NumPy array of RGB pixel values divided by
        255.0 and a plain list of the int IDs the images belong to, both in
        the row order of ``data_split``.
    """
    image_data, ids = [], []
    # Only the ID column is needed, so iterate over it directly instead of
    # building a full row Series per house with iterrows().
    for raw_id in data_split['ID']:
        image_id = int(raw_id)
        if image_id not in image_paths:
            continue
        # The context manager releases the file handle even if decoding fails.
        with Image.open(image_paths[image_id][0]) as raw_image:
            size = img_size if target_size is None else target_size
            resized = raw_image.convert('RGB').resize(size)
        image_data.append(np.array(resized) / 255.0)
        ids.append(image_id)
    return np.array(image_data), ids


# Load test data (only images and IDs). test_ids lists, in the same order as
# X_test_images, the IDs that had an entry in test_image_paths.
X_test_images, test_ids = load_image_data_no_labels(test_data, test_image_paths)


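The previous cell goes through each entry in the dataset, matches images with room names to IDs, and populates the dictionary with paths for each image if the file exists. The two functions in this cell then load the first image found for each ID, resize it, and normalize its pixel values. The training function stores the images along with their labels (prices) in separate lists and returns them as NumPy arrays; the test function returns the images together with their IDs instead of labels.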

In [None]:
# Augment the training images on the fly (random horizontal/vertical flips and
# rotations within rotation_range=20) and serve them in batches of 32.
train_datagen = ImageDataGenerator(
    rotation_range=20,
    horizontal_flip=True,
    vertical_flip=True,
)
train_generator = train_datagen.flow(x=X_train_images, y=y_train, batch_size=32)


TensorFlow's ImageDataGenerator helps us create augmented versions of our training images by applying transformations like flipping and rotating, making our model more robust. In our project, we used it to generate batches of these varied images during training, which improved our model's ability to generalize and reduced the risk of overfitting.

In [None]:
# Define a custom CNN model architecture suitable for 32x32 input
class SimpleCNN(tf.keras.Model):
    """Two conv/max-pool stages followed by a dense head with one linear output.

    Args:
        weight_decay: L2 regularization factor applied to the kernels of both
            convolutions and of the hidden dense layer.
    """

    def __init__(self, weight_decay=0.01):
        super().__init__()

        def l2_penalty():
            # Each regularized layer gets its own regularizer object.
            return regularizers.l2(weight_decay)

        conv_options = dict(activation='relu', padding='same')

        # Feature extractor: 32 then 64 filters, each followed by 2x2 max pooling.
        self.conv1 = layers.Conv2D(32, (3, 3), kernel_regularizer=l2_penalty(), **conv_options)
        self.pool1 = layers.MaxPooling2D((2, 2))
        self.conv2 = layers.Conv2D(64, (3, 3), kernel_regularizer=l2_penalty(), **conv_options)
        self.pool2 = layers.MaxPooling2D((2, 2))

        # Regression head: flatten -> 128 ReLU units -> dropout -> one value.
        self.flatten = layers.Flatten()
        self.dense1 = layers.Dense(128, activation='relu', kernel_regularizer=l2_penalty())
        self.dropout = layers.Dropout(0.5)
        self.output_layer = layers.Dense(1)  # no activation given, so the output is linear

    def call(self, inputs):
        """Run ``inputs`` through every layer in order and return the final output."""
        stages = (
            self.conv1, self.pool1,
            self.conv2, self.pool2,
            self.flatten, self.dense1, self.dropout,
            self.output_layer,
        )
        x = inputs
        for stage in stages:
            x = stage(x)
        return x


This block defines a CNN model class with convolutional, pooling, and dense layers, designed to handle images of size 32x32. We also tried to handle images of size 64x64 but it was too large. Initially, we tried using many layers in our CNN, thinking it would improve accuracy, but that actually hurt our model's performance and led to overfitting. So, we simplified the architecture by reducing the number of layers and added a dropout layer to help regularize it. We chose the ReLU activation function because it outperformed leaky ReLU in our tests; ReLU helps the model learn faster by allowing only positive values to pass through while turning negative inputs into zero, which helps prevent issues like vanishing gradients.

In [None]:
# Custom fit function
def custom_fit(model, train_generator, epochs):
    """Compile ``model`` and train it on ``train_generator``.

    The model is compiled with the Adam optimizer, mean-squared-error loss and
    mean absolute error as the reported metric, then fitted for ``epochs``
    epochs. Nothing is returned.
    """
    compile_options = {
        'optimizer': 'adam',
        'loss': 'mean_squared_error',
        'metrics': ['mae'],
    }
    model.compile(**compile_options)
    model.fit(train_generator, epochs=epochs)


# Build and fit the CNN model: SimpleCNN() uses its default weight_decay of
# 0.01, and custom_fit compiles it (Adam, MSE loss, MAE metric) before
# training on the augmented batches for 50 epochs.
cnn_model = SimpleCNN()
custom_fit(cnn_model, train_generator, epochs=50)

Epoch 1/50


  self._warn_if_super_not_called()


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 57ms/step - loss: 2.6846 - mae: 0.2691
Epoch 2/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 55ms/step - loss: 1.1874 - mae: 0.0613
Epoch 3/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 87ms/step - loss: 0.6512 - mae: 0.0557
Epoch 4/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 99ms/step - loss: 0.4466 - mae: 0.0526
Epoch 5/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 56ms/step - loss: 0.3492 - mae: 0.0586
Epoch 6/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 55ms/step - loss: 0.2811 - mae: 0.0554
Epoch 7/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 57ms/step - loss: 0.2353 - mae: 0.0589
Epoch 8/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 59ms/step - loss: 0.1988 - mae: 0.0519
Epoch 9/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 58ms/step - loss: 0.1724

Here, a custom fit function is defined, which compiles and trains the CNN model using mean squared error as the loss function. We wanted to use root mean squared error, but it did not really matter at that stage. We initially experimented with the Adam optimizer due to its adaptive learning rate capabilities, which often lead to faster convergence in training neural networks. Afterward, we transitioned to AdamW, an enhancement of Adam that incorporates weight decay, aiming to improve generalization. Ultimately, we found that reverting to the Adam optimizer yielded the best results, likely due to its robust performance across various training scenarios and its ability to handle noisy gradients effectively. We previously used a CosineAnnealingScheduler to adjust the learning rate at the beginning of each epoch, but combining it with Adam made the program overcomplicated.

In [None]:
# Extract features from the training images using the CNN.
# predict() returns the output of SimpleCNN's last layer, Dense(1), so each
# "feature" here is the CNN's single output value per image rather than an
# intermediate activation.
X_train_features = cnn_model.predict(X_train_images)


# Extract features from the test images using the CNN (same single-value output)
X_test_features = cnn_model.predict(X_test_images)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step


The CNN outputs for the training and test images are then combined with the other tabular features to create the final training and test feature sets for XGBoost.

In [None]:
# Assemble the XGBoost inputs: the CNN output placed next to the normalized
# tabular columns. They are built here because no earlier cell defines
# X_train_final / X_test_final, and a fresh kernel would otherwise stop with a
# NameError at the fit call below.
tabular_columns = ['Bedrooms', 'Bathrooms', 'Area', 'ZipCode']

# The image loaders skip houses without an image, so keep only the rows whose
# ID has one. A boolean mask preserves row order, which is the order the image
# arrays, y_train and test_ids were built in.
train_rows = train_data[train_data['ID'].isin(list(train_image_paths))]
test_rows = test_data[test_data['ID'].isin(list(test_image_paths))]
assert len(train_rows) == len(X_train_features) == len(y_train), \
    "training rows, CNN outputs and labels are misaligned"
assert test_rows['ID'].tolist() == list(test_ids), \
    "test rows and test_ids are misaligned"

# column_stack accepts the (n, 1) CNN output as-is and appends the tabular columns.
X_train_final = np.column_stack([X_train_features, train_rows[tabular_columns].to_numpy()])
X_test_final = np.column_stack([X_test_features, test_rows[tabular_columns].to_numpy()])

# Train XGBoost model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100)
xgb_model.fit(X_train_final, y_train)
# Make predictions on the test set using XGBoost (still on the normalized price scale)
y_pred_normalized = xgb_model.predict(X_test_final)

This block initializes and trains the XGBoost model, then generates predictions for the test set in normalized form. XGBoost was a strong choice for improving model accuracy because it builds on gradient boosting with techniques that help prevent overfitting, like reducing unnecessary complexity in the model. Before XGBoost, we tried using EfficientNet to boost the model's accuracy, but it ended up overfitting, meaning it performed well on training data but poorly on new data.

In [None]:
# Undo the price normalization by scaling back with the training maximum
y_pred_actual = y_pred_normalized * max_price

# Start from the submission template
submission_template = pd.read_csv(submission_template_path)

# Write each prediction into the row(s) of the template carrying the same ID;
# test_ids and y_pred_actual are walked in lockstep.
for house_id, predicted_price in zip(test_ids, y_pred_actual):
    matches_house = submission_template['ID'] == house_id
    submission_template.loc[matches_house, 'Predicted_Price'] = predicted_price

# NOTE(review): this writes back to the same path the template was read from,
# so the template file itself is overwritten.
submission_template.to_csv(submission_template_path, index=False)

print("Predictions saved to submission_template.csv.")


Predictions saved to submission_template.csv.


This block converts normalized predictions back to actual prices by multiplying each prediction by max_price, yielding y_pred_actual. It then updates submission_template.csv with these actual price predictions for each test ID, saving the final file for submission and confirming completion.