<a href="https://colab.research.google.com/github/TATENDA-MAX/HousePredictionModels/blob/main/HousePredictionModels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras import layers, models, applications, Input, optimizers
from tensorflow.keras.preprocessing.image import load_img, img_to_array
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import os

In [50]:
# ==========================================
# 1. CONFIGURATION
# ==========================================
IMG_HEIGHT = 224
IMG_WIDTH = 224
BATCH_SIZE = 32
EPOCHS = 10  # Adjust based on your time constraints
IMAGE_DIR = '/content/drive/MyDrive/images/'  # <--- UPDATE THIS PATH
CSV_FILE = '/content/drive/MyDrive/property_listings_complete.csv'

In [51]:
df = pd.read_csv(r"/content/drive/MyDrive/property_listings_complete.csv")

# 1. Check for datatypes and total entries (look for nonr null counts)
print(df.info())

# 2. Check for missing values
print(df.isnull().sum())

# 3. List all column names to plan your inputs (features) vs outputs (labels)
print(df.columns)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1613 entries, 0 to 1612
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   scraped_page     1613 non-null   int64  
 1   title            1612 non-null   object 
 2   detail_url       1613 non-null   object 
 3   currency         1610 non-null   object 
 4   price            1610 non-null   float64
 5   building_area    1184 non-null   float64
 6   building_unit    1613 non-null   object 
 7   land_area        1536 non-null   float64
 8   land_unit        1613 non-null   object 
 9   property_type    852 non-null    object 
 10  bedrooms         993 non-null    float64
 11  bathrooms        192 non-null    float64
 12  location         1369 non-null   object 
 13  image_count      1613 non-null   int64  
 14  image_filenames  1594 non-null   object 
dtypes: float64(5), int64(2), object(8)
memory usage: 189.2+ KB
None
scraped_page          0
title             

In [52]:
#1 CLEAN TARGET: Drop rows where Price or Currency is missing
df = df.dropna(subset=['price', 'currency', 'image_filenames'])

# DROP NOISY COLUMNS: Bathrooms has too much missing data
cols_to_drop = ['bathrooms', 'scraped_page', 'detail_url']
df = df.drop(columns =  cols_to_drop)

# 3. FILL missing property_type with "Unknown"
df['property_type'] = df['property_type'].fillna("Unknown")

# 4. Fill missing areas with 0 for now (assuming if it's blank, it's not listed)
df['building_area'] = df['building_area'].fillna(0)
df['land_area'] = df['land_area'].fillna(0)

# Verify the cleaning
print("Missing values after cleaning:")
print(df.isnull().sum())
print(f"\nRemaining data shape: {df.shape}")

Missing values after cleaning:
title                0
currency             0
price                0
building_area        0
building_unit        0
land_area            0
land_unit            0
property_type        0
bedrooms           610
location           241
image_count          0
image_filenames      0
dtype: int64

Remaining data shape: (1591, 12)


In [53]:
both_missing_mask = df['bedrooms'].isna() & df['location'].isna()

df = df[~both_missing_mask]
print(df.shape)
print("rows dropped successfully")

(1454, 12)
rows dropped successfully


In [54]:
print(df['currency'].value_counts())
print(df['building_unit'].value_counts())
print(df['land_unit'].value_counts())

currency
USD    1454
Name: count, dtype: int64
building_unit
m²    1454
Name: count, dtype: int64
land_unit
m²    1454
Name: count, dtype: int64


In [55]:
# Safety check to confirm single value columns
columns_to_check = ['currency', 'building_unit', 'land_unit']
df = df.drop(columns = columns_to_check)

In [56]:
remaining_columns = df.columns

print(f"The remaining columns are: {len(remaining_columns)}")

# 1. Check for datatypes and total entries (look for nonr null counts)
print(df.info())

# 2. Check for missing values
print(df.isnull().sum())

# 3. List all column names to plan your inputs (features) vs outputs (labels)
print(df.columns)

The remaining columns are: 9
<class 'pandas.core.frame.DataFrame'>
Index: 1454 entries, 0 to 1611
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            1454 non-null   object 
 1   price            1454 non-null   float64
 2   building_area    1454 non-null   float64
 3   land_area        1454 non-null   float64
 4   property_type    1454 non-null   object 
 5   bedrooms         981 non-null    float64
 6   location         1350 non-null   object 
 7   image_count      1454 non-null   int64  
 8   image_filenames  1454 non-null   object 
dtypes: float64(4), int64(1), object(4)
memory usage: 113.6+ KB
None
title                0
price                0
building_area        0
land_area            0
property_type        0
bedrooms           473
location           104
image_count          0
image_filenames      0
dtype: int64
Index(['title', 'price', 'building_area', 'land_area', 'property_type',
   

In [57]:
df = df.dropna(subset = ['location'])
median_value = df['bedrooms'].median()
df['bedrooms'] = df['bedrooms'].fillna(median_value)

In [58]:
print(df.isna().sum())
print(df.shape)

df['bedrooms'] = df['bedrooms'].astype(int)

print(df.info())

title              0
price              0
building_area      0
land_area          0
property_type      0
bedrooms           0
location           0
image_count        0
image_filenames    0
dtype: int64
(1350, 9)
<class 'pandas.core.frame.DataFrame'>
Index: 1350 entries, 0 to 1611
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            1350 non-null   object 
 1   price            1350 non-null   float64
 2   building_area    1350 non-null   float64
 3   land_area        1350 non-null   float64
 4   property_type    1350 non-null   object 
 5   bedrooms         1350 non-null   int64  
 6   location         1350 non-null   object 
 7   image_count      1350 non-null   int64  
 8   image_filenames  1350 non-null   object 
dtypes: float64(3), int64(2), object(4)
memory usage: 105.5+ KB
None


In [59]:
# Checking the number of unique values in the categorical variables

print("Unique Locations:", df['location'].nunique())
print("Unique Property Types:", df['property_type'].nunique())

Unique Locations: 36
Unique Property Types: 58


In [60]:
# Dimensionality Reduction by grouping the rare categories into a generic "Other" Label

# Function to group rare categories
print(df['location'].value_counts())

location
Harare            758
Bulawayo          149
Borrowdale         80
Greendale          52
Marlborough        42
Mount Pleasant     41
Waterfalls         29
Eastlea            19
Avondale           18
Hatfield           17
Mt Pleasant        16
Msasa              16
Greystone Park     13
Newlands           13
Hillside           12
Belvedere          11
Highlands          11
Milton Park         7
The Grange          6
Alexandra Park      6
Warren Park         5
Glen View           4
Kuwadzana           4
harare              3
Mufakose            2
Nkulumane           2
low density         2
EASTLEA             2
HARARE              2
High Density        2
Khumalo             1
GREYSTONE PARK      1
BORROWDALE          1
waterfalls          1
greendale           1
High density        1
Name: count, dtype: int64


In [61]:
print(df['property_type'].value_counts())

property_type
Unknown                589
4 Bedroom House        180
3 Bedroom House        142
5 Bedroom House        104
3 Bedroom Flat          52
6 Bedroom House         50
3 Bedroom Townhouse     48
4 Bedroom Townhouse     36
2 Bedroom Flat          36
7 Bedroom House         18
8 Bedroom House          9
2 Bedroom House          7
4 Bedroom Flat           6
5 Bedroom Townhouse      4
12 Bedroom House         4
6 Bedroom Townhouse      4
3 bed house              4
3 bedroom townhouse      3
10 Bedroom House         3
11 Bedroom House         3
2 bed flat               3
2 bed cottage            2
9 Bedroom House          2
3 bedroom duplex         2
2bed duplex              2
4 bed house              2
3 Bed House              2
1 bed cottage            2
7 Bedroom Townhouse      2
3 bedroom house          1
6 bedroom house          1
4 bed duplex             1
2 Bed Apartment          1
3 Bed Apartment          1
5 bedroom house          1
2 Bed Cottage            1
8 Bedroom Town

In [62]:
# Function to group rare categories
def group_rare_categories(df, column, threshold=20):
    # 1. Count the frequency of each category
    counts = df[column].value_counts()

    # 2. Find the valid categories (those that appear more than threshold times)
    valid_categories =  counts[counts > threshold].index

    # Apply the mask: If it's not in valid_categories, name it Other
    df[column] = df[column].apply(lambda x: x if x in valid_categories else 'Other')
    return df

# APPLY the strategy

df = group_rare_categories(df, 'property_type', threshold = 5)
df = group_rare_categories(df, 'location', threshold = 18)

# --- VERIFY THE RESULTS ---
print("New Unique Locations:", df['location'].nunique())
print("New Unique Property Types:", df['property_type'].nunique())

print("\nTop Property Types now:")
print(df['property_type'].value_counts())

New Unique Locations: 9
New Unique Property Types: 14

Top Property Types now:
property_type
Unknown                589
4 Bedroom House        180
3 Bedroom House        142
5 Bedroom House        104
Other                   73
3 Bedroom Flat          52
6 Bedroom House         50
3 Bedroom Townhouse     48
2 Bedroom Flat          36
4 Bedroom Townhouse     36
7 Bedroom House         18
8 Bedroom House          9
2 Bedroom House          7
4 Bedroom Flat           6
Name: count, dtype: int64


In [63]:
# encoding: CONVERT 'LOCATION' AND 'PROPERTY_TYPE' TO NUMBERS
df = pd.get_dummies(df, columns = ['location', 'property_type'], drop_first=True)
df.head(5)

Unnamed: 0,title,price,building_area,land_area,bedrooms,image_count,image_filenames,location_Bulawayo,location_Eastlea,location_Greendale,...,property_type_3 Bedroom Townhouse,property_type_4 Bedroom Flat,property_type_4 Bedroom House,property_type_4 Bedroom Townhouse,property_type_5 Bedroom House,property_type_6 Bedroom House,property_type_7 Bedroom House,property_type_8 Bedroom House,property_type_Other,property_type_Unknown
0,Developers Dream,190000.0,180.0,1352.0,3,1,df03d95d0b_0.webp,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2 Bedroom Flat In Prime Avondale Location,95000.0,120.0,0.0,2,1,41c6aa94bc_0.webp,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,The Strand office land in Borrowdale.,875000.0,0.0,8000.0,4,1,ac7160491a_0.webp,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,Stands for Sale,60000.0,442.0,442.0,4,1,1d7f22fc05_0.webp,False,False,False,...,False,False,False,False,False,False,False,False,False,True
6,Finish It To Your Taste....,180000.0,1000.0,1736.0,6,1,9255ebae71_0.webp,False,False,False,...,False,False,False,False,False,True,False,False,False,False


In [64]:
# ==========================================
# 2. DATA PREPARATION
# ==========================================
def load_and_preprocess_data(csv_path):
    df = pd.read_csv(csv_path)

    # Filter for valid target and images
    df = df[df['price'].notna()]
    df = df[df['image_filenames'].notna()]
    df = df[df['currency'] == 'USD'] # Ensure consistent currency

    # Select Features
    numeric_features = ['bedrooms', 'bathrooms', 'land_area', 'building_area']
    categorical_features = ['location', 'property_type']

    # Preprocessing Pipelines
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Fit preprocessing
    X_processed = preprocessor.fit_transform(df)
    y = df['price'].values
    filenames = df['image_filenames'].values

    return X_processed, y, filenames, preprocessor

In [65]:
# ==========================================
# 3. DATA GENERATOR (Dual Input)
# ==========================================
class HousePriceGenerator(tf.keras.utils.Sequence):
    def __init__(self, filenames, tabular_data, prices, batch_size, img_dir):
        super().__init__() # <--- FIX 1: Call super init
        self.filenames = filenames
        self.tabular_data = tabular_data
        self.prices = prices
        self.batch_size = batch_size
        self.img_dir = img_dir
        self.indexes = np.arange(len(self.filenames))

    def __len__(self):
        return int(np.floor(len(self.filenames) / self.batch_size))

    def __getitem__(self, index):
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        batch_imgs = []
        batch_tab = []
        batch_y = []

        for i in indexes:
            # Handle cases where filename might be "img1.jpg, img2.jpg"
            img_name = str(self.filenames[i]).split(',')[0].strip()
            img_path = os.path.join(self.img_dir, img_name)

            try:
                # Load and Resize
                img = load_img(img_path, target_size=(IMG_HEIGHT, IMG_WIDTH))
                img = img_to_array(img) / 255.0
            except Exception:
                # Fallback: Black image if file missing
                img = np.zeros((IMG_HEIGHT, IMG_WIDTH, 3))

            batch_imgs.append(img)
            batch_tab.append(self.tabular_data[i])
            batch_y.append(self.prices[i])

        # <--- FIX 2: Return a TUPLE (imgs, tab) instead of LIST [imgs, tab]
        return (np.array(batch_imgs), np.array(batch_tab)), np.array(batch_y)

In [66]:
# ==========================================
# 4. CUSTOM BLOCKS & LAYERS
# ==========================================
def squeeze_excite_block(input_tensor, ratio=16):
    init = input_tensor
    filters = init.shape[-1]
    se_shape = (1, 1, filters)

    se = layers.GlobalAveragePooling2D()(init)
    se = layers.Reshape(se_shape)(se)
    se = layers.Dense(filters // ratio, activation='relu')(se)
    se = layers.Dense(filters, activation='sigmoid')(se)

    return layers.Multiply()([init, se])

def highway_layer(x, units=None, activation='relu'):
    if units is None: units = x.shape[-1]
    h = layers.Dense(units, activation=activation)(x)
    t = layers.Dense(units, activation='sigmoid')(x)
    return layers.Add()([layers.Multiply()([h, t]), layers.Multiply()([x, 1 - t])])

In [67]:
# ==========================================
# 5. MODEL BUILDER FACTORY
# ==========================================
def build_multi_modal_model(model_name, tab_shape):
    # --- IMAGE INPUT ---
    img_input = Input(shape=(IMG_HEIGHT, IMG_WIDTH, 3))

    # --- CNN BACKBONE SELECTION ---
    if model_name == 'AlexNet':
        x = layers.Conv2D(96, 11, strides=4, activation='relu')(img_input)
        x = layers.MaxPooling2D(3, strides=2)(x)
        x = layers.Conv2D(256, 5, padding='same', activation='relu')(x)
        x = layers.MaxPooling2D(3, strides=2)(x)
        x = layers.Conv2D(384, 3, padding='same', activation='relu')(x)
        x = layers.Flatten()(x)

    elif model_name == 'NiN':
        x = layers.Conv2D(192, 5, padding='same', activation='relu')(img_input)
        x = layers.Conv2D(160, 1, activation='relu')(x)
        x = layers.Conv2D(96, 1, activation='relu')(x)
        x = layers.MaxPooling2D(3, strides=2, padding='same')(x)
        x = layers.GlobalAveragePooling2D()(x)

    elif model_name == 'VGG':
        base = applications.VGG16(include_top=False, input_tensor=img_input)
        x = layers.GlobalAveragePooling2D()(base.output)

    elif model_name == 'ResNet':
        base = applications.ResNet50(include_top=False, input_tensor=img_input)
        x = layers.GlobalAveragePooling2D()(base.output)

    elif model_name == 'Inception-V3':
        base = applications.InceptionV3(include_top=False, input_tensor=img_input)
        x = layers.GlobalAveragePooling2D()(base.output)

    elif model_name == 'Inception-ResNet-v2':
        base = applications.InceptionResNetV2(include_top=False, input_tensor=img_input)
        x = layers.GlobalAveragePooling2D()(base.output)

    elif model_name == 'Xception':
        base = applications.Xception(include_top=False, input_tensor=img_input)
        x = layers.GlobalAveragePooling2D()(base.output)

    elif model_name == 'DenseNet':
        base = applications.DenseNet121(include_top=False, input_tensor=img_input)
        x = layers.GlobalAveragePooling2D()(base.output)

    elif model_name == 'MobileNet-V2':
        base = applications.MobileNetV2(include_top=False, input_tensor=img_input)
        x = layers.GlobalAveragePooling2D()(base.output)

    elif model_name == 'Squeeze-and-Excitation':
        # Custom simple SE-Net
        x = layers.Conv2D(32, 3, activation='relu')(img_input)
        x = squeeze_excite_block(x)
        x = layers.Conv2D(64, 3, activation='relu')(x)
        x = squeeze_excite_block(x)
        x = layers.GlobalAveragePooling2D()(x)

    elif model_name == 'Highway':
        x = layers.Flatten()(img_input)
        x = layers.Dense(128)(x)
        x = highway_layer(x, 128)

    elif model_name == 'WideResNet':
        # Simplified WideResNet Block
        x = layers.Conv2D(64, 3, padding='same', activation='relu')(img_input)
        x = layers.Conv2D(64, 3, padding='same', activation='relu')(x) # Wide layers
        x = layers.GlobalAveragePooling2D()(x)

    elif model_name == 'CapsuleNet':
        # Simplified Representation (Vector features)
        x = layers.Conv2D(256, 9, activation='relu')(img_input)
        x = layers.Conv2D(32 * 8, 9, strides=2, padding='valid', activation='relu')(x)
        x = layers.Reshape((-1, 8))(x) # "Capsules"
        x = layers.GlobalAveragePooling1D()(x)

    # ... [Add other placeholders for complex research architectures like FractalNet] ...
    else:
        # Default/Fallback to a standard CNN for undefined names
        x = layers.Conv2D(32, 3, activation='relu')(img_input)
        x = layers.MaxPooling2D()(x)
        x = layers.Conv2D(64, 3, activation='relu')(x)
        x = layers.GlobalAveragePooling2D()(x)

    # --- TABULAR INPUT ---
    tab_input = Input(shape=(tab_shape,))
    y = layers.Dense(16, activation="relu")(tab_input)

    # --- FUSION ---
    combined = layers.Concatenate()([x, y])
    z = layers.Dense(128, activation="relu")(combined)
    z = layers.Dense(64, activation="relu")(z)
    final_output = layers.Dense(1, activation="linear")(z) # Linear for Price

    model = models.Model(inputs=[img_input, tab_input], outputs=final_output)
    return model

In [None]:
# ==========================================
# 6. MAIN EXECUTION
# ==========================================
def run_comparison():
    # Load Data
    print("Loading Data...")
    X_tab, y, filenames, _ = load_and_preprocess_data(CSV_FILE)

    # Split
    X_train_tab, X_val_tab, y_train, y_val, f_train, f_val = train_test_split(
        X_tab, y, filenames, test_size=0.2, random_state=42
    )

    # Generators
    train_gen = HousePriceGenerator(f_train, X_train_tab, y_train, BATCH_SIZE, IMAGE_DIR)
    val_gen = HousePriceGenerator(f_val, X_val_tab, y_val, BATCH_SIZE, IMAGE_DIR)

    # Model List
    model_names = [
        'AlexNet', 'NiN', 'ZfNet', 'VGG', 'GoogleNet', 'Inception-V3',
        'Highway', 'Inception-V4', 'ResNet', 'Inception-ResNet-v2',
        'FractalNet', 'WideResNet', 'Xception', 'Residual Attention Neural Network',
        'Squeeze-and-Excitation', 'DenseNet', 'Competitive Squeeze and Excitation',
        'MobileNet-V2', 'CapsuleNet', 'HRNetV2', 'ResNeXt'
    ]

    results = {}

    for name in model_names:
        print(f"\nTraining Model: {name}")
        try:
            # Note: For complex names not in `if/else` above, it defaults to Basic CNN
            # You can extend the `build_multi_modal_model` function for all 21 specific implementations
            model = build_multi_modal_model(name, X_tab.shape[1])

            model.compile(optimizer='adam', loss='mean_absolute_error')

            history = model.fit(
                train_gen,
                validation_data=val_gen,
                epochs=EPOCHS,
                verbose=1
            )

            best_loss = min(history.history['val_loss'])
            results[name] = best_loss
            print(f"Model {name} Best Validation MAE: {best_loss}")

        except Exception as e:
            print(f"Failed to train {name}: {str(e)}")
            results[name] = float('inf')

    # Compare
    print("\nFINAL RESULTS (Best MAE - Lower is Better):")
    for name, score in sorted(results.items(), key=lambda item: item[1]):
        print(f"{name}: {score}")

if __name__ == "__main__":
    # Ensure you set IMAGE_DIR at the top before running
    run_comparison()

Loading Data...

Training Model: AlexNet
Epoch 1/10
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m188s[0m 5s/step - loss: 3005445376.0000 - val_loss: 522772.2812
Epoch 2/10
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 3s/step - loss: 3615458816.0000 - val_loss: 485118.1250
Epoch 3/10
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 3s/step - loss: 47432769536.0000 - val_loss: 472423.6250
Epoch 4/10
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 3s/step - loss: 16785827840.0000 - val_loss: 481902.1562
Epoch 5/10
[1m32/39[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m21s[0m 3s/step - loss: 5568451072.0000