# Project Description

The project aims to predict housing listing prices using data gathered from a housing sales website, which includes details about different house features. 

The process begins with feature engineering to extract both numerical and categorical data. Next, text data is embedded using an LLM model from Hugging Face. Finally, the datasets are merged, and XGBoost and Random Forest models are trained to predict listing prices. The XGBoost model demonstrates superior performance compared to the Random Forest model.

Importing Libraries

In [6]:
import numpy as np
import pandas as pd
import pickle

import torch 
import torch.autograd as autograd 
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

# Data Cleaning and Feature Engineering

In [7]:
with open('data/train.pickle', 'rb') as f:
    dataset = pickle.load(f)
with open("data/test.pickle","rb") as f:
    test_data = pickle.load(f)

In [8]:
dataset[0]

{'price': '320.000 €',
 'title': 'Piso Tallers. Piso con 2 habitaciones con ascensor',
 'loc_string': 'Barcelona - Sant Antoni',
 'loc': None,
 'features': ['85 m2', '2 hab.', '1 baño', '3.647 €/m2'],
 'type': 'FLAT',
 'subtype': 'FLAT',
 'selltype': 'SECOND_HAND',
 'desc': 'Piso en última planta a reformar en calle Tallers junto a plaza Universitat.\n\nLa propiedad ofrece muchas posibilidades para personalizarla según sus necesidades, ya que todas las paredes son tabiques.\n\nLa distribución actual cuenta con salón comedor amplio, cocina, dos habitaciones dobles exteriores, dos habitaciones interiores, baño y despensa/trastero. Balcones a ambos lados de la vivienda, uno a patio de manzana y los otros dos a carrer Tallers. Suelos originales.\n\nFinca del 1882 con ascensor.\n\nCalle peatonal con encanto en el centro de Barcelona, rodeada de todo tipo de comercios y servicios. Buenas comunicaciones mediante transportes publicos, metro, autobuses y FFCC.'}

In [11]:
# look at locations
l = [item.get('loc_string') for item in dataset]
type = [item.get('type') for item in dataset]
selltype = [item.get('selltype') for item in dataset]
data_dict = {'location': l, 'type': type, 'selltype':selltype}
df = pd.DataFrame(data_dict)
df['location'].unique()

array(['Barcelona - Sant Antoni', 'Barcelona - Dreta de l´Eixample',
       'Barcelona - Sagrada Família', 'Barcelona - Fort Pienc',
       'Barcelona - L´Antiga Esquerra de l´Eixample',
       'Barcelona - La Nova Esquerra de l´Eixample',
       'Barcelona - La Nova Esquerra de l´Eixample\nVer mapa',
       'Barcelona - Dreta de l´Eixample\nVer mapa',
       'Barcelona - Poblenou',
       'Barcelona - El Parc i la Llacuna del Poblenou',
       'Barcelona - La Vila Olímpica del Poblenou',
       'Barcelona - Poblenou\nVer mapa',
       'Barcelona - El Camp de l´Arpa del Clot',
       'Barcelona - Besòs - Maresme',
       'Barcelona - Diagonal Mar i el Front Marítim del Poblenou',
       'Barcelona - Provençals del Poblenou', 'Barcelona - El Clot',
       'Barcelona - Navas'], dtype=object)

In [12]:
df['type'].unique(), df['selltype'].unique()
# given that "selltype" only has one value, there's no necessity to incorporate it as a feature.

(array(['FLAT', 'STUDIO', 'GROUND_FLOOR', 'PENTHOUSE', 'APARTMENT', 'LOFT',
        'DUPLEX'], dtype=object),
 array(['SECOND_HAND'], dtype=object))

In [5]:
def process_data(data, include_price=True):
    location = []
    price = []
    size = []  # Size in m2
    bedrooms = []  # Number of bedrooms
    bathrooms = []  # Number of bathrooms
    types = []

    # Iterate through each entry in the data
    for diction in data:
        location.append(diction['loc_string'].replace('Barcelona - ','').replace('\nVer mapa', ''))
        if include_price:
            price.append(float(diction['price'].replace('€', '').replace('.', ''))/1000)
        size.append(float(diction['features'][0].replace('m2', '')))

        # Initialize default values for features
        num_bedrooms = 0
        num_bathrooms = 0

        # Extract features
        for feature in diction['features'][1:]:
            if 'hab.' in feature:
                num_bedrooms = int(feature.replace('hab.', ''))
            elif 'baño' in feature:
                num_bathrooms = int(feature.split()[0])

        # Append processed features
        bedrooms.append(num_bedrooms)
        bathrooms.append(num_bathrooms)
        types.append(diction['type'])

    # Create a DataFrame from the lists
    data_dict = {
        'Location': location,
        'Size': size,
        'Bedrooms': bedrooms,
        'Bathrooms': bathrooms,
        'Type': types,
    }
    if include_price:
        data_dict['Price'] = price

    df = pd.DataFrame(data_dict)

    return df

In [14]:
train_clean = process_data(dataset)
test_clean = process_data(test_data, include_price=False)

In [15]:
train_clean.head()

Unnamed: 0,Location,Size,Bedrooms,Bathrooms,Type,Price
0,Sant Antoni,85.0,2,1,FLAT,320.0
1,Dreta de l´Eixample,65.0,2,1,FLAT,335.0
2,Dreta de l´Eixample,77.0,2,1,FLAT,330.0
3,Sant Antoni,96.0,3,2,FLAT,435.0
4,Sagrada Família,84.0,2,1,FLAT,410.0


# One-Hot Encoding of Categorical Features

In [16]:
train_encoded = pd.get_dummies(train_clean, columns=['Location', 'Type'], drop_first=True)
train_encoded.head()

Unnamed: 0,Size,Bedrooms,Bathrooms,Price,Location_Diagonal Mar i el Front Marítim del Poblenou,Location_Dreta de l´Eixample,Location_El Camp de l´Arpa del Clot,Location_El Clot,Location_El Parc i la Llacuna del Poblenou,Location_Fort Pienc,...,Location_Poblenou,Location_Provençals del Poblenou,Location_Sagrada Família,Location_Sant Antoni,Type_DUPLEX,Type_FLAT,Type_GROUND_FLOOR,Type_LOFT,Type_PENTHOUSE,Type_STUDIO
0,85.0,2,1,320.0,False,False,False,False,False,False,...,False,False,False,True,False,True,False,False,False,False
1,65.0,2,1,335.0,False,True,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
2,77.0,2,1,330.0,False,True,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
3,96.0,3,2,435.0,False,False,False,False,False,False,...,False,False,False,True,False,True,False,False,False,False
4,84.0,2,1,410.0,False,False,False,False,False,False,...,False,False,True,False,False,True,False,False,False,False


# Extract Text Features: Title and Description

In [17]:
def get_title_desc(data):
    title = []
    desc = [] # description
    for diction in data:
        desc.append(diction['desc'])
        title.append(diction.get('title', '')) 
    return title, desc

In [18]:
title, desc = get_title_desc(dataset)
title_test, desc_test = get_title_desc(test_data)

In [19]:
title[0], desc[0]

('Piso Tallers. Piso con 2 habitaciones con ascensor',
 'Piso en última planta a reformar en calle Tallers junto a plaza Universitat.\n\nLa propiedad ofrece muchas posibilidades para personalizarla según sus necesidades, ya que todas las paredes son tabiques.\n\nLa distribución actual cuenta con salón comedor amplio, cocina, dos habitaciones dobles exteriores, dos habitaciones interiores, baño y despensa/trastero. Balcones a ambos lados de la vivienda, uno a patio de manzana y los otros dos a carrer Tallers. Suelos originales.\n\nFinca del 1882 con ascensor.\n\nCalle peatonal con encanto en el centro de Barcelona, rodeada de todo tipo de comercios y servicios. Buenas comunicaciones mediante transportes publicos, metro, autobuses y FFCC.')

# Embedding Title and Description with Flan-T5

In [20]:
from transformers import T5Tokenizer, T5EncoderModel

In [21]:
model_name = "google/flan-t5-small"
model = T5EncoderModel.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [22]:
# extracting a fixed-size vector representation of the input text
def get_sentence_encoding(text, model=model):
    input_ids = tokenizer(text, truncation=True, return_tensors="pt").input_ids
    with torch.no_grad():
        outputs = model(input_ids, return_dict=True)
    last_hidden_states = outputs.last_hidden_state
    sentence_embedding = torch.mean(last_hidden_states, dim=1)
    return sentence_embedding[0].numpy()

In [23]:
title_embedded = np.vstack([get_sentence_encoding(x.strip()) for x in title])
title_embedded.shape

(866, 512)

In [24]:
desc_embedded = np.vstack([get_sentence_encoding(x.strip()) for x in desc])
desc_embedded.shape

(866, 512)

In [25]:
# convert to DataFrame
df_desc_embedded = pd.DataFrame(desc_embedded)
num_cols = title_embedded.shape[1]  
col_names = [f"t{i}" for i in range(1, num_cols + 1)]
df_title_embedded = pd.DataFrame(title_embedded, columns=col_names)

# Combine Featrues

In [26]:
train = pd.concat([df_desc_embedded, train_encoded], axis=1)
train.shape

(866, 536)

# XGBoost Model

In [27]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

In [28]:
y = train['Price']
X = train.drop(['Price'], axis=1)
X.shape

(866, 535)

In [29]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X.shape, X_train.shape

((866, 535), (692, 535))

In [30]:
X_train.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Location_Poblenou,Location_Provençals del Poblenou,Location_Sagrada Família,Location_Sant Antoni,Type_DUPLEX,Type_FLAT,Type_GROUND_FLOOR,Type_LOFT,Type_PENTHOUSE,Type_STUDIO
818,-0.071154,-0.030158,0.019559,-0.010458,0.06655,0.082316,0.001007,-0.132238,0.028664,-0.06548,...,False,False,False,False,False,True,False,False,False,False
558,-0.036223,-0.040941,0.029683,-0.019874,0.040049,0.036451,0.001065,-0.085019,0.009831,-0.058453,...,False,False,False,False,False,True,False,False,False,False


# Find the Best Parameters 

In [31]:
from sklearn.model_selection import GridSearchCV

model = xgb.XGBRegressor()
# Define the hyperparameters grid
param_grid = {
    'max_depth': [2, 3, 5],
    'learning_rate': [0.01, 0.05,],
    'n_estimators': [200, 400, 600],
    'gamma': [0, 0.1],
    'reg_lambda': [0.03, 0.05, 0.1],
    'min_child_weight': [3, 5, 7, 10]
}

# Perform grid search cross-validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
grid_result = grid_search.fit(X_train, y_train)

print("Best Hyperparameters: ", grid_result.best_params_)



Best Hyperparameters:  {'gamma': 0, 'learning_rate': 0.05, 'max_depth': 2, 'min_child_weight': 5, 'n_estimators': 400, 'reg_lambda': 0.05}


# Prediction

In [32]:
# Extract the best parameters from grid search results
best_params = grid_result.best_params_

# Define the XGBoost model with the best hyperparameters
best_model = xgb.XGBRegressor(**best_params)

# Train the model with the best hyperparameters
best_model.fit(X_train, y_train)

# Make predictions with the trained model
y_pred = best_model.predict(X_val)

r2 = r2_score(y_val, y_pred)
print("R2:", r2)

R2: 0.6018284028355665


# Random Forest

In [33]:
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold

In [41]:
X_train.columns = X_train.columns.astype(str)
X_val.columns = X_val.columns.astype(str)

In [37]:
param_dist = {
    'min_samples_leaf': np.arange(1, 15),
    'max_features': np.linspace(0.1, 0.8, 8),
    'max_depth': np.arange(7, 20)
}

cv = KFold(n_splits = 5, random_state = 13, shuffle = True)
rf = RandomForestRegressor(n_estimators=100)

random_search = RandomizedSearchCV(rf, param_distributions=param_dist, n_iter=1000, cv=cv, n_jobs=-1,random_state = 13)
random_search.fit(X_train, y_train)

In [38]:
print("Best Parameters:", random_search.best_params_)
print("R^2 score",random_search.best_score_ )

Best Parameters: {'min_samples_leaf': 4, 'max_features': 0.7000000000000001, 'max_depth': 17}
R^2 score 0.4689619832344308


In [42]:
best_rf = RandomForestRegressor(**random_search.best_params_)
best_rf.fit(X_train, y_train)
y_pred = best_rf.predict(X_val)
r2 = r2_score(y_val, y_pred)
print("R2:", r2)

R2: 0.5500552068068321


# Test Data

In [127]:
title_test_embedded = np.vstack([get_sentence_encoding(x.strip()) for x in title_test])
title_test_embedded.shape

(132, 512)

In [133]:
desc_test_embedded = np.vstack([get_sentence_encoding(x.strip()) for x in desc_test])
desc_test_embedded.shape

(132, 512)

In [129]:
df_desc_test= pd.DataFrame(desc_test_embedded)
df_title_test = pd.DataFrame(title_test_embedded)

In [141]:
test_encoded = pd.get_dummies(test_clean, columns=['Location', 'Type'], drop_first=True)
test_encoded.shape

(132, 15)

In [152]:
encoded = train_encoded.drop(['Price'], axis=1)
encoded.shape,train_encoded.shape

((866, 23), (866, 24))

In [153]:
# Add missing columns to test_encoded and fill them with zeros
columns_to_add = encoded.columns.difference(test_encoded.columns)
for column in columns_to_add:
    test_encoded[column] = 0
# Reorder the columns in test_encoded to match the order of columns in X
test_encoded = test_encoded[encoded.columns]
test_encoded.shape, encoded.shape

((132, 23), (866, 23))

In [154]:
test = pd.concat([df_desc_test, test_encoded], axis=1)
test.shape, X_train.shape

((132, 535), (692, 535))

Prediction

In [159]:
best_model.fit(X, y)
y_pred = best_model.predict(test)

In [160]:
df = pd.DataFrame({'price': y_pred})
df['id'] = range(len(df))
df.set_index('id', inplace=True)
df.shape

(132, 1)

In [161]:
df.to_csv('submission.csv')