In [26]:
import os

from pyspark.sql import SparkSession

# Optional: pin the Python interpreter PySpark uses (left disabled, as before).
# os.environ['PYSPARK_PYTHON'] = 'python3.11'
# os.environ['PYSPARK_DRIVER_PYTHON'] = 'python3.11'

# Address advertised as `spark.driver.host`. The previous hard-coded local IP
# stays the default, but it can now be overridden per machine by exporting
# SPARK_DRIVER_HOST before starting the kernel (an empty value falls back to
# the default as well).
SPARK_DRIVER_HOST = os.environ.get("SPARK_DRIVER_HOST") or "10.48.200.233"

# Create (or reuse) the Spark session shared by every cell of this notebook.
spark = (
    SparkSession.builder
    .appName("HouseRentPrediction")
    .config("spark.driver.host", SPARK_DRIVER_HOST)
    .getOrCreate()
)

# Set the log level to ERROR
spark.sparkContext.setLogLevel("ERROR")

In [27]:
# Parquet snapshot of the Idealista listings (path relative to the working directory).
idealista_path = "model_temp/20240614-2306-idealista"

# Read the snapshot into a Spark DataFrame and preview its first five rows.
df_idealista = spark.read.format("parquet").load(idealista_path)
df_idealista.show(n=5)

+------------+--------------------+-----------------+---------+-----+--------+------------+---------+-----+--------+-----+---------+--------------------+---------+--------------------+------------------+-------+----------+---------+-----------+--------------------+--------+--------+------+--------------+-------+-----------+--------------------+--------------------+-------+---------+------+----------+-----------------+----------------------+------------------+------------+---------------------------------------------------------------------------------------------------------------------------+
|propertyCode|           thumbnail|externalReference|numPhotos|floor|   price|propertyType|operation| size|exterior|rooms|bathrooms|             address| province|        municipality|          district|country|  latitude|longitude|showAddress|                 url|distance|hasVideo|status|newDevelopment|hasLift|priceByArea|        detailedType|      suggestedTexts|hasPlan|has3DTour|has360|hasStag

## Explore the data

In [28]:
# Collect the column names of the loaded frame (taken from its schema) and print them.
column_names = df_idealista.schema.names
print(column_names)

['propertyCode', 'thumbnail', 'externalReference', 'numPhotos', 'floor', 'price', 'propertyType', 'operation', 'size', 'exterior', 'rooms', 'bathrooms', 'address', 'province', 'municipality', 'district', 'country', 'latitude', 'longitude', 'showAddress', 'url', 'distance', 'hasVideo', 'status', 'newDevelopment', 'hasLift', 'priceByArea', 'detailedType', 'suggestedTexts', 'hasPlan', 'has3DTour', 'has360', 'hasStaging', 'topNewDevelopment', 'newDevelopmentFinished', 'parkingSpace', 'neighborhood', 'district,neighborhood,district_n_reconciled,district_n,district_id,neighborhood_n_reconciled,neighborhood_n,neighborhood_id']


In [37]:
# Distinct values of the `operation` column.
unique_operations = (
    df_idealista
    .select("operation")
    .distinct()
)
unique_operations.show(n=20, truncate=True)

+---------+
|operation|
+---------+
|     sale|
+---------+



In [38]:
# Distinct values of the `propertyType` column.
# NOTE: the variable keeps the name `unique_operations` although it holds
# property types here.
unique_operations = (
    df_idealista
    .select("propertyType")
    .distinct()
)
unique_operations.show(n=20, truncate=True)

+------------+
|propertyType|
+------------+
|   penthouse|
|      duplex|
|      studio|
|      chalet|
|        flat|
|countryHouse|
+------------+



In [39]:
# Distinct values of the `exterior` flag.
# NOTE: the variable keeps the name `unique_operations` although it holds
# `exterior` values here.
unique_operations = (
    df_idealista
    .select("exterior")
    .distinct()
)
unique_operations.show(n=20, truncate=True)

+--------+
|exterior|
+--------+
|    true|
|   false|
+--------+



In [40]:
# Distinct values of the `district` column (show() prints at most 20 rows).
# NOTE: the variable keeps the name `unique_operations` although it holds
# districts here.
unique_operations = (
    df_idealista
    .select("district")
    .distinct()
)
unique_operations.show(n=20, truncate=True)

+--------------------+
|            district|
+--------------------+
|La Miranda - Can ...|
|           Collblanc|
|           Can Matas|
|                Golf|
|              Gràcia|
|            El Pedró|
|La Florida - Les ...|
|             El Gall|
|          Sant Martí|
| Roses - Castellbell|
|    Parc Empresarial|
|         Can Vidalet|
|       Can Sant Joan|
|Camps Blancs - Ca...|
|           Les Corts|
|         Granvia L-H|
|              Almeda|
|Sant Francesc-El ...|
|       Sant Ildefons|
|Mas Lluí - Els Mi...|
+--------------------+
only showing top 20 rows



In [41]:
# Distinct values of the `status` column.
# NOTE: the variable keeps the name `unique_operations` although it holds
# status values here.
unique_operations = (
    df_idealista
    .select("status")
    .distinct()
)
unique_operations.show(n=20, truncate=True)

+--------------+
|        status|
+--------------+
|         renew|
|newdevelopment|
|          good|
|          NULL|
+--------------+



In [57]:
# Distinct values of the `bathrooms` column.
# NOTE: the variable keeps the name `unique_operations` although it holds
# bathroom counts here.
unique_operations = (
    df_idealista
    .select("bathrooms")
    .distinct()
)
unique_operations.show(n=20, truncate=True)

+---------+
|bathrooms|
+---------+
|      8.0|
|      0.0|
|      7.0|
|      1.0|
|      4.0|
|     11.0|
|      3.0|
|     19.0|
|      2.0|
|     10.0|
|      6.0|
|      5.0|
|      9.0|
+---------+



# Model

Linear regression on numeric features plus one-hot-encoded categorical variables.

## The final model

This version fits the model without scaling the `size` feature.

In [65]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import col

# NOTE(review): this cell overwrites `df_idealista` in place, and its execution
# count (65) is higher than that of the scaled-size cell (64), so the frame it
# received may already have carried columns derived by that cell.

# Remove rows that are identical across every column.
df_idealista = df_idealista.dropDuplicates()

# Keep only rows where every model input is present, then cast the inputs to
# numeric types.
df_idealista = df_idealista.dropna(subset=['price', 'size', 'rooms', 'bathrooms', 'status', 'exterior'])
df_idealista = (
    df_idealista
    .withColumn("price", col("price").cast("double"))
    .withColumn("size", col("size").cast("double"))
    .withColumn("rooms", col("rooms").cast("integer"))
    .withColumn("bathrooms", col("bathrooms").cast("integer"))
    # exterior: true is mapped to 1 and false is mapped to 0
    .withColumn("exterior", col("exterior").cast("integer"))
)

# Encode `status`: string -> category index -> one-hot vector.
# A stale output column from a previous run is removed first; DataFrame.drop
# is a no-op when the named column does not exist.
# NOTE(review): both encoders are fitted on the full frame, i.e. before the
# train/test split below.
df_idealista = df_idealista.drop("status_indexed")
status_indexer = StringIndexer(inputCol="status", outputCol="status_indexed")
df_idealista = status_indexer.fit(df_idealista).transform(df_idealista)

df_idealista = df_idealista.drop("status_encoded")
status_encoder = OneHotEncoder(inputCols=["status_indexed"], outputCols=["status_encoded"])
df_idealista = status_encoder.fit(df_idealista).transform(df_idealista)

# Assemble all model inputs into a single `features` vector column.
df_idealista = df_idealista.drop("features")
assembler = VectorAssembler(
    inputCols=["size", "rooms", "bathrooms", "status_encoded", "exterior"],
    outputCol="features",
)
df_idealista = assembler.transform(df_idealista)

# 80/20 train/test split with a fixed seed.
train_data, test_data = df_idealista.randomSplit([0.8, 0.2], seed=1234)

# Fit a linear regression of `price` on the assembled features.
lr = LinearRegression(featuresCol="features", labelCol="price")
lr_model = lr.fit(train_data)

# Score the held-out rows and report the RMSE.
predictions = lr_model.transform(test_data)
evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# Coefficients follow the assembler's input order, with the one-hot `status`
# vector expanded in place.
coefficients = lr_model.coefficients
print("Coefficients:", coefficients)

Root Mean Squared Error (RMSE) on test data = 349028
Coefficients: [0.450335487142948,38132.70634222065,392435.04986314743,153063.20392982973,191678.20414232666,-7540.791973886098]


In [64]:
# scaling size
#
# Same linear regression as the unscaled variant, except that `size` is
# divided by its standard deviation (StandardScaler with withStd=True,
# withMean=False) before being assembled with the other features.
#
# NOTE(review): LinearRegression is used with its default parameters here; per
# the Spark documentation it standardizes features internally by default and
# applies no regularization, so pre-scaling `size` is not expected to change
# the fit. The RMSE gap versus the unscaled run more likely comes from the two
# runs receiving different train/test splits - confirm before comparing them.
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Drop duplicates based on all columns (if rows are completely identical)
df_idealista = df_idealista.dropDuplicates()

# Prepare the DataFrame: keep only rows with every model input present and
# cast the inputs to numeric types (exterior: true -> 1, false -> 0).
df_idealista = df_idealista.dropna(subset=['price', 'size', 'rooms', 'bathrooms', 'status', 'exterior'])
df_idealista = df_idealista.withColumn("price", col("price").cast("double")) \
                           .withColumn("size", col("size").cast("double")) \
                           .withColumn("rooms", col("rooms").cast("integer")) \
                           .withColumn("bathrooms", col("bathrooms").cast("integer")) \
                           .withColumn("exterior", col("exterior").cast("integer"))

# Remove every column this cell derives BEFORE anything is derived from the
# frame, so the cell can be re-run. Previously `scaled_size` was dropped from
# `df_idealista` only after `size_df` had been built from it, so on a re-run
# `size_df` still carried `scaled_size` and the scaler rejected its
# already-existing output column. DataFrame.drop ignores names that are absent.
df_idealista = df_idealista.drop(
    "status_indexed",
    "status_encoded",
    "size_features",
    "scaled_size",
    "other_features",
    "features",
)

# StringIndexer and OneHotEncoder for 'status'
status_indexer = StringIndexer(inputCol="status", outputCol="status_indexed")
df_idealista = status_indexer.fit(df_idealista).transform(df_idealista)

status_encoder = OneHotEncoder(inputCols=["status_indexed"], outputCols=["status_encoded"])
df_idealista = status_encoder.fit(df_idealista).transform(df_idealista)

# Scale only the 'size' feature: StandardScaler works on vector columns, so
# `size` is first wrapped into a one-element vector.
size_assembler = VectorAssembler(inputCols=["size"], outputCol="size_features")
size_df = size_assembler.transform(df_idealista)

scaler = StandardScaler(inputCol="size_features", outputCol="scaled_size", withStd=True, withMean=False)
df_idealista = scaler.fit(size_df).transform(size_df)

# Assemble other features
other_features_assembler = VectorAssembler(inputCols=["rooms", "bathrooms", "status_encoded", "exterior"], outputCol="other_features")
df_idealista = other_features_assembler.transform(df_idealista)

# Combine scaled size with other features
final_assembler = VectorAssembler(inputCols=["scaled_size", "other_features"], outputCol="features")
df_idealista = final_assembler.transform(df_idealista)

# Linear Regression Model using combined features
lr = LinearRegression(featuresCol="features", labelCol="price")

# Split the data (80/20, fixed seed)
train_data, test_data = df_idealista.randomSplit([0.8, 0.2], seed=1234)

# Train the model
lr_model = lr.fit(train_data)

# Make predictions
predictions = lr_model.transform(test_data)

# Evaluate the model
evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# Display coefficients (scaled size first, then rooms, bathrooms, the one-hot
# status entries and exterior)
coefficients = lr_model.coefficients
print("Coefficients:", coefficients)

Root Mean Squared Error (RMSE) on test data = 402209
Coefficients: [774.4352699404948,37961.09367814587,383370.4673475959,141776.6014140781,178639.25549421436,-8690.246685712456]
