In [None]:
import os

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler,StandardScaler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [None]:
# --- Spark session ----------------------------------------------------------
# The PostgreSQL JDBC driver jar is referenced by a path relative to the
# working directory, so the notebook must be started next to that jar.
spark = (
    SparkSession.builder
    .config("spark.jars", "./postgresql-42.6.0.jar")
    .appName("BDM2")
    .getOrCreate()
)

# --- Load the modelling table over JDBC -------------------------------------
# Connection settings can be overridden through environment variables so that
# real credentials do not have to live in the notebook; the fallbacks are the
# values this notebook has always used.
url = os.environ.get("BDM_JDBC_URL", "jdbc:postgresql://10.4.41.64:5432/bdmdb")
properties = {
    "user": os.environ.get("BDM_DB_USER", "bdm"),
    "password": os.environ.get("BDM_DB_PASSWORD", "bdm"),
    "driver": "org.postgresql.Driver",
}

table_name = "idealista_model"
df = spark.read.jdbc(url=url, table=table_name, properties=properties)

# --- Assemble the feature vector --------------------------------------------
# Every column except the last one is a feature (the last column is presumably
# the "price" label -- TODO confirm against the table schema). "price" is also
# excluded by name so the label can never leak into the features.
feature_cols = [name for name in df.columns[:-1] if name != "price"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
# NOTE(review): assembler2 is not used by any visible cell; it is kept only in
# case a cell outside this view relies on it.
assembler2 = VectorAssembler(inputCols=["price"], outputCol="target")
data = assembler.transform(df).select("features", "price")

# --- Train / test split ------------------------------------------------------
# Split BEFORE fitting the scaler so the held-out rows do not influence the
# mean / standard deviation used for standardisation.
train_raw, test_raw = data.randomSplit([0.8, 0.2], seed=42)

# --- Standardise features ----------------------------------------------------
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=True)
scalerModel = scaler.fit(train_raw)
train_data = scalerModel.transform(train_raw)
test_data = scalerModel.transform(test_raw)

# --- Fit the random forest ---------------------------------------------------
# Train on the standardised vector; previously "scaledFeatures" was computed
# but the model silently read the unscaled "features" column instead.
rf = RandomForestRegressor(featuresCol="scaledFeatures", labelCol="price", numTrees=30, maxDepth=10)
model = rf.fit(train_data)

In [None]:
# Score the held-out split with the fitted model.
predictions = model.transform(test_data)

# Root-mean-squared error between the true "price" and the "prediction" column.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="price",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE):", rmse)

In [None]:
# Bring only the two columns needed for the plot to the driver; the feature
# vector columns are not used here and are costly to collect into pandas.
pdf = predictions.select("price", "prediction").toPandas()

# Absolute prediction error per row, and its log10 (shifted by 1 so that a
# zero error maps to 0 instead of -inf).
pdf["dif"] = np.abs(pdf["price"] - pdf["prediction"])
pdf["ldif"] = np.log10(pdf["dif"] + 1)

# Histogram of the log-scaled absolute error.
fig, ax = plt.subplots()
ax.hist(pdf["ldif"], bins=30)  # Adjust the number of bins as needed
ax.set_xlabel("Log error")
ax.set_ylabel("Frequency")
ax.set_title("Histogram of Log Error")

plt.show()