In [20]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count
import pyspark.pandas as pd
from pyspark.sql import functions as F
import pyspark.sql.types as types

from pyspark.sql.functions import col, to_date
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

In [21]:
spark = SparkSession.builder \
        .appName("Price prediction") \
        .config("spark.jars", "../jdbc/mssql-jdbc-12.6.1.jre8.jar") \
        .getOrCreate() 

In [22]:
server_name = "mssql"
port = "1433"
database_name = "Data_Clean"
url = f"jdbc:sqlserver://{server_name}:{port};databaseName={database_name}"

table_name = "SaleApartmentUSD"
username = "SA"
password = "YourStrongPassword123"

df = spark.read \
        .format("jdbc") \
        .option("url", url) \
        .option("dbtable", table_name) \
        .option("user", username) \
        .option("password", password) \
        .option("encrypt", "false") \
        .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
        .load()

In [25]:
data = df.withColumn("start_date", to_date(col("start_date"), "yyyy-MM-dd")) \
           .withColumn("end_date", to_date(col("end_date"), "yyyy-MM-dd")) \
           .withColumn("created_on", to_date(col("created_on"), "yyyy-MM-dd"))
data = data.select(["city", "rooms","surface_total", "common_currency_price"])
data = data.dropna()

In [26]:
data.count()

149685

In [27]:
categorical_columns = ['city']
indexers = [StringIndexer(inputCol=column, outputCol=column + "_indexed") for column in categorical_columns]

assembler = VectorAssembler(
    inputCols=['rooms', 'surface_total'] + [column + "_indexed" for column in categorical_columns],
    outputCol="features"
)

pipeline = Pipeline(stages=indexers + [assembler])
data = pipeline.fit(data).transform(data)

# Select relevant columns for modeling
data = data.select("features", "common_currency_price")

# Show transformed data
data.show(5)

+----------------+---------------------+
|        features|common_currency_price|
+----------------+---------------------+
|[4.0,163.0,35.0]|             580000.0|
| [4.0,124.0,4.0]|             215000.0|
| [4.0,102.0,4.0]|             280000.0|
|  [4.0,95.0,4.0]|             283000.0|
| [5.0,90.0,96.0]|               9350.0|
+----------------+---------------------+
only showing top 5 rows



In [28]:
train_data, test_data = data.randomSplit([0.8, 0.2], seed=1234)

In [31]:
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor, GBTRegressor

# Initialize regression models
lr = LinearRegression(featuresCol='features', labelCol='common_currency_price')
dt = DecisionTreeRegressor(featuresCol='features', labelCol='common_currency_price')
rf = RandomForestRegressor(featuresCol='features', labelCol='common_currency_price')
gbt = GBTRegressor(featuresCol='features', labelCol='common_currency_price')

# Fit models
lr_model = lr.fit(train_data)
dt_model = dt.fit(train_data)
rf_model = rf.fit(train_data)
gbt_model = gbt.fit(train_data)

# Print the coefficients and intercept for linear regression
print(f"Coefficients: {lr_model.coefficients}")
print(f"Intercept: {lr_model.intercept}")

IllegalArgumentException: requirement failed: DecisionTree requires maxBins (= 32) to be at least as large as the number of values in each categorical feature, but categorical feature 2 has 294 values. Consider removing this and other categorical features with a large number of values, or add more training examples.

In [18]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluate models
evaluator = RegressionEvaluator(
    labelCol="common_currency_price", predictionCol="prediction", metricName="rmse")

# Linear Regression Evaluation
lr_predictions = lr_model.transform(test_data)
lr_rmse = evaluator.evaluate(lr_predictions)
print(f"Linear Regression RMSE: {lr_rmse}")

# Decision Tree Regression Evaluation
dt_predictions = dt_model.transform(test_data)
dt_rmse = evaluator.evaluate(dt_predictions)
print(f"Decision Tree Regression RMSE: {dt_rmse}")

# Random Forest Regression Evaluation
rf_predictions = rf_model.transform(test_data)
rf_rmse = evaluator.evaluate(rf_predictions)
print(f"Random Forest Regression RMSE: {rf_rmse}")

# Gradient-Boosted Tree Regression Evaluation
gbt_predictions = gbt_model.transform(test_data)
gbt_rmse = evaluator.evaluate(gbt_predictions)
print(f"Gradient-Boosted Tree Regression RMSE: {gbt_rmse}")


Linear Regression RMSE: 59894.553318937615
Decision Tree Regression RMSE: 45618.714804471914
Random Forest Regression RMSE: 64071.282617753764
Gradient-Boosted Tree Regression RMSE: 43209.96709261038
