In [1]:
import findspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import col

In [2]:
# Let findspark configure this process for the local Spark installation
# before the session is created.
findspark.init()

# Build (or reuse, if one already exists) a session running on a local
# master with two worker threads.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("Spark MLlib")
    .getOrCreate()
)

# Last expression of the cell: display the underlying SparkContext.
spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/02 17:36:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the flat listings CSV. The first row is used as the header and Spark
# infers each column's type from the data.
listings_path = "data/Otodom_Flat_Listings.csv"
df = spark.read.csv(listings_path, inferSchema=True, header=True)

# Inspect the inferred schema and a small sample of rows.
df.printSchema()
df.show(5)

root
 |-- Title: string (nullable = true)
 |-- Price: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- Surface: string (nullable = true)
 |-- Number_of_Rooms: string (nullable = true)
 |-- Floor: string (nullable = true)
 |-- Finishing_Condition: string (nullable = true)
 |-- Heating: string (nullable = true)
 |-- Parking_Space: string (nullable = true)
 |-- Balcony_Garden_Terrace: string (nullable = true)
 |-- Link: string (nullable = true)
 |-- Voivodeship: string (nullable = true)
 |-- City: string (nullable = true)

+--------------------+---------+--------------------+-------+---------------+-----+-------------------+--------+--------------------+----------------------+--------------------+------------------+-------------------+
|               Title|    Price|            Location|Surface|Number_of_Rooms|Floor|Finishing_Condition| Heating|       Parking_Space|Balcony_Garden_Terrace|                Link|       Voivodeship|               City|
+------------------

In [4]:
# Drop incomplete rows. No `subset` is given, so a null in ANY column removes
# the row.
# NOTE(review): this also discards rows that are only missing values in
# columns the model never uses (e.g. Title, Link) - confirm this is intended.
df = df.na.drop()

# Step 1: map each distinct City string to a numeric index.
indexer = StringIndexer(inputCol="City", outputCol="City_Index")
indexer_model = indexer.fit(df)
df_indexed = indexer_model.transform(df)

# Step 2: turn the numeric city index into a one-hot encoded vector column.
encoder = OneHotEncoder(inputCols=["City_Index"], outputCols=["City_OneHot"])
encoder_model = encoder.fit(df_indexed)
df_encoded = encoder_model.transform(df_indexed)

                                                                                

In [5]:
# Both numeric features were read as strings (see the printed schema), so
# convert them to numeric types before they are assembled into a vector.
numeric_casts = {"Surface": "float", "Number_of_Rooms": "int"}
for column_name, spark_type in numeric_casts.items():
    df_encoded = df_encoded.withColumn(column_name, col(column_name).cast(spark_type))

In [6]:
# Input columns that are combined into the single model feature vector.
features = ["Surface", "Number_of_Rooms", "City_OneHot"]

# handleInvalid="skip" makes the assembler drop rows whose inputs contain
# invalid values (e.g. nulls) instead of raising an error.
assembler = VectorAssembler(
    inputCols=features,
    outputCol="features",
    handleInvalid="skip",
)

# Keep only what the regression needs: the feature vector and the label.
assembled = assembler.transform(df_encoded)
df_final = assembled.select("features", "Price")

In [7]:
# Fixed seed for the train/test split. Without it, randomSplit picks a new
# random seed on every run, so the reported metrics below would change on
# each Restart & Run All and could not be compared between runs.
SPLIT_SEED = 42

# 80% of the rows for training, 20% held out for evaluation.
train_data, test_data = df_final.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

# Linear regression on the assembled feature vector, predicting Price,
# with regularisation strength 0.1.
lr = LinearRegression(featuresCol="features", labelCol="Price", regParam=0.1)
lrModel = lr.fit(train_data)

# Evaluate on the held-out rows and report the error metrics.
test_stats = lrModel.evaluate(test_data)
print(f"RMSE: {test_stats.rootMeanSquaredError}")
print(f"R2: {test_stats.r2}")
print(f"MSE: {test_stats.meanSquaredError}")

25/01/02 17:36:35 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/01/02 17:36:35 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
25/01/02 17:36:35 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


RMSE: 5721116.441732182
R2: 0.4060548844056562
MSE: 32731173339858.3
