In [1]:
import findspark
from pyspark.sql import SparkSession

findspark.init()
spark = SparkSession\
        .builder\
        .master("local[2]")\
        .appName("Spark MLlib")\
        .getOrCreate()
spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/25 10:26:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/12/25 10:26:12 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
import os
from kaggle.api.kaggle_api_extended import KaggleApi

# Inicjalizacja API Kaggle
api = KaggleApi()

# Definiowanie nazwy zbioru i ścieżki
dataset_name = "amrahhasanov23/otodom-pl-flat-prices-in-poland"
download_path = os.getcwd()  # Pobranie danych do bieżącego katalogu

# Pobieranie danych
api.dataset_download_files(dataset_name, path=download_path, unzip=True)

Dataset URL: https://www.kaggle.com/datasets/amrahhasanov23/otodom-pl-flat-prices-in-poland


In [4]:
# Sprawdzenie nagłówków pliku
!head -5 Otodom_Flat_Listings.csv

Title,Price,Location,Surface,Number_of_Rooms,Floor,Finishing_Condition,Heating,Parking_Space,Balcony_Garden_Terrace,Link,Voivodeship,City
2 pokoje 47m2 po remoncie + garderoba + balkon,415000.0,"ul. Marysińska, Stare Bałuty, Bałuty, Łódź, łódzkie",47.0,2 ,,do zamieszkania,miejskie,garaż/miejsce parkingowe,balkon,https://www.otodom.pl/pl/oferta/2-pokoje-47m2-po-remoncie-garderoba-balkon-ID4nlGC,Łódzkie,Łódź
Właściciel- Ludwiki DD - WIDOK NA ZIELEŃ - 2x gar,2499000.0,"ul. Ludwiki, Czyste, Wola, Warszawa, mazowieckie",105.0,4 ,2/8,do wykończenia,miejskie,garaż/miejsce parkingowe,balkon,https://www.otodom.pl/pl/oferta/wlasciciel-ludwiki-dd-widok-na-zielen-2x-gar-ID4mYBj,Mazowieckie,Warszawa
"Sprzedam mieszkanie 60m2, 2 balkony, garderoba",649000.0,"ul. Londyńska, Gorzów Wielkopolski, lubuskie",60.0,3 ,4/4,do zamieszkania,miejskie,,"balkon, taras",https://www.otodom.pl/pl/oferta/sprzedam-mieszkanie-60m2-2-balkony-garderoba-ID4nNUL,Lubuskie,Gorzów Wielkopolski
"Wyjątkowy, duży apartament z a

In [5]:
df = spark.read.csv("./Otodom_Flat_Listings.csv", header=True, inferSchema=True)

In [6]:
df.printSchema()

root
 |-- Title: string (nullable = true)
 |-- Price: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- Surface: string (nullable = true)
 |-- Number_of_Rooms: string (nullable = true)
 |-- Floor: string (nullable = true)
 |-- Finishing_Condition: string (nullable = true)
 |-- Heating: string (nullable = true)
 |-- Parking_Space: string (nullable = true)
 |-- Balcony_Garden_Terrace: string (nullable = true)
 |-- Link: string (nullable = true)
 |-- Voivodeship: string (nullable = true)
 |-- City: string (nullable = true)



In [7]:
from pyspark.sql.functions import col

# Konwersja kolumn 'Surface', 'Number_of_Rooms', 'Floor' na float
df = df.withColumn("Surface", col("Surface").cast("float")) \
       .withColumn("Number_of_Rooms", col("Number_of_Rooms").cast("int")) \
       .withColumn("Floor", col("Floor").cast("string"))

In [8]:
df.printSchema()

root
 |-- Title: string (nullable = true)
 |-- Price: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- Surface: float (nullable = true)
 |-- Number_of_Rooms: integer (nullable = true)
 |-- Floor: string (nullable = true)
 |-- Finishing_Condition: string (nullable = true)
 |-- Heating: string (nullable = true)
 |-- Parking_Space: string (nullable = true)
 |-- Balcony_Garden_Terrace: string (nullable = true)
 |-- Link: string (nullable = true)
 |-- Voivodeship: string (nullable = true)
 |-- City: string (nullable = true)



In [9]:
df = df.fillna({
    'Surface': df.agg({'Surface': 'avg'}).collect()[0][0],
    'Number_of_Rooms': df.agg({'Number_of_Rooms': 'avg'}).collect()[0][0],
    'Price': df.agg({'Price': 'avg'}).collect()[0][0],
    'Heating': 'Unknown',
    'Balcony_Garden_Terrace': 'Unknown',
    'Voivodeship': 'Unknown',
    'City': 'Unknown',
    'Floor': 'Unknown'
})

In [10]:
from pyspark.ml.feature import StringIndexer

# Konwertowanie zmiennych kategorycznych na liczby
string_columns = ['Heating', 'Balcony_Garden_Terrace', 'Voivodeship', 'City', 'Floor']

def convert_string_columns(df, string_columns):
    for col_name in string_columns:
        indexer = StringIndexer(inputCol=col_name, outputCol=f"{col_name}_index", handleInvalid="skip")
        df = indexer.fit(df).transform(df)
    return df

df = convert_string_columns(df, string_columns)

In [11]:
# Wybór cech (features)
feature_columns = ['Surface', 'Number_of_Rooms', 'Heating_index', 'Balcony_Garden_Terrace_index', 
                   'Voivodeship_index', 'City_index', 'Floor_index']

In [12]:
from pyspark.ml.feature import VectorAssembler

# Tworzenie wektora cech
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [13]:
# Kolumna celu
label_col = "Price"

In [14]:
from pyspark.ml.regression import RandomForestRegressor

# Model RandomForest
rf = RandomForestRegressor(featuresCol="features", labelCol=label_col, maxBins=500)

In [15]:
from pyspark.ml import Pipeline

# Tworzymy pipeline
pipeline = Pipeline(stages=[assembler, rf])

In [16]:
# Podział na dane treningowe i testowe
df_train, df_test = df.randomSplit([0.8, 0.2], seed=42)

In [17]:
# Dopasowanie modelu
model = pipeline.fit(df_train)

In [18]:
# Przewidywania na zbiorze testowym
predictions = model.transform(df_test)

In [19]:
from pyspark.ml.evaluation import RegressionEvaluator

# Ocena modelu - RMSE
evaluator = RegressionEvaluator(labelCol=label_col, predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)

In [20]:
print(f"RMSE: {rmse}")

RMSE: 9669423.213467956


In [22]:
# Zapisz tylko kolumny prediction i Price do pliku CSV
predictions.select("prediction", "Price").write.csv("experiment_results.csv", header=True)

In [23]:
# Eksperyment 1: Tylko cechy numeryczne
selected_features_1 = ["Surface", "Number_of_Rooms"]

assembler = VectorAssembler(inputCols=selected_features_1, outputCol="features")
df = assembler.transform(df)

train_data, test_data = df.randomSplit([0.8, 0.2], seed=1234)

model = rf.fit(train_data)
predictions = model.transform(test_data)

rmse = evaluator.evaluate(predictions)
print(f"RMSE for Experiment 1: {rmse}")

RMSE for Experiment 1: 6167195.447187548


In [26]:
# Usuwanie kolumn, jeśli już istnieją
df = df.drop("Voivodeship_index", "City_index", "features")

# Kodowanie zmiennych kategorycznych
indexer_voivodeship = StringIndexer(inputCol="Voivodeship", outputCol="Voivodeship_index")
indexer_city = StringIndexer(inputCol="City", outputCol="City_index")

# Aplikowanie StringIndexer
df = indexer_voivodeship.fit(df).transform(df)
df = indexer_city.fit(df).transform(df)

selected_features_2 = ["Surface", "Number_of_Rooms", "Voivodeship_index", "City_index"]
assembler = VectorAssembler(inputCols=selected_features_2, outputCol="features")
df = assembler.transform(df)

train_data, test_data = df.randomSplit([0.8, 0.2], seed=1234)

model = rf.fit(train_data)
predictions = model.transform(test_data)

rmse = evaluator.evaluate(predictions)
print(f"RMSE for Experiment 2: {rmse}")

RMSE for Experiment 2: 8625881.46598957


In [31]:
from pyspark.ml.feature import OneHotEncoder

# Usuwanie kolumn, jeśli już istnieją
df = df.drop("Voivodeship_index", "City_index", "features", "Voivodeship_ohe", "City_ohe")

# Indeksowanie zmiennych kategorycznych
indexer_voivodeship = StringIndexer(inputCol="Voivodeship", outputCol="Voivodeship_index")
indexer_city = StringIndexer(inputCol="City", outputCol="City_index")

df = indexer_voivodeship.fit(df).transform(df)
df = indexer_city.fit(df).transform(df)

# OneHotEncoder
encoder_voivodeship = OneHotEncoder(inputCol="Voivodeship_index", outputCol="Voivodeship_ohe")
encoder_city = OneHotEncoder(inputCol="City_index", outputCol="City_ohe")

df = encoder_voivodeship.fit(df).transform(df)
df = encoder_city.fit(df).transform(df)

selected_features_3 = ["Surface", "Number_of_Rooms", "Voivodeship_ohe", "City_ohe"]
assembler = VectorAssembler(inputCols=selected_features_3, outputCol="features")
df = assembler.transform(df)

train_data, test_data = df.randomSplit([0.8, 0.2], seed=1234)

model = rf.fit(train_data)
predictions = model.transform(test_data)

rmse = evaluator.evaluate(predictions)
print(f"RMSE for Experiment 3: {rmse}")

RMSE for Experiment 3: 6289448.386006473


In [32]:
from pyspark.ml.regression import LinearRegression

# Linear Regression
lr = LinearRegression(featuresCol="features", labelCol="Price")
lr_model = lr.fit(train_data)
lr_predictions = lr_model.transform(test_data)

lr_rmse = evaluator.evaluate(lr_predictions)
print(f"RMSE for Linear Regression: {lr_rmse}")

24/12/25 10:29:26 WARN Instrumentation: [e69d2f17] regParam is zero, which might cause numerical instability and overfitting.
24/12/25 10:29:26 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/12/25 10:29:26 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
24/12/25 10:29:26 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
24/12/25 10:29:26 WARN Instrumentation: [e69d2f17] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.


RMSE for Linear Regression: 6324574.014895729
