In [12]:
import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, datediff, to_date
from pyspark.sql import functions as F
from pyspark.sql.functions import month, year

In [13]:
# Create (or reuse) the Spark session for this notebook. The SQL Server JDBC
# driver jar is attached via `spark.jars` so the "jdbc" data source used
# further down can load com.microsoft.sqlserver.jdbc.SQLServerDriver.
spark = (
    SparkSession.builder
    .appName("Days on Market")
    .config("spark.jars", "../jdbc/mssql-jdbc-12.6.1.jre8.jar")
    .getOrCreate()
)

In [14]:
# Connection settings for the source SQL Server database.
# `server_name`, `port`, `username` and `password` are reused by the write
# cell at the end of the notebook, so their names must stay stable.
server_name = "mssql"
port = "1433"
database_name = "Data"
url = f"jdbc:sqlserver://{server_name}:{port};databaseName={database_name}"

table_name = "ARProperties"
username = "SA"

# The password is read from the environment instead of being committed with
# the notebook. Fail fast with an actionable message rather than letting the
# JDBC driver report an opaque login failure later.
password = os.environ.get("MSSQL_PASSWORD")
if not password:
    raise RuntimeError(
        "Environment variable MSSQL_PASSWORD is not set; "
        "export it before running this notebook."
    )

# Load the whole source table through the JDBC data source.
# NOTE(review): `encrypt=false` turns off TLS for the connection -- confirm
# this is only ever pointed at a local/dev database.
df = (
    spark.read
    .format("jdbc")
    .option("url", url)
    .option("dbtable", table_name)
    .option("user", username)
    .option("password", password)
    .option("encrypt", "false")
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
    .load()
)

In [18]:
# Parse `start_date` / `end_date` with the yyyy-MM-dd pattern, then derive
# `days_on_market` as the day difference end_date - start_date.
# The columns are replaced one after another so that the datediff below
# operates on the already-parsed date columns.
df = (
    df
    .withColumn("start_date", F.to_date(F.col("start_date"), "yyyy-MM-dd"))
    .withColumn("end_date", F.to_date(F.col("end_date"), "yyyy-MM-dd"))
    .withColumn(
        "days_on_market",
        F.datediff(F.col("end_date"), F.col("start_date")),
    )
)



Uwaga: tutaj odrzucam wszystkie rekordy, które są nadal aktywne, ponieważ ich data końcowa (`end_date`) jest zapisana jako "9999-12-31".

In [19]:
# Keep only rows whose end_date differs from the 9999-12-31 placeholder,
# which the notebook notes describe as the marker for still-active records.
df = df.filter(
    F.col("end_date") != F.to_date(F.lit("9999-12-31"), "yyyy-MM-dd")
)

In [20]:

# Preview the first 20 rows of the columns relevant to the calculation.
df.select(
    ["id", "property_type", "start_date", "end_date", "days_on_market"]
).show(n=20)


+--------------------+-------------------+----------+----------+--------------+
|                  id|      property_type|start_date|  end_date|days_on_market|
+--------------------+-------------------+----------+----------+--------------+
|+dnVA1K6JxzL1zAjO...|            Terrain|2020-12-25|2020-12-29|             4|
|7YggCZumU655Hth4A...|          Apartment|2020-12-25|2021-01-07|            13|
|dLHXKN5/sRZpm9Yk0...|            Terrain|2020-12-25|2020-12-29|             4|
|TOZstKBOcHY5+Ww9m...|          Warehouse|2020-12-25|2020-12-30|             5|
|2LYMXj1wkHEcCjXc7...|          Apartment|2020-12-25|2021-03-05|            70|
|rcT6ivp90Myiw7JT3...|          Apartment|2021-01-01|2021-01-16|            15|
|aCmXfuqY+Dm7TeLby...|Commercial Premises|2021-01-01|2021-01-02|             1|
|0Shyi5ebmoPoRaGTN...|          Apartment|2020-12-25|2021-01-31|            37|
|4lkmDj46g2taeoHQ7...|            Terrain|2020-10-12|2020-10-25|            13|
|a2fx8NUh8Tbbma+t/...|              Othe

**TODO:** Czy to jeszcze nie ten etap? Nie do końca rozumiem, kiedy mogę agregować, a kiedy nie — na razie dodaję tylko liczbę dni.

In [22]:
# Per property type: mean and median of days_on_market.
# This summary is only displayed below; the write cell persists `df`.
result = (
    df
    .groupBy("property_type")
    .agg(
        F.avg("days_on_market").alias("average_days"),
        F.median("days_on_market").alias("median_days"),
    )
)

In [23]:
# Print up to 50 rows of the per-property-type summary.
result.show(n=50)

+-------------------+------------------+-----------+
|      property_type|      average_days|median_days|
+-------------------+------------------+-----------+
|          Apartment| 54.01562143346268|       29.0|
|      Village House| 61.63137254901961|       29.0|
|             Office| 55.59487569761542|       25.0|
|Commercial Premises| 59.40686364470545|       29.0|
|              Other|15.510595194085028|        1.0|
|            Terrain| 65.20711474650822|       36.0|
|             Garage|  57.4502698535081|       29.0|
|          Warehouse|  64.2950226244344|       29.0|
|              House| 53.15666343477309|       30.0|
|Horizontal Property| 63.57030270463908|       31.0|
+-------------------+------------------+-----------+


In [24]:
# Write the row-level frame `df` (including `days_on_market`) to the
# DaysOnMarket table in the Data_Clean database, using save mode "overwrite".
# Note: this rebinds `database_name`, `table_name` and `url`, which were
# previously pointing at the source database/table.
database_name = "Data_Clean"
table_name = "DaysOnMarket"
url = f"jdbc:sqlserver://{server_name}:{port};databaseName={database_name}"

(
    df.write
    .format("jdbc")
    .options(
        url=url,
        dbtable=table_name,
        user=username,
        password=password,
        encrypt="false",
        driver="com.microsoft.sqlserver.jdbc.SQLServerDriver",
    )
    .mode("overwrite")
    .save()
)