In [1]:
import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, datediff, to_date
from pyspark.sql import functions as F
from pyspark.sql.functions import month, year

In [2]:
# Create (or reuse) the Spark session for this notebook. The SQL Server JDBC
# driver jar is handed to Spark through the "spark.jars" setting.
spark = (
    SparkSession.builder
    .appName("Days on Market")
    .config("spark.jars", "../jdbc/mssql-jdbc-12.6.1.jre8.jar")
    .getOrCreate()
)

In [3]:
# Connection settings for the source SQL Server database.
server_name = "mssql"
port = "1433"
database_name = "Data"
url = f"jdbc:sqlserver://{server_name}:{port};databaseName={database_name}"

table_name = "ARProperties"

# Credentials are read from the environment when available so they do not have
# to be stored in the notebook. The fallbacks preserve the previous behaviour
# when the variables are not set.
# NOTE(review): remove the password fallback once MSSQL_PASSWORD is provisioned
# everywhere this notebook runs.
username = os.environ.get("MSSQL_USER", "SA")
password = os.environ.get("MSSQL_PASSWORD", "YourStrongPassword123")

# Load the source table over JDBC (the "encrypt" option is set to "false").
df = (
    spark.read
    .format("jdbc")
    .option("url", url)
    .option("dbtable", table_name)
    .option("user", username)
    .option("password", password)
    .option("encrypt", "false")
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
    .load()
)

In [4]:
# Parse start_date / end_date with the "yyyy-MM-dd" pattern, then derive
# 'days_on_market' as the number of days from start_date to end_date.
df = (
    df
    .withColumn("start_date", F.to_date(F.col("start_date"), "yyyy-MM-dd"))
    .withColumn("end_date", F.to_date(F.col("end_date"), "yyyy-MM-dd"))
    .withColumn(
        "days_on_market",
        F.datediff(F.col("end_date"), F.col("start_date")),
    )
)



Uwaga: w tym miejscu odrzucam wszystkie rekordy, które są nadal aktywne, ponieważ mają one wpisaną datę końcową "9999-12-31".

In [5]:
# Drop rows whose end_date equals the 9999-12-31 placeholder date.
# Rows with a NULL end_date are dropped as well, because the comparison does
# not evaluate to true for them.
df = df.where(
    F.col("end_date") != F.to_date(F.lit("9999-12-31"), "yyyy-MM-dd")
)

In [6]:

# Preview the first 20 rows of the columns involved in the calculation.
df.select(
    ["id", "property_type", "start_date", "end_date", "days_on_market"]
).show(n=20)


+------+-------------------+----------+----------+--------------+
|    id|      property_type|start_date|  end_date|days_on_market|
+------+-------------------+----------+----------+--------------+
|310340|             Office|2020-04-27|2020-05-10|            13|
|310341|          Apartment|2020-04-27|2020-05-10|            13|
|310342|Commercial Premises|2020-04-27|2020-05-10|            13|
|310343|Commercial Premises|2020-04-27|2020-05-10|            13|
|310344|Commercial Premises|2020-04-27|2020-05-10|            13|
|310345|Commercial Premises|2020-04-27|2020-05-10|            13|
|310346|Commercial Premises|2020-04-27|2020-05-10|            13|
|310347|Commercial Premises|2020-04-27|2020-05-10|            13|
|310348|Commercial Premises|2020-04-27|2020-05-10|            13|
|310349|Commercial Premises|2020-04-27|2020-05-10|            13|
|310350|              House|2020-04-27|2020-05-10|            13|
|310351|              House|2020-04-27|2020-05-10|            13|
|310352|  

Czy to jeszcze nie ten etap? XD Nie do końca rozumiem, kiedy mogę agregować, a kiedy nie, więc na razie dodaję samą liczbę dni.

In [7]:
# For every property type, compute the mean and the median of days_on_market.
result = (
    df
    .groupBy("property_type")
    .agg(
        F.mean(F.col("days_on_market")).alias("average_days"),
        F.median(F.col("days_on_market")).alias("median_days"),
    )
)

In [8]:
# Print up to 50 rows of the per-property-type summary.
result.show(n=50)

+-------------------+------------------+-----------+
|      property_type|      average_days|median_days|
+-------------------+------------------+-----------+
|          Apartment| 57.81203831479828|       30.0|
|      Village House|55.202749140893474|       33.0|
|             Office| 62.79881280474878|       26.0|
|Commercial Premises|62.710833733397344|       30.0|
|              Other|22.298443450290236|        1.0|
|            Terrain|  71.4363674090469|       37.0|
|             Garage| 66.38403614457832|       30.0|
|          Warehouse|59.057755775577554|       24.0|
|              House| 60.06178439592067|       30.0|
|Horizontal Property| 67.50438934236871|       34.0|
+-------------------+------------------+-----------+



In [9]:
# Re-point the connection settings at the target database and table, then
# write the row-level DataFrame (df) there over JDBC in "overwrite" mode.
database_name = "Data_Clean"
table_name = "DaysOnMarket"
url = f"jdbc:sqlserver://{server_name}:{port};databaseName={database_name}"
(
    df.write
    .format("jdbc")
    .options(
        url=url,
        dbtable=table_name,
        user=username,
        password=password,
        encrypt="false",
        driver="com.microsoft.sqlserver.jdbc.SQLServerDriver",
    )
    .mode("overwrite")
    .save()
)