In [52]:
import os
# pandas-on-Spark (pyspark.pandas, imported in the next cell) warns about PyArrow
# timezone handling unless this flag is already set when it is imported.
os.environ.update({"PYARROW_IGNORE_TIMEZONE": "1"})

In [53]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, mean, stddev
import pyspark.pandas as pd
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import FloatType

In [54]:
# Create (or reuse) the Spark session; the SQL Server JDBC driver jar is
# registered through spark.jars so the "jdbc" data source can load it.
spark = (
    SparkSession.builder
    .appName("Outlier Detection")
    .config("spark.jars", "../jdbc/mssql-jdbc-12.6.1.jre8.jar")
    .getOrCreate()
)

In [55]:
# Fixed conversion factors from each supported currency code to USD.
_FACTORS_TO_USD = {
    "ARS": 0.0011,
    "USD": 1,
    "UYU": 0.026,
    "PEN": 0.27,
}


def convert_to_common_currency(currency, price):
    """Convert ``price`` from ``currency`` to USD, rounded to two decimals.

    Args:
        currency: Currency code; supported codes are the keys of
            ``_FACTORS_TO_USD``.
        price: Amount in ``currency``; any value ``float()`` accepts
            (int, float, ``decimal.Decimal``).

    Returns:
        The USD amount as a ``float``, or ``None`` when either argument is
        ``None`` or the currency code is not supported.
    """
    if currency is None or price is None:
        return None

    factor = _FACTORS_TO_USD.get(currency)
    if factor is None:
        # Unknown currency: yield a null instead of raising KeyError, which
        # would abort the whole Spark job when this runs as a UDF.
        return None

    # float() keeps the result a float even for the integer "USD" factor with
    # an integer price (a UDF declared with a float return type nulls out
    # Python ints), and lets Decimal prices be multiplied by float factors.
    return round(factor * float(price), 2)
# Rebind the name to a Spark UDF so it can be applied to DataFrame columns.
# NOTE(review): FloatType is Spark's 32-bit float, so large amounts keep fewer
# significant digits than the Python value returned by the function.
convert_to_common_currency = F.udf(f=convert_to_common_currency, returnType=FloatType())

In [56]:
# Connection settings are read from the environment so real credentials do not
# have to be committed with the notebook; each fallback is the value that was
# previously hard-coded here, so the notebook still runs without any setup.
server_name = os.environ.get("MSSQL_HOST", "mssql")
port = os.environ.get("MSSQL_PORT", "1433")
database_name = "Data"
url = f"jdbc:sqlserver://{server_name}:{port};databaseName={database_name}"

table_name = "ARProperties"
username = os.environ.get("MSSQL_USER", "SA")
password = os.environ.get("MSSQL_SA_PASSWORD", "YourStrongPassword123")

# Load the source table through the SQL Server JDBC driver.
df = (
    spark.read
    .format("jdbc")
    .option("url", url)
    .option("dbtable", table_name)
    .option("user", username)
    .option("password", password)
    .option("encrypt", "false")
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
    .load()
)

In [57]:
# Discard rows missing any field needed for currency conversion or grouping.
df = df.dropna(how="any", subset=["currency", "price", "property_type"])


In [58]:
# Add the price converted by the UDF, then keep only rows where the
# conversion produced a value.
converted_price = convert_to_common_currency(F.col("currency"), F.col("price"))
df = (
    df.withColumn("common_currency_price", converted_price)
    .where(F.col("common_currency_price").isNotNull())
)

Średnia i odchylenie standardowe ceny dla każdego property type

In [59]:
# One row per property type: mean and standard deviation of the converted price.
stats_df = (
    df.groupBy("property_type")
    .agg(
        F.mean("common_currency_price").alias("mean_price"),
        F.stddev("common_currency_price").alias("stddev_price"),
    )
)

In [60]:
# Attach each property type's statistics to its rows and derive the z-score.
#
# The derived columns are dropped first (a no-op when they do not exist yet) so
# the cell can be re-run: joining stats_df onto a frame that already carries
# mean_price/stddev_price would create duplicate, ambiguous columns.
df = (
    df.drop("mean_price", "stddev_price", "z_score")
    .join(stats_df, "property_type")
    .withColumn(
        "z_score",
        # Leave z_score null when the group's stddev is null or zero instead of
        # dividing by it; the outcome of a division by zero depends on the
        # session's ANSI setting.
        F.when(
            col("stddev_price") > 0,
            (col("common_currency_price") - col("mean_price")) / col("stddev_price"),
        ),
    )
)

Użycie z-score'a do wykrycia wartości odstających (|z-score| > 5)

In [61]:
# A row is an outlier when its price lies more than this many standard
# deviations away from the mean of its property type.
Z_SCORE_THRESHOLD = 5

# Cached because the result feeds several actions (count, preview, JDBC write);
# without it each action would re-read the source table and re-run the UDF.
outliers_df = df.filter(F.abs(col("z_score")) > Z_SCORE_THRESHOLD).cache()

In [65]:
# count() is a Spark action: it executes the pipeline and returns the number
# of rows flagged as outliers.
outlier_count = outliers_df.count()
print(f"Number of outliers: {outlier_count}")

Number of outliers: 1135


In [63]:
# Print the first 20 flagged rows, restricted to the columns useful for inspection.
outliers_df.select(
    ["id", "country", "city", "common_currency_price", "z_score"]
).show(n=20)

+--------------------+---------+--------------+---------------------+------------------+
|                  id|  country|          city|common_currency_price|           z_score|
+--------------------+---------+--------------+---------------------+------------------+
|0ZGBmcC2APh0c/Kdy...|Argentina|       Córdoba|            3990000.0| 15.51849115971297|
|OZUkT9TvNsWqdDGgT...|Argentina|       Palermo|            1980000.0| 7.456052297977117|
|KW/JsISYcoqVcrJ+C...|  Uruguay|Punta del Este|            3000000.0|11.547439183037103|
|qF1ortskmiYPt3fcA...|Argentina|       Palermo|            2200000.0| 8.338508292793977|
|JfshAiz67d0cKrxrp...|Argentina| Puerto Madero|            1400000.0| 5.129577402550851|
|YvQpq2WnoKilWkRDA...|Argentina|       Palermo|            2900000.0|11.146322821756712|
|UYtdNzuJvuKyGkIQz...|Argentina|       Palermo|            2500000.0| 9.541857376635148|
|cxrfS/cqkekz+j4jB...|Argentina|       Palermo|            1900000.0| 7.135159208952804|
|5wndrM9yIFdsGW0Tv...

In [64]:
# Write the flagged rows to the Data_Clean database on the same server;
# "overwrite" mode replaces any existing contents of the target table.
database_name_outliers = "Data_Clean"
table_name_outliers = "PropertyPriceOutliers"
url_outliers = f"jdbc:sqlserver://{server_name}:{port};databaseName={database_name_outliers}"
(
    outliers_df.write
    .format("jdbc")
    .options(
        url=url_outliers,
        dbtable=table_name_outliers,
        user=username,
        password=password,
        encrypt="false",
        driver="com.microsoft.sqlserver.jdbc.SQLServerDriver",
    )
    .mode("overwrite")
    .save()
)