In [1]:
import os
# Set before any Spark import; presumably this silences the PyArrow timezone
# warning emitted by pandas-on-Spark — TODO confirm.
os.environ.update({"PYARROW_IGNORE_TIMEZONE": "1"})

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, mean, stddev
import pyspark.pandas as pd
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import FloatType

In [3]:
# Create (or reuse) the Spark session with the SQL Server JDBC driver jar attached.
spark = (
    SparkSession.builder
    .appName("Outlier Detection")
    .config("spark.jars", "../jdbc/mssql-jdbc-12.6.1.jre8.jar")
    .getOrCreate()
)

In [4]:
def convert_to_common_currency(currency, price):
    """Convert ``price`` from ``currency`` into US dollars, rounded to 2 decimals.

    Args:
        currency: Currency code; the supported codes are "ARS", "USD", "UYU"
            and "PEN".
        price: Amount expressed in ``currency`` (int, float, Decimal, ...).

    Returns:
        The converted amount as a Python ``float``, or ``None`` when the
        currency is not in the rate table or either argument is missing.
    """
    # Fixed, hard-coded conversion factors to USD.
    factors_to_usd = {
        "ARS": 0.0011,
        "USD": 1.0,
        "UYU": 0.026,
        "PEN": 0.27,
    }
    factor = factors_to_usd.get(currency)
    # Return None instead of raising KeyError/TypeError: one unexpected row
    # must not abort the whole Spark job, and the notebook filters out NULL
    # conversions right after applying this function.
    if factor is None or price is None:
        return None
    # float() keeps the result a real Python float: Decimal prices would fail
    # on float * Decimal, and an int result is not accepted by a FloatType UDF
    # (Spark yields NULL on a return-type mismatch).
    return round(factor * float(price), 2)
# Rebind the name to a Spark UDF that wraps the Python converter and returns FloatType.
convert_to_common_currency = F.udf(
    f=convert_to_common_currency,
    returnType=FloatType(),
)

In [5]:
# Connection settings for the source SQL Server database.
server_name = "mssql"
port = "1433"
database_name = "Data"
url = f"jdbc:sqlserver://{server_name}:{port};databaseName={database_name}"

table_name = "ARProperties"
# Credentials are read from the environment so that no secret is stored in the
# notebook. MSSQL_PASSWORD is mandatory: a missing variable raises KeyError here
# rather than failing later inside the JDBC driver.
username = os.environ.get("MSSQL_USER", "SA")
password = os.environ["MSSQL_PASSWORD"]

# Load the full source table over JDBC (transport encryption is disabled).
df = (
    spark.read
    .format("jdbc")
    .option("url", url)
    .option("dbtable", table_name)
    .option("user", username)
    .option("password", password)
    .option("encrypt", "false")
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
    .load()
)

In [6]:
# Discard rows missing any of the fields needed for conversion and grouping.
df = df.dropna(how="any", subset=["currency", "price", "property_type"])


In [7]:
# Add the USD-normalised price, then keep only rows where the conversion produced a value.
df = (
    df.withColumn(
        "common_currency_price",
        convert_to_common_currency(F.col("currency"), F.col("price")),
    )
    .where(F.col("common_currency_price").isNotNull())
)

Średnia i odchylenie standardowe dla każdego property type

In [8]:
# Per-property-type mean and sample standard deviation of the converted price.
stats_df = (
    df.groupBy("property_type")
    .agg(
        F.avg(F.col("common_currency_price")).alias("mean_price"),
        F.stddev_samp(F.col("common_currency_price")).alias("stddev_price"),
    )
)

In [9]:
# Attach each group's statistics to its rows and derive the z-score:
# (price - group mean) / group standard deviation.
df = (
    df.join(stats_df, on="property_type")
    .withColumn(
        "z_score",
        (F.col("common_currency_price") - F.col("mean_price")) / F.col("stddev_price"),
    )
)

Użycie z-score'a

In [10]:
#outliers_df = df.filter(F.abs(col("z_score")) > 5)

In [11]:
#outlier_count = outliers_df.count()
#print(f"Number of outliers: {outlier_count}")

In [12]:
# Print 20 rows showing the converted price next to its z-score.
df.select(["id", "country", "city", "common_currency_price", "z_score"]).show(n=20)

+---+---------+--------------------+---------------------+--------------------+
| id|  country|                city|common_currency_price|             z_score|
+---+---------+--------------------+---------------------+--------------------+
| 35|Argentina|             Córdoba|                 4.51| -0.4860335650258094|
| 36|Argentina|            Recoleta|                 16.5|-0.48598547117500995|
| 37|Argentina|          Villa Luro|                 26.4| -0.4859457606567733|
| 38|Argentina|        Barrio Norte|                18.15|  -0.485978852756579|
| 39|Argentina|Exaltación de la ...|               109.45| -0.5583783225926575|
| 40|Argentina|                NULL|                 16.5|-0.48598547117500995|
| 41|Argentina|             Quilmes|                 17.6|-0.48598105889350574|
| 42|Argentina|             Córdoba|                 16.5|-0.48598547117500995|
| 43|Argentina|             Córdoba|                 16.5|-0.48598547117500995|
| 44|Argentina|             Córdoba|    

In [14]:
# Destination database/table for the scored rows.
database_name_outliers = "Data_Clean"
table_name_outliers = "PropertyPriceOutliers"
url_outliers = f"jdbc:sqlserver://{server_name}:{port};databaseName={database_name_outliers}"

# NOTE(review): the z-score filter cells are commented out, so this appears to
# persist every row rather than only outliers — confirm this is intended.
# "overwrite" mode replaces any existing contents of the target table.
(
    df.write
    .format("jdbc")
    .options(
        url=url_outliers,
        dbtable=table_name_outliers,
        user=username,
        password=password,
        encrypt="false",
        driver="com.microsoft.sqlserver.jdbc.SQLServerDriver",
    )
    .mode("overwrite")
    .save()
)