In [3]:
import os

import pyspark.pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, when
from pyspark.sql.types import FloatType

In [4]:
# Create (or reuse) the Spark session and put the SQL Server JDBC driver jar
# on its classpath so the "jdbc" data source can talk to the database.
spark = (
    SparkSession.builder
    .appName("Sale Apartment by Country, Province, City")
    .config("spark.jars", "../jdbc/mssql-jdbc-12.6.1.jre8.jar")
    .getOrCreate()
)

In [5]:
def convert_to_common_currency(currency, price):
    """Convert ``price`` from ``currency`` into US dollars, rounded to 2 decimals.

    Args:
        currency: Currency code of the price; "ARS", "USD", "UYU" and "PEN"
            have a conversion factor.
        price: Amount expressed in ``currency``.

    Returns:
        The converted amount as a ``float``, or ``None`` when ``currency`` or
        ``price`` is missing or the currency has no conversion factor. Inside
        a Spark UDF a ``None`` result becomes a NULL cell instead of an
        exception that would abort the whole job.
    """
    # Fixed conversion factors hard-coded in this notebook (not live rates).
    factors_to_usd = {
        "ARS": 0.0011,
        "USD": 1.0,
        "UYU": 0.026,
        "PEN": 0.27,
    }
    if currency is None or price is None:
        return None
    factor = factors_to_usd.get(currency)
    if factor is None:
        return None
    # Always compute in float so the returned value matches the declared
    # FloatType, including when the price arrives as an int or Decimal.
    return round(factor * float(price), 2)


# Rebind the name to the Spark UDF wrapper; later cells call it on columns.
# NOTE(review): FloatType is single precision, so large prices may not keep
# cent-level precision; DoubleType would, but changes the output column type.
convert_to_common_currency = F.udf(convert_to_common_currency, FloatType())


In [6]:
# Connection settings for the source SQL Server database.
server_name = "mssql"
port = "1433"
database_name = "Data"
url = f"jdbc:sqlserver://{server_name}:{port};databaseName={database_name}"

table_name = "ARProperties"
username = "SA"
# Prefer a password supplied through the MSSQL_SA_PASSWORD environment
# variable so the secret does not have to live in the notebook. The literal is
# only a fallback that keeps the previous behavior when the variable is unset.
password = os.environ.get("MSSQL_SA_PASSWORD", "YourStrongPassword123")

# Load the source table through Spark's JDBC data source.
# NOTE(review): encrypt=false disables TLS on the connection - confirm this is
# only used against a local/dev database.
df = (
    spark.read
    .format("jdbc")
    .option("url", url)
    .option("dbtable", table_name)
    .option("user", username)
    .option("password", password)
    .option("encrypt", "false")
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
    .load()
)

In [7]:
# Keep only rows that are apartments AND offered for sale.
df = df.filter(
    (col("property_type") == "Apartment") & (col("operation_type") == "Sale")
)

In [8]:
# Discard rows whose currency is null; they cannot be converted.
df = df.dropna(subset=["currency"])

In [9]:
# Add a column holding each row's price after passing (currency, price)
# through the conversion UDF.
converted_price = convert_to_common_currency(df["currency"], df["price"])
df = df.withColumn("common_currency_price", converted_price)

In [10]:
# Narrow the frame to the location columns plus the converted price.
df = df.select("country", "province", "city", "common_currency_price")

In [11]:
# Where city is null, use the province value in its place; otherwise keep city.
city_or_province = when(col("city").isNotNull(), col("city")).otherwise(col("province"))
df = df.withColumn("city", city_or_province)


In [12]:
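# NOTE: disabled aggregation draft, kept for reference only. It reads from
# `converted_price_df`, which is not defined in any cell shown here, and it
# filters on `property_type`, a column the earlier select no longer keeps.
# aggregated = converted_price_df.filter(df["property_type"] == "Apartment") \
#     .groupBy("country", "province","city") \
#     .agg(F.avg("common_currency_price").alias("avg_apartment_price_usd")) \
#     .withColumn("avg_apartment_price_usd", F.round("avg_apartment_price_usd", 2))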

In [13]:
# Print the first 20 rows as a quick sanity check before writing.
df.show(n=20)

+---------+---------------+------------+---------------------+
|  country|       province|        city|common_currency_price|
+---------+---------------+------------+---------------------+
|Argentina|Capital Federal|    Recoleta|             310000.0|
|Argentina|Capital Federal|   Caballito|             315000.0|
|Argentina|Capital Federal|   Caballito|             158500.0|
|Argentina|Capital Federal|   Chacarita|             240000.0|
|Argentina|Capital Federal|  Colegiales|             245000.0|
|Argentina|Capital Federal|Barrio Norte|             195000.0|
|Argentina|Capital Federal|Barrio Norte|             325000.0|
|Argentina|Capital Federal|Villa Crespo|             199000.0|
|Argentina|Capital Federal|Villa Crespo|             263329.0|
|Argentina|Capital Federal|Villa Crespo|             166600.0|
|Argentina|Capital Federal|        Once|             185000.0|
|Argentina|Capital Federal|      Retiro|             580000.0|
|Argentina|Capital Federal|     Almagro|             21

In [14]:
# Re-point the connection variables at the destination database and table.
# server_name, port, username and password are reused from the read cell.
database_name = "Data_Clean"
table_name = "SaleApartmentUSD"
url = f"jdbc:sqlserver://{server_name}:{port};databaseName={database_name}"

# Write the frame over JDBC; "overwrite" mode replaces any existing table.
(
    df.write
    .format("jdbc")
    .options(
        url=url,
        dbtable=table_name,
        user=username,
        password=password,
        encrypt="false",
        driver="com.microsoft.sqlserver.jdbc.SQLServerDriver",
    )
    .mode("overwrite")
    .save()
)