In [32]:
import os

import pyspark.pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, when
from pyspark.sql.types import FloatType

In [33]:
# Create (or reuse) the Spark session for this notebook. The SQL Server JDBC
# driver jar is registered through the `spark.jars` setting.
spark = (
    SparkSession.builder
    .appName("Sample PySpark SQL Server Connection")
    .config("spark.jars", "../jdbc/mssql-jdbc-12.6.1.jre8.jar")
    .getOrCreate()
)

In [34]:
def convert_to_common_currency(currency, price):
    """Convert ``price`` from ``currency`` into US dollars.

    Args:
        currency: Currency code; one of "ARS", "USD", "UYU" or "PEN".
        price: Amount expressed in ``currency``. Any value accepted by
            ``float()`` works (int, float, ``decimal.Decimal``).

    Returns:
        The USD amount as a Python ``float`` rounded to two decimals, or
        ``None`` when either argument is ``None`` or when ``currency`` has
        no conversion factor in the table below.
    """
    # Fixed conversion factors to USD (hard-coded values, not live rates).
    factors_to_usd = {
        "ARS": 0.0011,
        "USD": 1,
        "UYU": 0.026,
        "PEN": 0.27,
    }
    # Missing inputs yield a missing result instead of raising TypeError.
    if currency is None or price is None:
        return None
    factor = factors_to_usd.get(currency)
    # Unknown currency: no factor available, so no price can be computed.
    if factor is None:
        return None
    # Coerce to float so the result is always a Python float: an int price
    # with the integer USD factor would otherwise produce an int, and a
    # Decimal price cannot be multiplied by a float factor at all.
    return round(factor * float(price), 2)
# Rebind the name to a Spark UDF wrapping the plain Python function, so it can
# be applied to DataFrame columns; the declared result type is FloatType.
convert_to_common_currency = F.udf(
    f=convert_to_common_currency,
    returnType=FloatType(),
)


In [35]:
# Connection settings for the source SQL Server database.
# Host, port and credentials can be overridden through environment variables
# so that real secrets never have to be written into the notebook; the
# fallbacks are the values this notebook originally hard-coded.
server_name = os.environ.get("MSSQL_HOST", "mssql")
port = os.environ.get("MSSQL_PORT", "1433")
database_name = "Data"
url = f"jdbc:sqlserver://{server_name}:{port};databaseName={database_name}"

table_name = "ARProperties"
username = os.environ.get("MSSQL_USER", "SA")
# NOTE(review): the fallback password is kept only so the notebook still runs
# unchanged; set MSSQL_PASSWORD for any non-throwaway database.
password = os.environ.get("MSSQL_PASSWORD", "YourStrongPassword123")

# Load the whole source table over JDBC (connection encryption is disabled).
df = (
    spark.read
    .format("jdbc")
    .option("url", url)
    .option("dbtable", table_name)
    .option("user", username)
    .option("password", password)
    .option("encrypt", "false")
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
    .load()
)

In [36]:
# Keep only rows whose property type is "Apartment" and operation is "Sale".
df = df.filter(
    (df["property_type"] == "Apartment") & (df["operation_type"] == "Sale")
)

In [37]:
# Discard rows that have no currency code.
df = df.dropna(subset=["currency"])

In [38]:
# Add the price converted to USD by applying the conversion UDF row by row.
df = df.withColumn(
    "common_currency_price",
    convert_to_common_currency(F.col("currency"), F.col("price")),
)

In [39]:
# Project down to the location columns and the converted price.
df = df.select("country", "province", "city", "common_currency_price")

In [40]:
# Where the city is missing, fall back to the province value.
df = df.withColumn("city", F.coalesce(df["city"], df["province"]))


In [41]:
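# NOTE: disabled draft of a per-location average-price aggregation.
# It does not run as written: `converted_price_df` is not defined in the
# visible cells, and `property_type` is no longer a column after the earlier
# select. Adapt both before re-enabling.
# aggregated = converted_price_df.filter(df["property_type"] == "Apartment") \
#     .groupBy("country", "province","city") \
#     .agg(F.avg("common_currency_price").alias("avg_apartment_price_usd")) \
#     .withColumn("avg_apartment_price_usd", F.round("avg_apartment_price_usd", 2))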

In [42]:
# Preview the first 20 rows of the transformed data.
df.show(n=20)

+---------+--------------------+---------------+---------------------+
|  country|            province|           city|common_currency_price|
+---------+--------------------+---------------+---------------------+
|Argentina|Bs.As. G.B.A. Zon...|Lomas de Zamora|             135000.0|
|Argentina|Bs.As. G.B.A. Zon...|          Lanús|              94000.0|
|Argentina|Bs.As. G.B.A. Zon...|          Lanús|              65000.0|
|Argentina|Bs.As. G.B.A. Zon...|          Lanús|              50000.0|
|Argentina|Buenos Aires Cost...|     Mar de Ajó|              60000.0|
|Argentina|Buenos Aires Cost...|  Mar del Plata|              75000.0|
|Argentina|Bs.As. G.B.A. Zon...|Almirante Brown|             120000.0|
|Argentina|Bs.As. G.B.A. Zon...|Lomas de Zamora|              90000.0|
|Argentina|Bs.As. G.B.A. Zon...|Lomas de Zamora|             100000.0|
|Argentina|Bs.As. G.B.A. Zon...|Lomas de Zamora|              83000.0|
|Argentina|Bs.As. G.B.A. Zon...|Lomas de Zamora|             140000.0|
|Argen

In [43]:
# Point the connection at the destination database and table; host, port and
# credentials are reused from the variables set for the read step.
database_name = "Data_Clean"
table_name = "SaleApartmentUSD"
url = f"jdbc:sqlserver://{server_name}:{port};databaseName={database_name}"

# Write the result over JDBC, replacing any existing contents of the table.
(
    df.write
    .format("jdbc")
    .options(
        url=url,
        dbtable=table_name,
        user=username,
        password=password,
        encrypt="false",
        driver="com.microsoft.sqlserver.jdbc.SQLServerDriver",
    )
    .mode("overwrite")
    .save()
)