In [24]:
import os

import pyspark.pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, when
from pyspark.sql.types import FloatType

In [25]:
# Build (or reuse) the Spark session. The MSSQL JDBC driver jar is registered
# through `spark.jars` so the JDBC read/write cells can load the driver class.
spark = (
    SparkSession.builder
    .appName("Sale Apartment by Country, Province, City")
    .config("spark.jars", "../jdbc/mssql-jdbc-12.6.1.jre8.jar")
    .getOrCreate()
)

In [26]:
def convert_to_common_currency(currency, price):
    """Convert ``price`` expressed in ``currency`` into US dollars.

    Args:
        currency: Currency code; supported codes are "ARS", "USD", "UYU"
            and "PEN".
        price: Amount in ``currency``. May be an int, float, Decimal or None.

    Returns:
        The USD amount as a ``float`` rounded to 2 decimals, or ``None`` when
        ``price`` is None or ``currency`` is None/unsupported.
    """
    # Fixed conversion factors hard-coded in this notebook (not live rates).
    factors_to_usd = {
        "ARS": 0.0011,
        "USD": 1,
        "UYU": 0.026,
        "PEN": 0.27,
    }
    factor = factors_to_usd.get(currency)
    # A single bad row must not raise: inside a Spark UDF an exception aborts
    # the whole job, so missing inputs map to a null result instead.
    if factor is None or price is None:
        return None
    # Coerce to float so Decimal prices multiply cleanly and the result is
    # always a Python float, which is what a FloatType UDF expects.
    return round(float(factor) * float(price), 2)
# Rebind the name to a Spark UDF wrapping the Python function above so it can
# be applied to DataFrame columns; the UDF's declared return type is FloatType.
convert_to_common_currency = F.udf(
    convert_to_common_currency,
    returnType=FloatType(),
)


In [27]:
# Connection settings for the source SQL Server instance. Each value can be
# overridden through an environment variable; the fallbacks are the values
# this notebook previously hard-coded, so behaviour is unchanged when the
# variables are not set.
server_name = os.environ.get("MSSQL_HOST", "mssql")
port = os.environ.get("MSSQL_PORT", "1433")
database_name = "Data"
url = f"jdbc:sqlserver://{server_name}:{port};databaseName={database_name}"

table_name = "ARProperties"
username = os.environ.get("MSSQL_USER", "SA")
# NOTE(review): the literal fallback is kept only for backward compatibility.
# Prefer exporting MSSQL_PASSWORD and removing the default before sharing.
password = os.environ.get("MSSQL_PASSWORD", "YourStrongPassword123")

# Load the raw listings table over JDBC (encryption is disabled for this
# connection via the `encrypt` option).
df = (
    spark.read
    .format("jdbc")
    .option("url", url)
    .option("dbtable", table_name)
    .option("user", username)
    .option("password", password)
    .option("encrypt", "false")
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
    .load()
)

In [28]:
# Keep only listings that are apartments offered for sale.
df = df.filter(
    (col("property_type") == "Apartment") & (col("operation_type") == "Sale")
)

In [29]:
# Discard rows whose currency is null before the price conversion step.
df = df.dropna(subset=["currency"])

In [30]:
# Add the listing price converted by the currency-conversion UDF as a new
# "common_currency_price" column.
df = df.withColumn(
    "common_currency_price",
    convert_to_common_currency(df["currency"], df["price"]),
)

In [31]:
# Fill a missing city with the row's province; rows that already have a city
# keep it (coalesce returns the first non-null argument).
df = df.withColumn("city", F.coalesce(df["city"], df["province"]))


In [32]:
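# NOTE: disabled draft of an average-USD-price aggregation per country/province/city.
# It reads from `converted_price_df`, which is not defined in the cells shown here
# (TODO: confirm); point it at `df` or delete this cell before re-enabling.
# aggregated = converted_price_df.filter(df["property_type"] == "Apartment") \
#     .groupBy("country", "province","city") \
#     .agg(F.avg("common_currency_price").alias("avg_apartment_price_usd")) \
#     .withColumn("avg_apartment_price_usd", F.round("avg_apartment_price_usd", 2))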

In [33]:
# Preview the first 20 rows of the transformed DataFrame.
df.show(n=20)

+------+----------+----------+----------+--------------+--------------+---------+--------------------+---------------+--------------------+------+-----+--------+---------+-------------+---------------+--------+--------+------------+--------------------+-------------+--------------+---------------------+
|    id|start_date|  end_date|created_on|      latitude|     longitude|  country|            province|           city|            district|estate|rooms|bedrooms|bathrooms|surface_total|surface_covered|   price|currency|price_period|               title|property_type|operation_type|common_currency_price|
+------+----------+----------+----------+--------------+--------------+---------+--------------------+---------------+--------------------+------+-----+--------+---------+-------------+---------------+--------+--------+------------+--------------------+-------------+--------------+---------------------+
|310341|2020-04-27|2020-05-10|2020-04-27|   -34.7536902|   -58.4075223|Argentina|Bs.A

In [34]:
# Write the transformed listings to the SaleApartmentUSD table of the
# Data_Clean database over JDBC, using save mode "overwrite".
# Note: this cell rebinds `database_name`, `table_name` and `url`, which were
# previously pointing at the source database/table.
database_name = "Data_Clean"
table_name = "SaleApartmentUSD"
url = f"jdbc:sqlserver://{server_name}:{port};databaseName={database_name}"
(
    df.write
    .format("jdbc")
    .mode("overwrite")
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
    .option("url", url)
    .option("dbtable", table_name)
    .option("user", username)
    .option("password", password)
    .option("encrypt", "false")
    .save()
)