In [115]:
import os

import pyspark.pandas as pd
import pyspark.sql.types as types
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, mean, stddev
from pyspark.sql.types import FloatType
from pyspark.sql.window import Window

In [116]:
# Build (or reuse) the notebook's Spark session. The SQL Server JDBC driver jar
# is registered through `spark.jars` so the driver class used by the JDBC
# reads/writes in this notebook can be loaded.
spark = (
    SparkSession.builder
    .appName("Salary vs. Price")
    .config("spark.jars", "../jdbc/mssql-jdbc-12.6.1.jre8.jar")
    .getOrCreate()
)

In [117]:
def convert_to_common_currency(currency, price):
    """Convert ``price`` from ``currency`` into US dollars, rounded to 2 decimals.

    The conversion uses a fixed, hard-coded table of rates (see
    ``factors_to_usd`` below); it is not a live exchange rate.

    Args:
        currency: Currency code of ``price``. Supported codes are
            ``"ARS"``, ``"USD"``, ``"UYU"`` and ``"PEN"``.
        price: Amount expressed in ``currency``. Anything ``float()`` accepts
            (``int``, ``float``, ``decimal.Decimal``).

    Returns:
        The amount in USD as a Python ``float`` rounded to two decimals, or
        ``None`` when ``currency`` or ``price`` is ``None`` or the currency
        has no entry in the rate table. Returning ``None`` (instead of
        raising) lets a single bad row become a null that can be filtered
        out, rather than failing the whole job.
    """
    factors_to_usd = {
        "ARS": 0.00111,
        "USD": 1,
        "UYU": 0.026,
        "PEN": 0.27
    }
    if currency is None or price is None:
        return None

    factor = factors_to_usd.get(currency)
    if factor is None:
        # Unknown currency: no rate available, so there is no meaningful result.
        return None

    # float() makes Decimal prices multipliable by the float rates and keeps
    # the return value a Python float, matching the FloatType this function
    # is registered with as a UDF.
    return round(factor * float(price), 2)
# Rebind the name to a Spark UDF wrapping the plain Python function above, so
# from here on `convert_to_common_currency(...)` takes Columns, not values.
# NOTE(review): FloatType is single precision; large prices may lose digits
# after the 2-decimal rounding — consider DoubleType if that matters.
convert_to_common_currency = F.udf(
    f=convert_to_common_currency,
    returnType=FloatType(),
)

In [118]:
def spark_read_from_db(table_name, database_name="Data"):
    """Load a SQL Server table into a Spark DataFrame over JDBC.

    Connection settings are read from the environment so credentials do not
    have to live in the notebook. Each falls back to the value that was
    previously hard-coded here when the variable is unset:

    * ``MSSQL_HOST``     (default ``"mssql"``)
    * ``MSSQL_PORT``     (default ``"1433"``)
    * ``MSSQL_USER``     (default ``"SA"``)
    * ``MSSQL_PASSWORD`` (default: the original development password)

    Uses the module-level ``spark`` session, which must already exist.

    Args:
        table_name: Name of the table to read (passed as the JDBC ``dbtable``
            option).
        database_name: Database to connect to. Defaults to ``"Data"``.

    Returns:
        The DataFrame produced by the JDBC reader for ``table_name``.
    """
    server_name = os.environ.get("MSSQL_HOST", "mssql")
    port = os.environ.get("MSSQL_PORT", "1433")
    username = os.environ.get("MSSQL_USER", "SA")
    # Development-only fallback; set MSSQL_PASSWORD outside local sandboxes.
    password = os.environ.get("MSSQL_PASSWORD", "YourStrongPassword123")

    url = f"jdbc:sqlserver://{server_name}:{port};databaseName={database_name}"
    reader = (
        spark.read
        .format("jdbc")
        .option("url", url)
        .option("dbtable", table_name)
        .option("user", username)
        .option("password", password)
        # Transport encryption is explicitly turned off for this connection.
        .option("encrypt", "false")
        .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
    )
    return reader.load()

In [119]:
# Read the three source tables.
properties = spark_read_from_db("ARProperties")
clients = spark_read_from_db("ARClients")
properties_clients = spark_read_from_db("PropertiesClients")

# PropertiesClients is the link table: it references a property through
# `id_prop` and a client through `id_client`.
join_condition_properties = properties["id"] == properties_clients["id_prop"]
join_condition_clients = clients["id"] == properties_clients["id_client"]

# Attach client data, then property data, to each link row. The `id` columns
# are dropped afterwards; `id_prop` / `id_client` remain as the keys.
columns_to_drop = ["id"]
joined_df = (
    properties_clients
    .join(clients, join_condition_clients)
    .join(properties, join_condition_properties)
    .drop(*columns_to_drop)
)

In [120]:
# Preview the first rows of the joined frame.
# NOTE(review): this frame carries personal columns (first_name, last_name,
# email, address, date_of_birth); clear or trim this output before sharing.
joined_df.show(n=10)

+-------+---------+----------------+---------+-------------+--------------------+--------------------+-----------------+--------------------+------------------+------+---------+----------+----------+----------+--------------+--------------+---------+--------------------+--------------------+--------+------+-----+--------+---------+-------------+---------------+--------+--------+------------+--------------------+-------------+--------------+
|id_prop|id_client|      first_name|last_name|date_of_birth|               email|             address|annual_income_usd|                 job|number_of_children|   sex|education|start_date|  end_date|created_on|      latitude|     longitude|  country|            province|                city|district|estate|rooms|bedrooms|bathrooms|surface_total|surface_covered|   price|currency|price_period|               title|property_type|operation_type|
+-------+---------+----------------+---------+-------------+--------------------+--------------------+--------

In [121]:
# Keep rows whose operation_type is "Sale" and whose property_type is not
# "Terrain".
is_sale = col("operation_type").isin("Sale")
is_terrain = col("property_type").isin("Terrain")
joined_df = joined_df.filter(is_sale & ~is_terrain)

In [122]:
# Discard rows missing any of the fields needed for the price conversion.
required_columns = ["currency", "price", "property_type"]
df = joined_df.dropna(subset=required_columns)


In [123]:
# Add the price expressed in the common currency via the UDF, then keep only
# the rows for which a converted price is available.
df = (
    df.withColumn(
        "common_currency_price",
        convert_to_common_currency(col("currency"), col("price")),
    )
    .where(col("common_currency_price").isNotNull())
)

In [124]:
# "months" = converted price divided by (annual income / (12 * 35)).
# NOTE(review): 12 turns the annual figure into a monthly one; the extra
# factor of 35 is not explained anywhere in this notebook — confirm its
# meaning (e.g. an income scaling or a savings-rate assumption).
income_divisor = 12 * 35
scaled_income = col("annual_income_usd") / income_divisor
df = df.withColumn("months", col("common_currency_price") / scaled_income)

In [125]:
# Show the columns that feed into, and result from, the "months" calculation.
preview_columns = [
    "id_client",
    "job",
    "city",
    "common_currency_price",
    "annual_income_usd",
    "months",
]
df.select(*preview_columns).show(20)

+---------+--------------------+------------------+---------------------+-----------------+------------------+
|id_client|                 job|              city|common_currency_price|annual_income_usd|            months|
+---------+--------------------+------------------+---------------------+-----------------+------------------+
|   132792|Education officer...|          La Plata|              48000.0|           723000|27.883817427385893|
|   653630|      Energy manager|           Pinamar|              69000.0|           577000| 50.22530329289428|
|   704166|      Science writer|   Lomas de Zamora|             170000.0|           595000|             120.0|
|   321052|Publishing rights...|             Tigre|             125000.0|           381000| 137.7952755905512|
|   398460|    Network engineer|         Ituzaingó|             135000.0|           932000| 60.83690987124463|
|   527947|           Economist|             Morón|              75000.0|           383000| 82.24543080939948|
|

In [None]:
# Persist the result to the "HowManySalaries" table of the "Data_Clean"
# database, replacing any existing table contents (mode "overwrite").
#
# The connection settings are defined here because the ones inside
# `spark_read_from_db` are local to that function and are not visible at
# notebook scope. Each setting can be overridden through the environment and
# otherwise falls back to the value used for reading.
server_name = os.environ.get("MSSQL_HOST", "mssql")
port = os.environ.get("MSSQL_PORT", "1433")
username = os.environ.get("MSSQL_USER", "SA")
# Development-only fallback; set MSSQL_PASSWORD outside local sandboxes.
password = os.environ.get("MSSQL_PASSWORD", "YourStrongPassword123")

database_name = "Data_Clean"
table_name = "HowManySalaries"
url = f"jdbc:sqlserver://{server_name}:{port};databaseName={database_name}"

(
    df.write
    .format("jdbc")
    .option("url", url)
    .option("dbtable", table_name)
    .option("user", username)
    .option("password", password)
    .option("encrypt", "false")
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
    .mode("overwrite")
    .save()
)