In [1]:
import getpass
import os

import pyspark.pandas as pd
import pyspark.sql.types as types
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, when, count
from pyspark.sql.window import Window



In [2]:
# Build (or reuse) the Spark session for this notebook; the SQL Server JDBC
# driver jar is passed through the `spark.jars` setting.
spark = (
    SparkSession.builder
    .appName("Uncovered Area")
    .config("spark.jars", "../jdbc/mssql-jdbc-12.6.1.jre8.jar")
    .getOrCreate()
)

In [3]:
# Connection target for the source SQL Server database.
server_name = "mssql"
port = "1433"
database_name = "Data"
url = f"jdbc:sqlserver://{server_name}:{port};databaseName={database_name}"

table_name = "ARProperties"

# Credentials are taken from the environment instead of being stored in the
# notebook. The user falls back to "SA"; when MSSQL_PASSWORD is not set the
# password is asked for interactively, so it never ends up in the saved file.
username = os.environ.get("MSSQL_USER", "SA")
password = os.environ.get("MSSQL_PASSWORD") or getpass.getpass(
    f"SQL Server password for {username}: "
)

# Load the source table over JDBC. These names (server_name, port, username,
# password) are reused by the write cell at the end of the notebook.
df = (
    spark.read
    .format("jdbc")
    .option("url", url)
    .option("dbtable", table_name)
    .option("user", username)
    .option("password", password)
    .option("encrypt", "false")
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
    .load()
)

In [4]:
# Keep only rows whose property_type is something other than "Terrain".
df = df.filter(col("property_type") != "Terrain")

In [5]:
# Cast both surface columns to integers so they can be compared and subtracted.
df = (
    df
    .withColumn("surface_total", col("surface_total").cast("int"))
    .withColumn("surface_covered", col("surface_covered").cast("int"))
)

In [6]:
def calculate_uncovered_area(surface_total, surface_covered):
    """Return the uncovered surface, or None when it cannot be derived.

    Args:
        surface_total: Total surface value, or None when missing.
        surface_covered: Covered surface value, or None when missing.

    Returns:
        ``surface_total - surface_covered`` when both values are present and
        the total is strictly greater than the covered part; otherwise None.
    """
    has_missing_value = surface_total is None or surface_covered is None
    # The comparison is only evaluated when both values are present.
    if has_missing_value or surface_total <= surface_covered:
        return None
    return surface_total - surface_covered
    
# Wrap the Python function as a Spark UDF with an explicit integer return type.
# Without a returnType, F.udf defaults to StringType, which would make the
# resulting column (and the table written from it) hold strings, not numbers.
calculate_uncovered_area_udf = F.udf(calculate_uncovered_area, types.IntegerType())

In [7]:
# Add the uncovered-area column, then keep only rows where a value was produced
# (the UDF returns None for missing inputs or when total <= covered).
df = (
    df
    .withColumn(
        "surface_area_uncovered",
        calculate_uncovered_area_udf(col("surface_total"), col("surface_covered")),
    )
    .filter(col("surface_area_uncovered").isNotNull())
)

In [8]:
# Preview up to 20 rows of the columns involved in the calculation.
df.select(
    "id",
    "property_type",
    "surface_total",
    "surface_covered",
    "surface_area_uncovered",
).show(20)

+------+-------------------+-------------+---------------+----------------------+
|    id|      property_type|surface_total|surface_covered|surface_area_uncovered|
+------+-------------------+-------------+---------------+----------------------+
|314458|          Apartment|          201|            166|                    35|
|314459|          Apartment|          140|            120|                    20|
| 60530|          Warehouse|         1378|            300|                  1078|
| 62634|              House|          480|            219|                   261|
| 62635|          Apartment|           92|             88|                     4|
|811787|              House|          208|             75|                   133|
|312388|Horizontal Property|          193|            127|                    66|
| 62636|          Apartment|          193|            164|                    29|
|312389|              House|          660|            205|                   455|
| 62637|        

In [9]:
# Retarget the JDBC URL at the destination database and table.
database_name = "Data_Clean"
table_name = "Uncovered_Area"
url = f"jdbc:sqlserver://{server_name}:{port};databaseName={database_name}"

# Write the result over JDBC with the same credentials used for the read;
# "overwrite" mode replaces any existing contents of the target table.
(
    df.write
    .format("jdbc")
    .option("url", url)
    .option("dbtable", table_name)
    .option("user", username)
    .option("password", password)
    .option("encrypt", "false")
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
    .mode("overwrite")
    .save()
)