In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count
import pyspark.pandas as pd
from pyspark.sql import functions as F
import pyspark.sql.types as types



In [2]:
spark = SparkSession.builder \
        .appName("Area surfaces with rooms and property type") \
        .config("spark.jars", "../jdbc/mssql-jdbc-12.6.1.jre8.jar") \
        .getOrCreate()

In [3]:
server_name = "mssql"
port = "1433"
database_name = "Data"
url = f"jdbc:sqlserver://{server_name}:{port};databaseName={database_name}"

table_name = "ARProperties"
username = "SA"
password = "YourStrongPassword123"

df = spark.read \
        .format("jdbc") \
        .option("url", url) \
        .option("dbtable", table_name) \
        .option("user", username) \
        .option("password", password) \
        .option("encrypt", "false") \
        .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
        .load()

In [4]:
df = df.select(df.property_type, df.surface_total, df.surface_covered, df.rooms, df.bedrooms, df.bathrooms)

In [5]:
df = df.filter(~col("property_type").isin("Terrain"))

In [6]:
def fix_rooms(rooms, bathrooms, bedrooms):
    fix = rooms
    if not(None in [rooms, bathrooms, bedrooms]):
        if rooms < bathrooms+bedrooms:
            fix = bathrooms+bedrooms
    
    return fix

def fix_bathrooms(bathrooms):
    if bathrooms == 0 or bathrooms == None:
        return 1

def fix_bedrooms(bedrooms):
    if bedrooms == None:
        return 0

fix_bathrooms = F.udf(fix_bathrooms, types.IntegerType())
fix_rooms = F.udf(fix_rooms, types.IntegerType())

In [7]:
df = df.withColumns({
    "rooms":col("rooms").cast("integer"),
    "bathrooms":col("bathrooms").cast("integer"),
    "bedrooms":col("bedrooms").cast("integer")
})

In [8]:
df = df.withColumn("bathrooms", fix_bathrooms(col("bathrooms")))
df = df.withColumn("rooms", fix_rooms(col("rooms"), col("bathrooms"), col("bedrooms")))

In [9]:
df.show(20)

+-------------+-------------+---------------+-----+--------+---------+
|property_type|surface_total|surface_covered|rooms|bedrooms|bathrooms|
+-------------+-------------+---------------+-----+--------+---------+
|    Apartment|         40.0|           40.0|    2|       1|     NULL|
|       Office|       1300.0|         1300.0| NULL|    NULL|     NULL|
|    Apartment|         NULL|           NULL|    2|    NULL|     NULL|
|    Apartment|         40.0|           40.0|    2|       1|     NULL|
|        Other|      18164.0|        18164.0| NULL|    NULL|        1|
|    Apartment|         40.0|           40.0|    2|       1|     NULL|
|       Office|        728.0|          728.0| NULL|    NULL|     NULL|
|    Apartment|         45.0|           42.0|    2|       1|     NULL|
|       Office|         NULL|         1980.0| NULL|    NULL|     NULL|
|       Office|        728.0|          728.0| NULL|    NULL|     NULL|
|       Office|         NULL|         1980.0| NULL|    NULL|     NULL|
|    A

In [10]:
df.groupBy("property_type") \
           .agg(count(when(col("surface_total").isNull(), True)).alias("null_count"),
                count(when(col("surface_total").isNotNull(), True)).alias("not_null_count")).show()

+-------------------+----------+--------------+
|      property_type|null_count|not_null_count|
+-------------------+----------+--------------+
|          Apartment|     43777|         48028|
|      Village House|       169|           196|
|             Office|      1823|          3721|
|Commercial Premises|      3200|          4710|
|              Other|     67643|          1807|
|             Garage|      1145|           727|
|          Warehouse|       617|           884|
|              House|     23012|         21398|
|Horizontal Property|      4093|          4058|
+-------------------+----------+--------------+
