Data Preparation

In [2]:
from pyspark.sql import SparkSession

In [4]:
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("CSV Import")
    .getOrCreate()
)

file_path = "C:/Users/afili/Desktop/Uni/Mestrado/AASE/apartments_for_rent_classified_100K.csv"

# Semicolon-separated file with a header row; column types are inferred by Spark.
df = spark.read.csv(file_path, sep=";", header=True, inferSchema=True)

# Values that should be treated as NaN.
# NOTE(review): this list is not passed to the reader above - confirm whether it is still needed.
na_values = ["NA", "null", "", " "]

from pyspark.sql.functions import col

# Count the rows with a missing price before cleaning.
null_count = df.where(col('price').isNull()).count()
print(f"Número de valores nulos na coluna 'price': {null_count}")

# Drop the rows where 'price' is null.
df = df.where(col('price').isNotNull())

# Count again to confirm that no null prices remain.
null_count = df.where(col('price').isNull()).count()
print(f"Número de valores nulos na coluna 'price': {null_count}")

print(f"Número de linhas: {df.count()}")

Número de valores nulos na coluna 'price': 7
Número de valores nulos na coluna 'price': 0
Número de linhas: 99993


Demo da tabela e Mudança de tipos de colunas

In [5]:
# Print a preview of the rows that remain after dropping null prices.
df.show()

+----------+--------------------+--------------------+--------------------+---------+---------+--------+--------+---+---------+------------+-----+-------------+----------+-----------+--------------------+-------------+-----+--------+---------+---------+----------+
|        id|            category|               title|                body|amenities|bathrooms|bedrooms|currency|fee|has_photo|pets_allowed|price|price_display|price_type|square_feet|             address|     cityname|state|latitude|longitude|   source|      time|
+----------+--------------------+--------------------+--------------------+---------+---------+--------+--------+---+---------+------------+-----+-------------+----------+-----------+--------------------+-------------+-----+--------+---------+---------+----------+
|5668640009|housing/rent/apar...|One BR 507 & 509 ...|This unit is loca...|     null|        1|       1|     USD| No|Thumbnail|        Cats| 2195|       $2,195|   Monthly|        542|  507  509 Esplanade|R

In [6]:
# Print the column names and the types Spark inferred when reading the CSV.
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- category: string (nullable = true)
 |-- title: string (nullable = true)
 |-- body: string (nullable = true)
 |-- amenities: string (nullable = true)
 |-- bathrooms: string (nullable = true)
 |-- bedrooms: string (nullable = true)
 |-- currency: string (nullable = true)
 |-- fee: string (nullable = true)
 |-- has_photo: string (nullable = true)
 |-- pets_allowed: string (nullable = true)
 |-- price: string (nullable = true)
 |-- price_display: string (nullable = true)
 |-- price_type: string (nullable = true)
 |-- square_feet: integer (nullable = true)
 |-- address: string (nullable = true)
 |-- cityname: string (nullable = true)
 |-- state: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- source: string (nullable = true)
 |-- time: integer (nullable = true)



In [7]:
from pyspark.sql.types import FloatType

# Columns to convert to float (replace with the column names you need).
columns_to_convert = ["bathrooms", "bedrooms", "latitude", "longitude"]

# Single projection: cast the listed columns in place and pass every other column through
# unchanged, so the column order stays the same.
df = df.select(
    *[
        col(name).cast(FloatType()).alias(name) if name in columns_to_convert else col(name)
        for name in df.columns
    ]
)

df.printSchema()


root
 |-- id: long (nullable = true)
 |-- category: string (nullable = true)
 |-- title: string (nullable = true)
 |-- body: string (nullable = true)
 |-- amenities: string (nullable = true)
 |-- bathrooms: float (nullable = true)
 |-- bedrooms: float (nullable = true)
 |-- currency: string (nullable = true)
 |-- fee: string (nullable = true)
 |-- has_photo: string (nullable = true)
 |-- pets_allowed: string (nullable = true)
 |-- price: string (nullable = true)
 |-- price_display: string (nullable = true)
 |-- price_type: string (nullable = true)
 |-- square_feet: integer (nullable = true)
 |-- address: string (nullable = true)
 |-- cityname: string (nullable = true)
 |-- state: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- source: string (nullable = true)
 |-- time: integer (nullable = true)



In [8]:
# Keep one row per listing id, restricted to apartment listings that are priced per month.
df = (
    df.dropDuplicates(['id'])
    .where(col('category') == 'housing/rent/apartment')
    .where(col('price_type') == 'Monthly')
)

In [9]:
# Row count after de-duplicating by id and keeping only monthly apartment listings.
df.count()

99844

In [10]:
from pyspark.sql.functions import col, when
# Cities that keep their own label; every other city is grouped under 'Other'.
cities_to_keep = ['Dallas', 'Denver', 'Los Angeles', 'Las Vegas', 'Arlington', 'Atlanta', 'Charlotte']

is_kept_city = col('cityname').isin(cities_to_keep)
df = df.withColumn(
    'cityname_transformed',
    when(is_kept_city, col('cityname')).otherwise('Other'),
)

df.show()


+----------+--------------------+--------------------+--------------------+--------------------+---------+--------+--------+---+---------+------------+-----+-------------+----------+-----------+-------+--------------+-----+--------+---------+------------+----------+--------------------+
|        id|            category|               title|                body|           amenities|bathrooms|bedrooms|currency|fee|has_photo|pets_allowed|price|price_display|price_type|square_feet|address|      cityname|state|latitude|longitude|      source|      time|cityname_transformed|
+----------+--------------------+--------------------+--------------------+--------------------+---------+--------+--------+---+---------+------------+-----+-------------+----------+-----------+-------+--------------+-----+--------+---------+------------+----------+--------------------+
|5121046702|housing/rent/apar...|Apartment in move...|If you are lookin...|                Pool|      1.0|     2.0|     USD| No|      Ye

In [11]:
from pyspark.sql.functions import when, col

# Collapse the pet policy into Yes/No: the values listed below map to "Yes", anything else to "No".
pet_friendly_values = ["Cats", "Dogs", "Cats,Dogs", "Cats,Dogs,None"]
allows_pets = col("pets_allowed").isin(pet_friendly_values)

df = df.withColumn("pets_allowed_transformed", when(allows_pets, "Yes").otherwise("No"))

df.show(20)


+----------+--------------------+--------------------+--------------------+--------------------+---------+--------+--------+---+---------+------------+-----+-------------+----------+-----------+-------+--------------+-----+--------+---------+------------+----------+--------------------+------------------------+
|        id|            category|               title|                body|           amenities|bathrooms|bedrooms|currency|fee|has_photo|pets_allowed|price|price_display|price_type|square_feet|address|      cityname|state|latitude|longitude|      source|      time|cityname_transformed|pets_allowed_transformed|
+----------+--------------------+--------------------+--------------------+--------------------+---------+--------+--------+---+---------+------------+-----+-------------+----------+-----------+-------+--------------+-----+--------+---------+------------+----------+--------------------+------------------------+
|5121046702|housing/rent/apar...|Apartment in move...|If you 

In [12]:
# Collapse the photo indicator into Yes/No: "Yes" and "Thumbnail" both map to "Yes".
has_any_photo = col("has_photo").isin(["Yes", "Thumbnail"])
df = df.withColumn("has_photo_transformed", when(has_any_photo, "Yes").otherwise("No"))

In [13]:
# Missing room counts become 0; rows without a price are discarded.
df = (
    df.na.fill({"bathrooms": 0, "bedrooms": 0})
    .na.drop(subset=['price'])
)

In [14]:
# List the distinct values of each room-count column, bedrooms first.
for room_column in ("bedrooms", "bathrooms"):
    df.select(room_column).distinct().show()

+--------+
|bedrooms|
+--------+
|     9.0|
|     5.0|
|     7.0|
|     2.0|
|     3.0|
|     1.0|
|     6.0|
|     8.0|
|     4.0|
|     0.0|
+--------+

+---------+
|bathrooms|
+---------+
|      9.0|
|      5.0|
|      5.5|
|      2.5|
|      8.5|
|      7.0|
|      2.0|
|      3.0|
|      7.5|
|      1.5|
|      3.5|
|      1.0|
|      6.0|
|      8.0|
|      4.5|
|      4.0|
|      0.0|
+---------+



In [15]:
# Keep only the columns needed to build the per-amenity indicator features.
df_amenities = df.select('id', 'amenities', 'price')

# show() prints the table itself and returns None, so it is called directly;
# wrapping it in display() only adds a stray "None" to the cell output.
df_amenities.show()

+----------+--------------------+-----+
|        id|           amenities|price|
+----------+--------------------+-----+
|5121046540|Cable or Satellit...| 1693|
|5121046702|                Pool|  910|
|5121046904|Cable or Satellit...|  999|
|5121047089|Fireplace,Patio/Deck|  808|
|5121047938|Fireplace,Patio/Deck| 1003|
|5121048112|Clubhouse,Gym,Int...| 1225|
|5121048460|Doorman,Gym,Parki...|11250|
|5121048804|AC,Clubhouse,Pool...|  800|
|5121048827|Cable or Satellit...| 1388|
|5121049457|                Pool|  935|
|5121050896|                Pool|  875|
|5121051413|Fireplace,Patio/Deck| 1163|
|5121051600|                Pool|  925|
|5121052248|                null| 2395|
|5121052344|   Fireplace,Storage| 1600|
|5121053008|    Gym,Parking,Pool| 2493|
|5121054112|Cable or Satellit...|  959|
|5121054159|Gym,Patio/Deck,Po...| 1587|
|5121054387|    Gym,Pool,Storage|  867|
|5121055139|Clubhouse,Gym,Poo...| 1425|
+----------+--------------------+-----+
only showing top 20 rows



None

In [16]:
from pyspark.sql import functions as F
from pyspark.sql.functions import col

# Tag listings that have no amenities with the placeholder 'None'.
# Both the literal string 'null' and a real SQL NULL are handled: comparing NULL with ==
# yields NULL, so a real NULL would otherwise fall through to otherwise() and stay NULL,
# and explode() produces no output row for a NULL input.
has_no_amenities = F.col('amenities').isNull() | (F.col('amenities') == 'null')
df_amenities = df_amenities.withColumn(
    'amenities',
    F.when(has_no_amenities, F.lit('None')).otherwise(F.col('amenities')),
)

In [17]:
# One row per (listing, amenity): split the comma-separated list and explode it.
df_amenities = df_amenities.withColumn('amenities', F.explode(F.split(F.col('amenities'), ',')))

# show() prints the table and returns None; display(None) would only add a stray "None".
df_amenities.show()

+----------+------------------+-----+
|        id|         amenities|price|
+----------+------------------+-----+
|5121046540|Cable or Satellite| 1693|
|5121046540|          Elevator| 1693|
|5121046540|               Gym| 1693|
|5121046540|   Internet Access| 1693|
|5121046540|           Parking| 1693|
|5121046540|                TV| 1693|
|5121046702|              Pool|  910|
|5121046904|Cable or Satellite|  999|
|5121046904|        Dishwasher|  999|
|5121046904|             Gated|  999|
|5121046904|               Gym|  999|
|5121046904|           Parking|  999|
|5121046904|        Patio/Deck|  999|
|5121046904|        Playground|  999|
|5121046904|              Pool|  999|
|5121046904|      Refrigerator|  999|
|5121046904|           Storage|  999|
|5121046904|            Tennis|  999|
|5121046904|      Washer Dryer|  999|
|5121047089|         Fireplace|  808|
+----------+------------------+-----+
only showing top 20 rows



None

In [18]:
# One-hot encode the amenities: one column per distinct amenity value, 1 when the listing has
# it and 0 otherwise (pivot leaves NULL for missing combinations, which na.fill(0) replaces).
pivot_amenities = (
    df_amenities
    .groupBy("id", "price")
    .pivot("amenities")
    .agg(F.lit(1))
    .na.fill(0)
)

# 'None' is only the placeholder for "no amenities", not a real feature.
pivot_amenities = pivot_amenities.drop('None')

# show() prints the table and returns None; display(None) would only add a stray "None".
pivot_amenities.show()

+----------+-----+---+-----+----------+------------------+---------+----------+-------+--------+---------+----------------+-----+----+---+-------+---------------+------+-------+----------+----------+----+------------+-------+---+------+----+------------+-----------+
|        id|price| AC|Alarm|Basketball|Cable or Satellite|Clubhouse|Dishwasher|Doorman|Elevator|Fireplace|Garbage Disposal|Gated|Golf|Gym|Hot Tub|Internet Access|Luxury|Parking|Patio/Deck|Playground|Pool|Refrigerator|Storage| TV|Tennis|View|Washer Dryer|Wood Floors|
+----------+-----+---+-----+----------+------------------+---------+----------+-------+--------+---------+----------------+-----+----+---+-------+---------------+------+-------+----------+----------+----+------------+-------+---+------+----+------------+-----------+
|5121046540| 1693|  0|    0|         0|                 1|        0|         0|      0|       1|        0|               0|    0|   0|  1|      0|              1|     0|      1|         0|         0|

None

In [19]:
# Attach the amenity indicator columns to the listings, keeping every row of pivot_amenities.
df_join = df.join(pivot_amenities, ['id', 'price'], "right")

# show() prints the table and returns None; display(None) would only add a stray "None".
df_join.show()

+----------+-----+--------------------+--------------------+--------------------+--------------------+---------+--------+--------+---+---------+------------+-------------+----------+-----------+-------+-----------+-----+--------+---------+------------+----------+--------------------+------------------------+---------------------+---+-----+----------+------------------+---------+----------+-------+--------+---------+----------------+-----+----+---+-------+---------------+------+-------+----------+----------+----+------------+-------+---+------+----+------------+-----------+
|        id|price|            category|               title|                body|           amenities|bathrooms|bedrooms|currency|fee|has_photo|pets_allowed|price_display|price_type|square_feet|address|   cityname|state|latitude|longitude|      source|      time|cityname_transformed|pets_allowed_transformed|has_photo_transformed| AC|Alarm|Basketball|Cable or Satellite|Clubhouse|Dishwasher|Doorman|Elevator|Fireplace|G

None

In [20]:
from pyspark.sql.window import Window

# Split the listings into two price classes around the median position:
# 'B' for the cheaper half and 'A' for the more expensive half.
df_classes = df_join.select('id', 'price')

# Half of the row count; rows ranked at or below this position form class 'B'.
mediana = df_classes.count()/2

# 'price' is a string column, so ordering by it directly sorts lexicographically
# (e.g. "1003" before "808"). Cast it to a number for the ranking, and use 'id' as a
# tie-breaker so that row numbers are deterministic for equal prices.
# Values that cannot be cast become NULL and sort first (they end up in class 'B').
X = Window.orderBy(F.col('price').cast('double'), F.col('id'))

df_classes = df_classes.withColumn('row_num', F.row_number().over(X))

df_classes = df_classes.withColumn(
    "class",
    F.when(F.col("row_num") <= mediana, F.lit('B')).otherwise(F.lit('A'))
).drop("row_num")

In [22]:
# Attach the price class to every listing (left join keeps all rows of df).
df_join = df.join(df_classes, ['id', 'price'], "left")

# Row count of df before it is replaced by the joined result.
display(df.count())

df = df_join.withColumnRenamed('class', 'price_class')

# show() prints the table and returns None; display(None) would only add a stray "None".
df.show()

99844

+----------+-----+--------------------+--------------------+--------------------+--------------------+---------+--------+--------+---+---------+------------+-------------+----------+-----------+-------+--------------+-----+--------+---------+------------+----------+--------------------+------------------------+---------------------+-----------+
|        id|price|            category|               title|                body|           amenities|bathrooms|bedrooms|currency|fee|has_photo|pets_allowed|price_display|price_type|square_feet|address|      cityname|state|latitude|longitude|      source|      time|cityname_transformed|pets_allowed_transformed|has_photo_transformed|price_class|
+----------+-----+--------------------+--------------------+--------------------+--------------------+---------+--------+--------+---+---------+------------+-------------+----------+-----------+-------+--------------+-----+--------+---------+------------+----------+--------------------+-------------------

None

In [23]:
# Add the amenity indicator columns to the classified listings.
df = df.join(pivot_amenities, ['id', 'price'], "right")

# show() prints the table and returns None; display(None) would only add a stray "None".
df.show()

+----------+-----+--------------------+--------------------+--------------------+--------------------+---------+--------+--------+---+---------+------------+-------------+----------+-----------+-------+-----------+-----+--------+---------+------------+----------+--------------------+------------------------+---------------------+-----------+---+-----+----------+------------------+---------+----------+-------+--------+---------+----------------+-----+----+---+-------+---------------+------+-------+----------+----------+----+------------+-------+---+------+----+------------+-----------+
|        id|price|            category|               title|                body|           amenities|bathrooms|bedrooms|currency|fee|has_photo|pets_allowed|price_display|price_type|square_feet|address|   cityname|state|latitude|longitude|      source|      time|cityname_transformed|pets_allowed_transformed|has_photo_transformed|price_class| AC|Alarm|Basketball|Cable or Satellite|Clubhouse|Dishwasher|Door

None

In [None]:
from pyspark.sql.functions import col

# Listing ids to exclude from the final dataset (edit this list with the ids to remove).
ids_to_remove = [5648708029, 5667488036]

is_excluded = col('id').isin(ids_to_remove)
df = df.where(~is_excluded)

In [None]:
# A Spark DataFrame has no to_csv() method (that is the pandas API), so the result is first
# collected to the driver as a pandas DataFrame and then written as a single CSV file.
df.toPandas().to_csv(
    "C:/Users/afili/Desktop/Uni/Mestrado/AASE/apartments_for_rent_100k_final.csv", 
    sep=';', 
    index=False, 
    encoding='utf-8'
)