In [None]:
from pyspark.sql import SparkSession, DataFrame

In [93]:
from pyspark.sql.functions import col
from pyspark.sql import functions as F
from pyspark.storagelevel import StorageLevel

In [None]:
from google.cloud import storage
from google.cloud import bigquery
import pandas as pd

In [None]:
# Abrimos la sesión de Spark que vamos a necesitar...

In [None]:
spark = SparkSession.builder \
    .appName("spark-learning") \
    .getOrCreate()

In [None]:
# Abrimos el storage_client para poder trabajar con GCS

In [11]:
# Crear una instancia del cliente de Google Cloud Storage
storage_client = storage.Client()

# Listar los buckets en el proyecto
buckets = list(storage_client.list_buckets())

In [12]:
buckets 

[<Bucket: airbnb_equifax>,
 <Bucket: dataproc-staging-europe-west4-662485454498-ndsis5lp>,
 <Bucket: dataproc-temp-europe-west4-662485454498-zcn4vjdp>]

In [None]:
bucket_name = "airbnb_equifax"

In [None]:
# Obtener la instancia del bucket
bucket = storage_client.get_bucket(bucket_name)

# Listar objetos en el bucket
blobs = list(bucket.list_blobs())

In [18]:
# Imprimir los paths de GCS
for blob in blobs:
    print(f'gs://{bucket_name}/{blob.name}')

gs://airbnb_equifax/calendar.csv
gs://airbnb_equifax/calendar.csv.gz
gs://airbnb_equifax/listings.csv
gs://airbnb_equifax/neighbourhoods.csv
gs://airbnb_equifax/reviews.csv


In [20]:
gcs_path = "gs://airbnb_equifax/calendar.csv"

In [None]:
# Elegimos leer desde GCS el archivo csv que contiene los datos de calendar

In [21]:
calendar = spark.read.csv(gcs_path, header=True, inferSchema=True)

                                                                                

In [22]:
calendar.show()

+----------+-------------------+---------+-------+--------------+--------------+--------------+
|listing_id|               date|available|  price|adjusted_price|minimum_nights|maximum_nights|
+----------+-------------------+---------+-------+--------------+--------------+--------------+
|    361053|2023-09-07 00:00:00|        f|$125.00|       $125.00|             2|           365|
|    361053|2023-09-08 00:00:00|        f|$125.00|       $125.00|             2|           365|
|    361053|2023-09-09 00:00:00|        f|$125.00|       $125.00|             2|           365|
|    361053|2023-09-10 00:00:00|        f|$101.00|       $101.00|             2|           365|
|    361053|2023-09-11 00:00:00|        f|$101.00|       $101.00|             2|           365|
|    361053|2023-09-12 00:00:00|        f|$101.00|       $101.00|             2|           365|
|    361053|2023-09-13 00:00:00|        f|$101.00|       $101.00|             2|           365|
|    361053|2023-09-14 00:00:00|        

                                                                                

In [25]:
# Crear una instancia del cliente de BigQuery
bq_client = bigquery.Client()

In [23]:
# Especificar la consulta de BigQuery
query = "SELECT * FROM `bigquery-learning-405922.airbnb.neighbourhoods`"

In [26]:
# Ejecutar la consulta en BigQuery
query_job = bq_client.query(query)
# Obtener los resultados de la consulta como un DataFrame de pandas
neighbourhoods = query_job.to_dataframe()

In [30]:
new_header = neighbourhoods.iloc[0] 
neighbourhoods = neighbourhoods[1:] 
neighbourhoods.columns = new_header 

In [31]:
neighbourhoods.head()

Unnamed: 0,neighbourhood_group,neighbourhood
1,Arganzuela,Acacias
2,Arganzuela,Atocha
3,Arganzuela,Chopera
4,Arganzuela,Delicias
5,Arganzuela,Imperial


In [None]:
del neighbourhoods

In [33]:
# Especificar el nombre completo de la tabla en formato proyecto.dataset.tabla
table_id = 'bigquery-learning-405922.airbnb.listings'

# Leer la tabla directamente en un DataFrame de Spark
listings = spark.read.format("bigquery").option("table", table_id).load()

In [42]:
listings.limit(3).toPandas()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
0,921708393016270937,Rental unit in Madrid · 2 bedrooms · 2 beds · ...,18164736,Juan Manuel,Centro,Sol,40.419651,-3.701909,Entire home/apt,200,4,0,,,1,239,0,
1,16993980,Rental unit in Madrid · ★4.81 · 4 bedrooms · 7...,113826816,David Y Luis Miguel,Centro,Sol,40.41855,-3.69979,Entire home/apt,551,1,279,2023-08-31,3.48,1,299,53,VT-5014
2,31290141,Rental unit in Madrid · ★4.68 · 7 bedrooms · 1...,8124160,Flavia,Centro,Sol,40.41996,-3.70387,Entire home/apt,336,1,68,2023-08-15,1.24,10,262,24,


In [38]:
# Especificar el nombre completo de la tabla en formato proyecto.dataset.tabla
table_id = 'bigquery-learning-405922.airbnb.neighbourhoods'

# Leer la tabla directamente en un DataFrame de Spark
neighbourhoods = spark.read.format("bigquery").option("table", table_id).load()

In [39]:
# Especificar el nombre completo de la tabla en formato proyecto.dataset.tabla
table_id = 'bigquery-learning-405922.airbnb.calendar'

# Leer la tabla directamente en un DataFrame de Spark
calendar = spark.read.format("bigquery").option("table", table_id).load()

In [85]:
# Especificar el nombre completo de la tabla en formato proyecto.dataset.tabla
table_id = 'bigquery-learning-405922.airbnb.reviews'

# Leer la tabla directamente en un DataFrame de Spark
reviews = spark.read.format("bigquery").option("table", table_id).load()

In [48]:
all_configs = spark.sparkContext.getConf().getAll()
for config in all_configs:
    print(config)

('spark.eventLog.enabled', 'true')
('spark.dynamicAllocation.minExecutors', '1')
('spark.dataproc.sql.joinConditionReorder.enabled', 'true')
('spark.eventLog.dir', 'gs://dataproc-temp-europe-west4-662485454498-zcn4vjdp/037393d0-4992-4d9e-91cf-99e3863493a9/spark-job-history')
('spark.yarn.historyServer.address', 'cluster-f0de-m:18080')
('spark.dataproc.sql.local.rank.pushdown.enabled', 'true')
('spark.history.fs.logDirectory', 'gs://dataproc-temp-europe-west4-662485454498-zcn4vjdp/037393d0-4992-4d9e-91cf-99e3863493a9/spark-job-history')
('spark.yarn.unmanagedAM.enabled', 'true')
('spark.ui.filters', 'org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter')
('spark.sql.optimizer.runtime.bloomFilter.join.pattern.enabled', 'true')
('spark.metrics.namespace', 'app_name:${spark.app.name}.app_id:${spark.app.id}')
('spark.dataproc.sql.optimizer.join.fusion.enabled', 'true')
('spark.ui.proxyBase', '/proxy/application_1707781858408_0001')
('spark.driver.maxResultSize', '1024m')
('spark.datap

In [49]:
spark.conf.get("spark.executor.memory")

'2893m'

In [50]:
spark.conf.get("spark.executor.cores")

'1'

In [61]:
calendar.explain(True)

== Parsed Logical Plan ==
Relation [listing_id#313L,date#314,available#315,price#316L,adjusted_price#317L,minimum_nights#318L,maximum_nights#319L] com.google.cloud.spark.bigquery.direct.DirectBigQueryRelation@5ff49897

== Analyzed Logical Plan ==
listing_id: bigint, date: date, available: boolean, price: bigint, adjusted_price: bigint, minimum_nights: bigint, maximum_nights: bigint
Relation [listing_id#313L,date#314,available#315,price#316L,adjusted_price#317L,minimum_nights#318L,maximum_nights#319L] com.google.cloud.spark.bigquery.direct.DirectBigQueryRelation@5ff49897

== Optimized Logical Plan ==
Relation [listing_id#313L,date#314,available#315,price#316L,adjusted_price#317L,minimum_nights#318L,maximum_nights#319L] com.google.cloud.spark.bigquery.direct.DirectBigQueryRelation@5ff49897

== Physical Plan ==
*(1) Scan com.google.cloud.spark.bigquery.direct.DirectBigQueryRelation@5ff49897 [listing_id#313L,date#314,available#315,price#316L,adjusted_price#317L,minimum_nights#318L,maximum_

In [76]:
# Listar los nombres de las variables en el entorno global
global_variables = globals()

# Filtrar las variables que son instancias de DataFrame de Spark
dataframes_spark = [(var_name, var_value) for var_name, var_value in global_variables.items() if isinstance(var_value, DataFrame)]

# [k for (k, v) in globals().items() if isinstance(v, DataFrame)]
# Imprimir los nombres de los DataFrames de Spark
for df_name, df_value in dataframes_spark:
    print(f"Nombre del DataFrame: {df_name}")

Nombre del DataFrame: df_spark
Nombre del DataFrame: neighbourhoods
Nombre del DataFrame: listings
Nombre del DataFrame: calendar
Nombre del DataFrame: df_value


In [86]:
[k for (k, v) in globals().items() if isinstance(v, DataFrame)]

['neighbourhoods', 'listings', 'calendar', '_80', 'reviews']

Ejemplo : Encontrar el promedio y la desviación estándar del precio por barrio.

In [98]:
avg_price_by_neighbourhood = listings.groupBy('neighbourhood') \
                                .agg(F.round(F.avg('price'),2).alias('avg_price'), \
                                     F.stddev('price').alias('stddev_price'))

In [100]:
avg_price_by_neighbourhood.limit(5).toPandas()

Unnamed: 0,neighbourhood,avg_price,stddev_price
0,Hellín,237.69,399.573975
1,Butarque,53.09,43.439521
2,Palos de Moguer,107.01,196.608774
3,Rejas,138.32,213.785207
4,Pacífico,89.36,85.406619


In [None]:
Ejemplo : Encontrar la proporción de días ocupados para cada mes.

In [116]:
# Agregar una columna 'month' al DataFrame con la información del calendario
availability_by_month = calendar.withColumn('month', F.month('date'))

# Calcular la ocupación mensual
monthly_occupancy = (
    availability_by_month
    .groupBy('month')
    .agg(
        F.count(F.when(availability_by_month.available == 'f', 1)).alias('occupied_days'),
        F.count('listing_id').alias('total_days')
    )
)

# Calcular la tasa de ocupación mensual
monthly_occupancy = monthly_occupancy.withColumn('occupancy_rate', \ 
                                      monthly_occupancy.occupied_days / monthly_occupancy.total_days)




+-----+-------------+----------+-------------------+
|month|occupied_days|total_days|     occupancy_rate|
+-----+-------------+----------+-------------------+
|   12|       386370|    769637| 0.5020158854109145|
|    1|       370274|    769637| 0.4811021299651654|
|    6|       429154|    744810| 0.5761925860286516|
|    3|       408782|    769637| 0.5311361070218817|
|    5|       408534|    769637| 0.5308138771914552|
|    9|       519826|    719993| 0.7219875748792002|
|    4|       405461|    744810| 0.5443817886440837|
|    8|       446681|    769637| 0.5803788019546877|
|    7|       442155|    769637| 0.5744981075494031|
|   10|       486173|    769637| 0.6316913038224513|
|   11|       349928|    744810|0.46982183375625997|
|    2|       330312|    719983|0.45877749891316877|
+-----+-------------+----------+-------------------+



                                                                                

In [117]:
monthly_occupancy.limit(5).toPandas()

                                                                                

Unnamed: 0,month,occupied_days,total_days,occupancy_rate
0,12,386370,769637,0.502016
1,1,370274,769637,0.481102
2,6,429154,744810,0.576193
3,3,408782,769637,0.531136
4,5,408534,769637,0.530814


Queremos encontrar las ciudades (neighbourhood_group) con un promedio de precio superior a 100$ por noche,
pero solo queremos incluir las ciudades que tienen al menos 100 listados.

In [118]:
result_df = (
    listings
    .groupBy('neighbourhood_group')
    .agg(
        F.avg('price').alias('avg_price'),
        F.count('*').alias('num_listings')
    )
    .filter((F.avg('price') > 100) & (F.count('*') >= 100))
    .orderBy('avg_price', ascending=False)
)

In [122]:
result_df = result_df.persist(StorageLevel.MEMORY_AND_DISK)

24/02/13 02:10:59 WARN CacheManager: Asked to cache already cached data.


In [123]:
result_df.limit(5).toPandas()

Unnamed: 0,neighbourhood_group,avg_price,num_listings
0,San Blas - Canillejas,253.896127,568
1,Salamanca,163.553864,1708
2,Moncloa - Aravaca,141.950292,684
3,Hortaleza,139.68431,529
4,Centro,136.024324,10607


Encuentra el número total de reseñas para cada listado en Madrid. Muestra el nombre del listado y la cantidad de reseñas.

In [92]:
ejer_1 = (
    listings.alias("l")
    .join(reviews.alias("r"), col("l.id") == col("r.listing_id"))
    .groupBy("l.name")
    .agg(F.count("r.listing_id").alias("cantidad_resenas"))
    .orderBy(col("cantidad_resenas").desc())
)

In [91]:
result.limit(3).toPandas()

                                                                                

Unnamed: 0,name,cantidad_resenas
0,Rental unit in Madrid · ★4.68 · 1 bedroom · 1 ...,4895
1,Rental unit in Madrid · ★4.75 · 1 bedroom · 2 ...,4783
2,Rental unit in Madrid · ★4.88 · 1 bedroom · 2 ...,4218


In [128]:
precios_enero = (
    calendar
    .filter(F.month('DATE') == 1)  # Extracción del mes desde la columna 'DATE'
    .groupBy('listing_id')
    .agg(F.avg('price').alias('precio_promedio_enero'))
)

In [130]:
def calcular_precio_promedio_por_mes(spark, dataframe, mes):
    """
    Calcula el precio promedio para un mes específico.

    Parameters:
    - spark: Objeto SparkSession.
    - dataframe: DataFrame de PySpark.
    - mes: Número del mes (1 para enero, 2 para febrero, etc.).

    Returns:
    - DataFrame con listing_id y precio_promedio.
    """
    precios_por_mes = (
        dataframe
        .filter(F.month('DATE') == mes)  # Extracción del mes desde la columna 'DATE'
        .groupBy('listing_id')
        .agg(F.avg('price').alias(f'precio_promedio_mes_{mes}'))
    )
    return precios_por_mes

In [131]:
calcular_precio_promedio_por_mes(spark,calendar, 6).limit(5).toPandas()

                                                                                

Unnamed: 0,listing_id,precio_promedio_mes_6
0,949719529014418407,406.400000
1,23917540,217.866667
2,562980972202430078,15.000000
3,48648725,16.600000
4,49186036,16.000000
...,...,...
24822,589573775731396040,83.600000
24823,888965800879613289,25.000000
24824,34889952,106.000000
24825,47883460,550.000000


In [127]:
calcular_precio_promedio_por_anio_mes(spark, calendar, 2019, 2).show()



+----------+---------------+
|listing_id|precio_promedio|
+----------+---------------+
+----------+---------------+

