# Aggregating DataFrames in PySpark HW

First let's start up our PySpark instance

In [1]:
import findspark

findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql import types as T
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[*]").appName("AggHW").getOrCreate()

spark

## Read in the dataFrame for this Notebook

In [3]:
airbnb = spark.read.csv('Datasets/nyc_air_bnb.csv', inferSchema=True, header=True)

## About this dataset

This dataset describes the listing activity and metrics for Air BNB bookers in NYC, NY for 2019. Each line in the dataset is a booking. 

**Source:** https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data/data

Let's go ahead and view the first few records of the dataset so we know what we are working with.

In [4]:
airbnb.limit(5).toPandas()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


Now print the schema so we can make sure all the variables have the correct types

In [5]:
airbnb.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- host_id: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- neighbourhood_group: string (nullable = true)
 |-- neighbourhood: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- room_type: string (nullable = true)
 |-- price: string (nullable = true)
 |-- minimum_nights: string (nullable = true)
 |-- number_of_reviews: string (nullable = true)
 |-- last_review: string (nullable = true)
 |-- reviews_per_month: string (nullable = true)
 |-- calculated_host_listings_count: string (nullable = true)
 |-- availability_365: integer (nullable = true)



Notice here that some of the columns that are obviously numeric have been incorrectly identified as "strings". Let's edit that. Otherwise we cannot aggregate any of the numeric columns.

In [7]:
df = airbnb.withColumn(
    "price",
    F.col("price").cast(T.FloatType())
).withColumn(
    "minimum_nights",
    F.col("minimum_nights").cast(T.IntegerType())
).withColumn(
    "number_of_reviews",
    F.col("number_of_reviews").cast(T.IntegerType())
).withColumn(
    "reviews_per_month",
    F.col("reviews_per_month").cast(T.FloatType())
).withColumn(
    "calculated_host_listings_count",
    F.col("calculated_host_listings_count").cast(T.IntegerType())
)

df.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- host_id: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- neighbourhood_group: string (nullable = true)
 |-- neighbourhood: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- room_type: string (nullable = true)
 |-- price: float (nullable = true)
 |-- minimum_nights: integer (nullable = true)
 |-- number_of_reviews: integer (nullable = true)
 |-- last_review: string (nullable = true)
 |-- reviews_per_month: float (nullable = true)
 |-- calculated_host_listings_count: integer (nullable = true)
 |-- availability_365: integer (nullable = true)



### Alright now we are ready to dig in!


### 1. How many rows are in this dataset?

In [9]:
df.count()

49079

### 2. How many total reviews does each host have?

In [38]:
(
    df.
    groupBy(
        "host_id",
        "host_name"
    )
    .agg(
        F.sum("number_of_reviews").alias("number_of_reviews")
    )
    .orderBy(
        F.col("number_of_reviews").desc()
    )
    .toPandas()
)

Unnamed: 0,host_id,host_name,number_of_reviews
0,37312959,Maya,2273.0
1,344035,Brooklyn& Breakfast -Len-,2205.0
2,26432133,Danielle,2017.0
3,35524316,Yasu & Akiko,1971.0
4,40176101,Brady,1818.0
...,...,...,...
37485,Bozo,Manhattan,
37486,Joselyn,Queens,
37487,Anting,Brooklyn,
37488,Cederna,Queens,


### 3. Show the min and max of all the numeric variables in the dataset

In [16]:
(
    df
    .select(
        "price",
        "minimum_nights",
        "number_of_reviews",
        "reviews_per_month",
        "calculated_host_listings_count",
        "availability_365"
    )
    .summary(
        "min",
        "max"
    )
    .toPandas()
)

Unnamed: 0,summary,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,min,-74.00828,0,0,0.0,0,0
1,max,10000.0,1250,629,58.5,365,365


### 4. Which host had the highest number of reviews?

Only display the top result.

Bonus: format the column names

In [40]:
(
    df
    .groupBy(
        "host_id",
        "host_name"
    )
    .agg(
        F.sum("number_of_reviews").alias("number_of_reviews")
    )
    .orderBy(
        F.col("number_of_reviews").desc()
    )
    .limit(1)
    .toPandas()
)

Unnamed: 0,host_id,host_name,number_of_reviews
0,37312959,Maya,2273


### 5. On average, how many nights did most hosts specify for a minimum?

In [41]:
(
    df
    .agg(
        F.avg("minimum_nights").alias("Avg. Min. Nights")
    )
    .toPandas()
)

Unnamed: 0,Avg. Min. Nights
0,7.128613


### 6. What is the most expensive neighborhood to stay in on average?

Note: only show the one result

In [43]:
(
    df
    .select(
        "neighbourhood",
        "price"
    )
    .groupBy(
        "neighbourhood"
    )
    .agg(
        F.avg("price").alias("avg_price")
    )
    .orderBy(
        F.col("avg_price").desc()
    )
    .limit(1)
    .toPandas()
)

Unnamed: 0,neighbourhood,avg_price
0,Fort Wadsworth,800.0


### 7. Display a two by two table that shows the average prices by room type (private and shared only) and neighborhood group (Manhattan and Brooklyn only)

In [34]:
df.toPandas()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.647491,-73.972366,Private room,149.0,1.0,9.0,2018-10-19,0.21,6.0,365.0
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.753620,-73.983772,Entire home/apt,225.0,1.0,45.0,2019-05-21,0.38,2.0,355.0
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.809021,-73.941902,Private room,150.0,3.0,0.0,,,1.0,365.0
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.685139,-73.959763,Entire home/apt,89.0,1.0,270.0,2019-07-05,4.64,1.0,194.0
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.798512,-73.943993,Entire home/apt,80.0,10.0,9.0,2018-11-19,0.10,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49074,36484665,Charming one bedroom - newly renovated rowhouse,8232441,Sabrina,Brooklyn,Bedford-Stuyvesant,40.678532,-73.949951,Private room,70.0,2.0,0.0,,,2.0,9.0
49075,36485057,Affordable room in Bushwick/East Williamsburg,6570630,Marisol,Brooklyn,Bushwick,40.701839,-73.933167,Private room,40.0,4.0,0.0,,,2.0,36.0
49076,36485431,Sunny Studio at Historical Neighborhood,23492952,Ilgar & Aysel,Manhattan,Harlem,40.814751,-73.948669,Entire home/apt,115.0,10.0,0.0,,,1.0,27.0
49077,36485609,43rd St. Time Square-cozy single bed,30985759,Taz,Manhattan,Hell's Kitchen,40.757511,-73.991119,Shared room,55.0,1.0,0.0,,,6.0,2.0


### Alright that's all folks!

### Great job!

In [44]:
(
    df
    .groupBy(
        "neighbourhood_group"
    )
    .pivot(
        pivot_col="room_type",
        values=["Private room", "Shared room"]
    )
    .agg(
        F.avg("price")
    )
    .where(
        F.col("neighbourhood_group").isin(
            "Manhattan", "Brooklyn"
        )
    )
    .toPandas()

)

Unnamed: 0,neighbourhood_group,Private room,Shared room
0,Brooklyn,76.47234,50.527845
1,Manhattan,116.054003,89.069038
