### Airbnb with PySpark

##### Creating a Spark Session 

In [1]:
from pyspark.sql import SparkSession

# Obtain the session for this notebook: getOrCreate() reuses an already
# running SparkSession if there is one, otherwise it starts a new one.
spark = (
    SparkSession.builder
    .appName("Unpacking Airbnb data")
    .getOrCreate()
)

26/01/31 09:18:46 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


#### Importing the listing data 

In [2]:
# Read the gzip-compressed listings CSV into a DataFrame.
#   header / inferSchema : first row supplies column names, types are inferred
#   sep / quote / escape : comma-separated, with '"' as both quote and escape char
#   multiLine            : a single record may span several physical lines
#   mode                 : PERMISSIVE parse mode
listings = (
    spark.read
    .options(
        header=True,
        inferSchema=True,
        sep=",",
        quote='"',
        escape='"',
        multiLine=True,
        mode="PERMISSIVE",
    )
    .csv("../data/listings.csv.gz")
)

                                                                                

#### Getting the schema of the listing data

In [3]:
# Print every StructField of the listings schema, one per line.
# The loop variable is called `field` (not `list`) so the builtin `list`
# is not shadowed for the rest of the kernel session.
for field in listings.schema:
    print(field)

StructField('id', LongType(), True)
StructField('listing_url', StringType(), True)
StructField('scrape_id', LongType(), True)
StructField('last_scraped', DateType(), True)
StructField('source', StringType(), True)
StructField('name', StringType(), True)
StructField('description', StringType(), True)
StructField('neighborhood_overview', StringType(), True)
StructField('picture_url', StringType(), True)
StructField('host_id', IntegerType(), True)
StructField('host_url', StringType(), True)
StructField('host_name', StringType(), True)
StructField('host_since', DateType(), True)
StructField('host_location', StringType(), True)
StructField('host_about', StringType(), True)
StructField('host_response_time', StringType(), True)
StructField('host_response_rate', StringType(), True)
StructField('host_acceptance_rate', StringType(), True)
StructField('host_is_superhost', StringType(), True)
StructField('host_thumbnail_url', StringType(), True)
StructField('host_picture_url', StringType(), True)


In [4]:
# Project only the cleansed neighbourhood column and preview the first 20
# rows without truncating long values.
neighbourhoods = listings.select("neighbourhood_cleansed")
neighbourhoods.show(n=20, truncate=False)

+----------------------+
|neighbourhood_cleansed|
+----------------------+
|Islington             |
|Kensington and Chelsea|
|Westminster           |
|Wandsworth            |
|Tower Hamlets         |
|Richmond upon Thames  |
|Haringey              |
|Hammersmith and Fulham|
|Hammersmith and Fulham|
|Southwark             |
|Westminster           |
|Barnet                |
|Hounslow              |
|Southwark             |
|Waltham Forest        |
|Barnet                |
|Hammersmith and Fulham|
|Hammersmith and Fulham|
|Brent                 |
|Camden                |
+----------------------+
only showing top 20 rows


In [5]:
# Preview the overall review rating column.
# NOTE(review): despite its name, `review_locations` holds
# `review_scores_rating`, not a location score — consider renaming once
# any other uses of this variable are confirmed.
review_locations = listings.select("review_scores_rating")
review_locations.show()

+--------------------+
|review_scores_rating|
+--------------------+
|                4.85|
|                 4.8|
|                4.77|
|                 4.9|
|                4.53|
|                 4.8|
|                4.87|
|                4.77|
|                4.83|
|                4.97|
|                 4.5|
|                 4.5|
|                 4.5|
|                 4.7|
|                 4.5|
|                4.89|
|                NULL|
|                4.74|
|                4.88|
|                 5.0|
+--------------------+
only showing top 20 rows


In [6]:
# Preview the location review score column (default: first 20 rows).
(
    listings
    .select("review_scores_location")
    .show()
)

+----------------------+
|review_scores_location|
+----------------------+
|                  4.78|
|                  4.93|
|                  4.89|
|                   4.6|
|                  4.85|
|                   4.9|
|                  4.77|
|                  4.53|
|                  4.79|
|                  4.79|
|                   4.5|
|                  4.64|
|                  4.84|
|                  4.86|
|                   4.0|
|                  4.75|
|                  NULL|
|                  4.66|
|                  4.67|
|                   5.0|
+----------------------+
only showing top 20 rows


##### SHOWING THE LOCATIONS WITH THE BEST RATINGS

In [7]:
# Listings whose location score is strictly above 4.5, reduced to the
# identifying columns plus both review scores.
high_score_listings = (
    listings
    .where(listings["review_scores_location"] > 4.5)
    .select(
        "id",
        "price",
        "name",
        "review_scores_location",
        "review_scores_rating",
    )
)

high_score_listings.show(n=20, truncate=False)

+-----+-------+-------------------------------------------------+----------------------+--------------------+
|id   |price  |name                                             |review_scores_location|review_scores_rating|
+-----+-------+-------------------------------------------------+----------------------+--------------------+
|13913|$70.00 |Holiday London DB Room Let-on going              |4.78                  |4.85                |
|15400|$149.00|Bright Chelsea  Apartment. Chelsea!              |4.93                  |4.8                 |
|17402|$411.00|Very Central Modern 3-Bed/2 Bath By Oxford St W1 |4.89                  |4.77                |
|24328|NULL   |Battersea live/work artist house                 |4.6                   |4.9                 |
|36274|$210.00|Bright 1 bedroom apt off brick lane in Shoreditch|4.85                  |4.53                |
|36299|$280.00|Kew Gardens 3BR house in cul-de-sac              |4.9                   |4.8                 |
|36660|$90

##### REMOVING NULL VALUES FROM THE HIGHSCORE LISTING INSTANCE

In [8]:
# Display the high-score listings after discarding every row that has a
# null in any of the selected columns; the DataFrame itself is not modified.
(
    high_score_listings
    .na.drop()
    .show(n=20, truncate=False)
)

+-----+-------+--------------------------------------------------+----------------------+--------------------+
|id   |price  |name                                              |review_scores_location|review_scores_rating|
+-----+-------+--------------------------------------------------+----------------------+--------------------+
|13913|$70.00 |Holiday London DB Room Let-on going               |4.78                  |4.85                |
|15400|$149.00|Bright Chelsea  Apartment. Chelsea!               |4.93                  |4.8                 |
|17402|$411.00|Very Central Modern 3-Bed/2 Bath By Oxford St W1  |4.89                  |4.77                |
|36274|$210.00|Bright 1 bedroom apt off brick lane in Shoreditch |4.85                  |4.53                |
|36299|$280.00|Kew Gardens 3BR house in cul-de-sac               |4.9                   |4.8                 |
|36660|$90.00 |You are GUARANTEED to love this                   |4.77                  |4.87                |
|

##### CONVERTING THE PRICE STRINGS TO FLOAT VALUES USING REGEX

In [9]:
from pyspark.sql.functions import regexp_replace
# Add a numeric `price_num` column: strip the '$' and ',' characters from
# the `price` string, then cast what remains to float.
price_num_df = listings.withColumn(
    "price_num",
    regexp_replace("price", "[$,]", "").cast("float"),
)
# Last expression of the cell: shows the StructField of the new column.
price_num_df.schema["price_num"]

StructField('price_num', FloatType(), True)

In [10]:
# Preview the numeric price next to the listing name and overall rating.
(
    price_num_df
    .select("price_num", "name", "review_scores_rating")
    .show(n=20, truncate=False)
)

+---------+-------------------------------------------------+--------------------+
|price_num|name                                             |review_scores_rating|
+---------+-------------------------------------------------+--------------------+
|70.0     |Holiday London DB Room Let-on going              |4.85                |
|149.0    |Bright Chelsea  Apartment. Chelsea!              |4.8                 |
|411.0    |Very Central Modern 3-Bed/2 Bath By Oxford St W1 |4.77                |
|NULL     |Battersea live/work artist house                 |4.9                 |
|210.0    |Bright 1 bedroom apt off brick lane in Shoreditch|4.53                |
|280.0    |Kew Gardens 3BR house in cul-de-sac              |4.8                 |
|90.0     |You are GUARANTEED to love this                  |4.87                |
|61.0     |SUNNY ROOM PRIVATE BATHROOM PLUS BREAKFAST       |4.77                |
|340.0    |Short Term Home                                  |4.83                |
|49.

#### SEARCHING FOR GOOD AND CHEAP LOCATIONS IN THE LISTING DATA

In [11]:
from pyspark.sql import functions as F

# "Rare gems": listings priced below 80 whose overall rating is above 4.
# The two conditions are applied as successive filters, which keeps only
# rows satisfying both.
rare_gems = (
    price_num_df
    .where(F.col("price_num") < 80)
    .where(F.col("review_scores_rating") > 4)
    .select(
        "id",
        F.col("price_num").alias("PRICE"),
        "name",
        "review_scores_rating",
    )
)

rare_gems.show(n=20, truncate=False)

+------+-----+-------------------------------------------------+--------------------+
|id    |PRICE|name                                             |review_scores_rating|
+------+-----+-------------------------------------------------+--------------------+
|13913 |70.0 |Holiday London DB Room Let-on going              |4.85                |
|38605 |61.0 |SUNNY ROOM PRIVATE BATHROOM PLUS BREAKFAST       |4.77                |
|38995 |49.0 |SPACIOUS ROOM IN CONTEMPORARY STYLE FLAT         |4.97                |
|42010 |71.0 |You Will Save Money Here                         |4.89                |
|43129 |48.0 |Quiet Comfortable Room in Fulham                 |4.74                |
|45163 |76.0 |Room with a garden                               |5.0                 |
|47192 |50.0 |Pleasant Single Room in zone 1.                  |4.81                |
|47687 |78.0 |Cosy Double studio in Zone 2 Hammersmith (6)     |4.73                |
|49970 |79.0 |Beautiful Small Studio Hammersmith      

In [12]:
# List every distinct (property_type, room_type) combination in the data.
(
    listings
    .select("property_type", "room_type")
    .distinct()
    .show(truncate=False)
)

                                                                                

##### GETTING HOUSES THAT HAVE NO PICTURE URL AND THEIR PERFORMANCE IN TERMS OF RATING

In [13]:
# Smallest and largest overall rating across all listings, as a single row.
(
    listings
    .agg(
        F.min("review_scores_rating").alias("min"),
        F.max("review_scores_rating").alias("max"),
    )
    .show()
)


                                                                                

In [19]:
# Listings with no picture URL: the column is either null or contains only
# whitespace. The URL is shown under the alias "URL" alongside the rating.
(
    listings
    .where(
        F.col("picture_url").isNull()
        | (F.trim(F.col("picture_url")) == "")
    )
    .select(
        "id",
        "name",
        "room_type",
        F.col("picture_url").alias("URL"),
        "review_scores_rating",
    )
    .show(truncate=False)
)



                                                                                

##### NUMBER OF PROPERTIES THAT GET MORE THAN 10 REVIEWS PER MONTH

In [24]:
# Distinct listings that receive more than 10 reviews per month.
(
    listings
    .where(F.col("reviews_per_month") > 10)
    .select("id", "name", "room_type", "reviews_per_month")
    .distinct()
    .show(truncate=False)
)

                                                                                

##### Properties that have more bathrooms than bedrooms and their review ratings

In [27]:
# Listings whose bathroom count exceeds their bedroom count, with ratings.
(
    listings
    .where(F.col("bathrooms") > F.col("bedrooms"))
    .select(
        "id",
        "name",
        "room_type",
        "bathrooms",
        "bedrooms",
        "review_scores_rating",
    )
    .show(truncate=False)
)


+------+-------------------------------------------------+---------------+---------+--------+--------------------+
|id    |name                                             |room_type      |bathrooms|bedrooms|review_scores_rating|
+------+-------------------------------------------------+---------------+---------+--------+--------------------+
|56229 |Cosy Double studio in Zone 2 Hammersmith (1)     |Entire home/apt|1.5      |1       |4.71                |
|81951 |LONDON DETACHED HOUSE*ElecGates etc              |Entire home/apt|2.0      |1       |NULL                |
|84223 |Designer room Park Views 4 mins zone 1 station   |Private room   |1.5      |1       |4.79                |
|85191 |Maisonette in Central London Zone 1              |Private room   |1.5      |1       |4.86                |
|157714|West London,loft ensuite, 5min2tube              |Private room   |1.5      |1       |4.7                 |
|227502|Shoreditch Loft                                  |Entire home/apt|1.5   

##### PROPERTIES WITH PRICES GREATER THAN 5,000

In [31]:
# First 10 listings whose numeric price is above 5000; the original
# formatted `price` string is displayed rather than `price_num`.
(
    price_num_df
    .where(F.col("price_num") > 5000)
    .select("id", "price", "name", "review_scores_rating")
    .show(n=10, truncate=False)
)

+--------+----------+--------------------------------------------------+--------------------+
|id      |price     |name                                              |review_scores_rating|
+--------+----------+--------------------------------------------------+--------------------+
|9470827 |$8,000.00 |Room in a cosy flat. Central, clean               |4.75                |
|10475894|$6,309.00 |Spacious Private Ground Floor Room                |NULL                |
|13254774|$53,588.00|No Longer Available                               |4.85                |
|13841484|$74,100.00|Bright & airy DoubleBed with EnSuite in Zone 2!   |4.89                |
|36304412|$7,377.00 |The Apartments by The Sloane Club, Two Bedroom Apt|NULL                |
|36304540|$7,377.00 |The Apartments by The Sloane Club, L 2 Bedroom Apt|NULL                |
|40881056|$6,523.00 |Single room. 7ft x 9ft - Over looking garden      |NULL                |
|41559554|$6,666.00 |Close To London Eye (TUR)              

In [37]:
# Listings under 150 in price, rated above 4.5, and backed by more than
# 20 reviews. Each condition is applied as its own filter step; only rows
# meeting all three remain.
(
    price_num_df
    .where(F.col("price_num") < 150)
    .where(F.col("review_scores_rating") > 4.5)
    .where(F.col("number_of_reviews") > 20)
    .select(
        "id",
        "name",
        "price",
        "review_scores_rating",
        "number_of_reviews",
    )
    .show(truncate=False)
)


+-----+-------------------------------------------------+-------+--------------------+-----------------+
|id   |name                                             |price  |review_scores_rating|number_of_reviews|
+-----+-------------------------------------------------+-------+--------------------+-----------------+
|13913|Holiday London DB Room Let-on going              |$70.00 |4.85                |55               |
|15400|Bright Chelsea  Apartment. Chelsea!              |$149.00|4.8                 |97               |
|36660|You are GUARANTEED to love this                  |$90.00 |4.87                |730              |
|38605|SUNNY ROOM PRIVATE BATHROOM PLUS BREAKFAST       |$61.00 |4.77                |387              |
|38995|SPACIOUS ROOM IN CONTEMPORARY STYLE FLAT         |$49.00 |4.97                |72               |
|41712|Room with a view, shared flat,  central  Bankside|$96.00 |4.7                 |137              |
|42010|You Will Save Money Here                        

In [40]:
# Number of listings per property type.
(
    listings
    .groupBy("property_type")
    .count()
    .show(truncate=False)
)

                                                                                