In [1]:
# Install pyspark
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.2.tar.gz (281.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 281.4/281.4 MB 4.4 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 199.7/199.7 KB 14.0 MB/s eta 0:00:00
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.3.2-py2.py3-none-any.whl size=281824025 sha256=a0d3de30545a5ab2acbe868a9d6f72f21fd2dbaba09009a9f097946bcf4e8bce
  Stored in directory: /root/.cache/pip/wheels/6c/e3/9b/0525ce8a69478916513509d43693511463c6468db0de237c86
Successfully built pyspark
Installing collected packages: py4j, pyspa

In [2]:
# Part -2
#===============
# Task -1
#===============
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, min, max
from pyspark.sql.functions import avg
import requests
import pandas as pd

# Start (or reuse) the Spark session used by the functions in this notebook
spark = (
    SparkSession.builder
    .appName("AirBnB Data Analysis")
    .getOrCreate()
)


def get_data(url = 'https://github.com/databricks/LearningSparkV2/blob/master/mlflow-project-example/data/sf-airbnb-clean.parquet/part-00000-tid-4320459746949313749-5c3d407c-c844-4016-97ad-2edec446aa62-6688-1-c000.snappy.parquet?raw=true'):

    """
    Download the AirBnB parquet file and load it into a Spark DataFrame.

    The file is fetched over HTTP, stored in the working directory as
    'sf-airbnb-clean.parquet' and then read with the module-level ``spark``
    session, which must be active when this function is called.

    Args:
      url: HTTP(S) location of the parquet file (defaults to the copy hosted on GitHub)
    Returns:
      A Spark DataFrame holding the contents of the parquet file
    Raises:
      requests.HTTPError: if the server answers with an error status
      requests.RequestException: on connection problems or timeout
    """
    local_path = 'sf-airbnb-clean.parquet'

    # Download the parquet file; fail loudly on an HTTP error instead of
    # saving an error page to disk and failing later while parsing it.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    # The context manager guarantees the file is flushed and closed before Spark reads it.
    with open(local_path, 'wb') as parquet_file:
        parquet_file.write(response.content)

    # Read the parquet file directly: this keeps the column types stored in the
    # file instead of round-tripping through pandas and a CSV with an inferred schema.
    return spark.read.parquet(local_path)

# Release the session created above. get_data() refers to the global name `spark`,
# so a new session has to be bound to that name before get_data() is called.
spark.stop()


In [3]:
#=================================================================================
#Task - 2 
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, min, max
from pyspark.sql.functions import avg

# Start (or reuse) the Spark session needed for task 2
spark = (
    SparkSession.builder
    .appName("AirBnB Data Analysis")
    .getOrCreate()
)

def write_out_2_2(write_path='out/out_2_2.txt'):

  """
  Write the aggregates computed by agg_df() as CSV (with a header row).

  Spark's DataFrame writer is used, so ``write_path`` becomes a directory
  containing part files rather than a single text file.

  Args:
    write_path: output location for Part 2 task 2
  Returns:
    None
  """
  agg_df_val = agg_df()
  # The writer's default save mode raises if the target already exists, which
  # breaks every re-run of the notebook; overwrite the previous result instead.
  agg_df_val.write.csv(write_path, mode="overwrite", header=True)

def agg_df():

  """
  Summarise the listing prices of the data set returned by get_data().

  Args:
    None
  Returns:
    A single-row Spark DataFrame with the columns ``min_price``,
    ``max_price`` and ``row_count`` (total number of rows).
  """

  listings = get_data()
  # min / max / count here are the pyspark.sql.functions imported by this
  # notebook, not the Python built-ins.
  summary_columns = [
      min("price").alias("min_price"),
      max("price").alias("max_price"),
      count("*").alias("row_count"),
  ]
  return listings.agg(*summary_columns)

# Run task 2 and always release the Spark session, even when the task fails.
try:
    write_out_2_2()
finally:
    spark.stop()


In [4]:
#====================================================================
# task 3
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, min, max
from pyspark.sql.functions import avg

# Start (or reuse) the Spark session needed for task 3
spark = (
    SparkSession.builder
    .appName("AirBnB Data Analysis")
    .getOrCreate()
)

def write_out_2_3(write_path='out/out_2_3.txt'):

  """
  Write the averages computed by avg_df() as CSV (with a header row).

  Spark's DataFrame writer is used, so ``write_path`` becomes a directory
  containing part files rather than a single text file.

  Args:
    write_path: output location for Part 2 task 3
  Returns:
    None
  """

  avg_df_val = avg_df()

  # The writer's default save mode raises if the target already exists, which
  # breaks every re-run of the notebook; overwrite the previous result instead.
  avg_df_val.write.csv(write_path, mode="overwrite", header=True)

def avg_df():

  """
  Average the bathroom and bedroom counts of a filtered set of listings.

  Only rows of the get_data() result with ``price`` > 5000 and
  ``review_scores_rating`` exactly equal to 10 are taken into account.

  Args:
    None
  Returns:
    A single-row Spark DataFrame with the columns ``avg_bathrooms`` and
    ``avg_bedrooms``.
  """

  listings = get_data()

  # Build the two filter conditions separately so each one can be read on its own.
  # NOTE(review): confirm that review_scores_rating is on a scale where 10 is an
  # attainable value for this data set.
  is_expensive = listings["price"] > 5000
  has_rating_of_ten = listings["review_scores_rating"] == 10
  matching_listings = listings.filter(is_expensive & has_rating_of_ten)

  return matching_listings.agg(
      avg("bathrooms").alias("avg_bathrooms"),
      avg("bedrooms").alias("avg_bedrooms"),
  )

# Run task 3 and always release the Spark session, even when the task fails.
try:
    write_out_2_3()
finally:
    spark.stop()

In [5]:
# task_4

from pyspark.sql import SparkSession
from pyspark.sql.functions import count, min, max
from pyspark.sql.functions import avg

# Start (or reuse) the Spark session needed for task 4
spark = (
    SparkSession.builder
    .appName("AirBnB Data Analysis")
    .getOrCreate()
)

def write_out_2_4(write_path='out/out_2_4.txt'):

  """
  Write the value returned by capacity() to a plain text file.

  Args:
    write_path: output file path for Part 2 task 4
  Returns:
    None
  """
  import os

  capacity_val = capacity()

  # open() does not create missing directories, so make sure the parent
  # directory exists before writing.
  parent_dir = os.path.dirname(write_path)
  if parent_dir:
    os.makedirs(parent_dir, exist_ok=True)

  # write to a text file
  with open(write_path, "w") as f:
      f.write(str(capacity_val))

def capacity():

  """
  Return how many people the cheapest, best-rated property can accommodate.

  The get_data() result is ordered by ascending ``price`` and, for equal
  prices, by descending ``review_scores_rating``; the first row is the
  selected property.

  Args:
    None
  Returns:
    The value of the ``accommodates`` column of the selected property. If the
    data set has no such column, the previous estimate ``bedrooms * beds`` is
    returned instead.
  Raises:
    ValueError: if the data set contains no rows
  """

  listings = get_data()
  # sort the DataFrame by price and rating and select the first row
  top_property = listings.orderBy(["price", "review_scores_rating"], ascending=[True, False]).first()
  if top_property is None:
    # first() returns None for an empty DataFrame
    raise ValueError("Cannot determine capacity: the data set contains no rows")

  # The number of guests a listing takes is not bedrooms * beds; read the
  # dedicated column when the data set provides it.
  if "accommodates" in listings.columns:
    return top_property["accommodates"]

  # Fallback for data sets without the column: keep the former estimate.
  return top_property.bedrooms * top_property.beds

# Run task 4 and always release the Spark session, even when the task fails.
try:
    write_out_2_4()
finally:
    spark.stop()