## Overview

This notebook shows you how to create and query a table or DataFrame from a file that you uploaded to DBFS. [DBFS](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) (the Databricks File System) lets you store data for querying inside Databricks. This notebook assumes that the file you would like to read from is already in DBFS.

This notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported.

In [0]:
# Where the uploaded listings file lives in DBFS, and its format
file_location = "/FileStore/tables/listings.csv"
file_type = "csv"

# CSV reader settings: infer column types, use the first row as the header, split on commas
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# These options target CSV input; for other file types they will be ignored.
df_listing = (
    spark.read.format(file_type)
    .options(inferSchema=infer_schema, header=first_row_is_header, sep=delimiter)
    .load(file_location)
)

display(df_listing)

id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,175,30,48,2019-11-04,0.36,2.0,365.0
3831,"Whole flr w/private bdrm, bath & kitchen(pls read)",4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,74,1,375,2020-11-22,4.97,1.0,307.0
5121,BlissArtsSpace!,7356,Garon,Brooklyn,Bedford-Stuyvesant,40.68688,-73.95596,Private room,60,30,50,2019-12-02,0.36,1.0,365.0
5178,Large Furnished Room Near B'way,8967,Shunichi,Manhattan,Midtown,40.76468,-73.98315,Private room,65,2,474,2020-09-25,3.36,1.0,0.0
5203,Cozy Clean Guest Room - Family Apt,7490,MaryEllen,Manhattan,Upper West Side,40.80178,-73.96723,Private room,75,2,118,2017-07-21,0.86,1.0,0.0
5803,"Lovely Room 1, Garden, Best Area, Legal rental",9744,Laurie,Brooklyn,South Slope,40.66829,-73.98779,Private room,89,4,182,2020-10-17,1.28,3.0,365.0
6848,Only 2 stops to Manhattan studio,15991,Allen & Irina,Brooklyn,Williamsburg,40.70837,-73.95352,Entire home/apt,109,30,181,2020-03-16,1.29,1.0,253.0
6872,Uptown Sanctuary w/ Private Bath (Month to Month),16104,Kae,Manhattan,East Harlem,40.80139,-73.94244,Private room,65,30,0,,,2.0,365.0
6990,UES Beautiful Blue Room,16800,Cyn,Manhattan,East Harlem,40.78962,-73.94802,Private room,62,30,233,2019-12-09,1.72,1.0,365.0
7097,Perfect for Your Parents: Privacy + Garden,17571,Jane,Brooklyn,Fort Greene,40.69121,-73.97277,Entire home/apt,199,2,239,2020-12-05,1.80,2.0,344.0


In [0]:
# Where the uploaded reviews file lives in DBFS, and its format
file_location = "/FileStore/tables/reviews.csv"
file_type = "csv"

# CSV reader settings: infer column types, use the first row as the header, split on commas
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# These options target CSV input; for other file types they will be ignored.
df_reviews = (
    spark.read.format(file_type)
    .options(inferSchema=infer_schema, header=first_row_is_header, sep=delimiter)
    .load(file_location)
)

display(df_reviews)

listing_id,date
2595,2009-11-21
2595,2009-12-05
2595,2009-12-10
2595,2010-04-09
2595,2010-05-25
2595,2012-05-07
2595,2012-05-17
2595,2012-08-18
2595,2013-05-20
2595,2014-05-21


In [0]:
%sql
/* Show every column of every row in the listings table. */
/* NOTE(review): the rendered output has generic _c0.._c15 column names with the real header as the first data row, which suggests the table was created without the header option - confirm how `listings` was created. */
SELECT *
FROM listings

_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8,_c9,_c10,_c11,_c12,_c13,_c14,_c15
id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,175,30,48,2019-11-04,0.36,2,365
3831,"Whole flr w/private bdrm, bath & kitchen(pls read)",4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,74,1,375,2020-11-22,4.97,1,307
5121,BlissArtsSpace!,7356,Garon,Brooklyn,Bedford-Stuyvesant,40.68688,-73.95596,Private room,60,30,50,2019-12-02,0.36,1,365
5178,Large Furnished Room Near B'way,8967,Shunichi,Manhattan,Midtown,40.76468,-73.98315,Private room,65,2,474,2020-09-25,3.36,1,0
5203,Cozy Clean Guest Room - Family Apt,7490,MaryEllen,Manhattan,Upper West Side,40.80178,-73.96723,Private room,75,2,118,2017-07-21,0.86,1,0
5803,"Lovely Room 1, Garden, Best Area, Legal rental",9744,Laurie,Brooklyn,South Slope,40.66829,-73.98779,Private room,89,4,182,2020-10-17,1.28,3,365
6848,Only 2 stops to Manhattan studio,15991,Allen & Irina,Brooklyn,Williamsburg,40.70837,-73.95352,Entire home/apt,109,30,181,2020-03-16,1.29,1,253
6872,Uptown Sanctuary w/ Private Bath (Month to Month),16104,Kae,Manhattan,East Harlem,40.80139,-73.94244,Private room,65,30,0,,,2,365
6990,UES Beautiful Blue Room,16800,Cyn,Manhattan,East Harlem,40.78962,-73.94802,Private room,62,30,233,2019-12-09,1.72,1,365


In [0]:
%sql
/* List the distinct values found in the price column of the listings table. */
/* NOTE(review): the rendered output mixes date-like values (e.g. 2016-08-17) with dollar amounts, which suggests some rows were shifted or split when the CSV was parsed - verify the table's CSV quoting / multi-line options. */
SELECT DISTINCT price
FROM listings

price
"$1,200.00"
2016-08-17
$186.00
$213.00
2016-09-11
2019-09-29
$450.00
2020-09-12
$319.00
$510.00


In [0]:
# Average listing price for each room type.
# Group by room_type only: if price were also a grouping key, every group would hold a
# single price value and its average would trivially equal that key.
df_listing.groupBy("room_type").agg({"price": "avg"}).show()

In [0]:
%sql
/* Show every row of the neighbourhoods table. */
/* NOTE(review): the rendered output has generic _c0/_c1 column names with the real header as the first data row - confirm the table was created with the header option. */
SELECT *
FROM neighbourhoods

_c0,_c1
neighbourhood_group,neighbourhood
Bronx,Allerton
Bronx,Baychester
Bronx,Belmont
Bronx,Bronxdale
Bronx,Castle Hill
Bronx,City Island
Bronx,Claremont Village
Bronx,Clason Point
Bronx,Concourse


In [0]:
%sql
/* Show every row of the reviews table. */
/* NOTE(review): the rendered output has generic _c0/_c1 column names with the real header as the first data row - confirm the table was created with the header option. */
SELECT *
FROM reviews

_c0,_c1
listing_id,date
2595,2009-11-21
2595,2009-12-05
2595,2009-12-10
2595,2010-04-09
2595,2010-05-25
2595,2012-05-07
2595,2012-05-17
2595,2012-08-18
2595,2013-05-20


In [0]:
import pyspark.sql.functions as f
# Import the Window class itself; `import pyspark.sql.window as Window` would bind the
# module, which has no partitionBy attribute.
from pyspark.sql.window import Window

# Number of listings per neighbourhood, with the aggregate column renamed to "count".
df_neigh_count = (
    df_listing.groupBy("neighbourhood")
    .agg({"neighbourhood": "count"})
    .withColumnRenamed("count(neighbourhood)", "count")
)

# Each neighbourhood's share of all listings, in percent, rounded to 3 decimals.
# An empty partitionBy() puts every row in one window, so the sum is the grand total.
# f.round is used explicitly because Python's built-in round() cannot round a Column.
df_neigh_count = df_neigh_count.withColumn(
    "percentage",
    f.round(f.col("count") / f.sum("count").over(Window.partitionBy()) * 100, 3),
)
display(df_neigh_count)

In [0]:
# Neighbourhoods ranked from the largest percentage of listings to the smallest.
df_neigh_ranked = df_neigh_count.orderBy('percentage', ascending=False)
display(df_neigh_ranked)

In [0]:
# Print the physical plan for counting reviews per listing.
# Spark SQL and the DataFrame API have the same execution plan.
reviews_per_listing = df_reviews.groupBy("listing_id").count()
reviews_per_listing.explain()

In [0]:
# Create a temporary view so the reviews data can be queried from SQL cells

temp_table_name = "reviews_csv"

# Use df_reviews (the DataFrame loaded from reviews.csv); this notebook never defines a
# plain `df`, so referencing it would raise NameError on a fresh run.
df_reviews.createOrReplaceTempView(temp_table_name)

In [0]:
%sql

/* Read back every row of the temp view that was registered from the Python cell */

SELECT *
FROM `reviews_csv`

In [0]:
# A temp view is only available to this particular notebook. If you'd like other users to be able to
# query this data, you can instead create a table from the DataFrame.
# Once saved, the table persists across cluster restarts and can be queried by various users from different notebooks.
# To do so, choose your table name below and uncomment the last line.

permanent_table_name = "reviews_csv"

# df_reviews.write.format("parquet").saveAsTable(permanent_table_name)