
## Overview

This notebook will show you how to create and query a table or DataFrame that you uploaded to DBFS. [DBFS](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) is a Databricks File System that allows you to store data for querying inside of Databricks. This notebook assumes that you have a file already inside of DBFS that you would like to read from.

This notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported.

In [0]:
# File location and type
file_location = "/FileStore/tables/laptop_price.csv"
file_type = "csv"

# CSV options
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

display(df)

laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0
4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6
6,Acer,Aspire 3,Notebook,15.6,1366x768,AMD A9-Series 9420 3GHz,4GB,500GB HDD,AMD Radeon R5,Windows 10,2.1kg,400.0
7,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.2GHz,16GB,256GB Flash Storage,Intel Iris Pro Graphics,Mac OS X,2.04kg,2139.97
8,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,256GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,1158.7
9,Asus,ZenBook UX430UN,Ultrabook,14.0,Full HD 1920x1080,Intel Core i7 8550U 1.8GHz,16GB,512GB SSD,Nvidia GeForce MX150,Windows 10,1.3kg,1495.0
10,Acer,Swift 3,Ultrabook,14.0,IPS Panel Full HD 1920x1080,Intel Core i5 8250U 1.6GHz,8GB,256GB SSD,Intel UHD Graphics 620,Windows 10,1.6kg,770.0


In [0]:
# Create a view or table

temp_table_name = "laptop_price_csv"

df.createOrReplaceTempView(temp_table_name)

In [0]:
%sql

/* Query the created temp table in a SQL cell */

select * from `laptop_price_csv`

laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0
4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6
6,Acer,Aspire 3,Notebook,15.6,1366x768,AMD A9-Series 9420 3GHz,4GB,500GB HDD,AMD Radeon R5,Windows 10,2.1kg,400.0
7,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.2GHz,16GB,256GB Flash Storage,Intel Iris Pro Graphics,Mac OS X,2.04kg,2139.97
8,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,256GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,1158.7
9,Asus,ZenBook UX430UN,Ultrabook,14.0,Full HD 1920x1080,Intel Core i7 8550U 1.8GHz,16GB,512GB SSD,Nvidia GeForce MX150,Windows 10,1.3kg,1495.0
10,Acer,Swift 3,Ultrabook,14.0,IPS Panel Full HD 1920x1080,Intel Core i5 8250U 1.6GHz,8GB,256GB SSD,Intel UHD Graphics 620,Windows 10,1.6kg,770.0


In [0]:
# With this registered as a temp view, it will only be available to this particular notebook. If you'd like other users to be able to query this table, you can also create a table from the DataFrame.
# Once saved, this table will persist across cluster restarts as well as allow various users across different notebooks to query this data.
# To do so, choose your table name and uncomment the bottom line.

permanent_table_name = "laptop_price_csv"

# df.write.format("parquet").saveAsTable(permanent_table_name)

In [0]:
import pyspark,math
from pyspark.sql.functions import *
from pyspark.sql import functions as F

In [0]:
df.printSchema()

root
 |-- laptop_ID: integer (nullable = true)
 |-- Company: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- TypeName: string (nullable = true)
 |-- Inches: double (nullable = true)
 |-- ScreenResolution: string (nullable = true)
 |-- Cpu: string (nullable = true)
 |-- Ram: string (nullable = true)
 |-- Memory: string (nullable = true)
 |-- Gpu: string (nullable = true)
 |-- OpSys: string (nullable = true)
 |-- Weight: string (nullable = true)
 |-- Price_euros: double (nullable = true)



In [0]:
#Retrieve all information for laptops manufactured by a specific company i.e. Dell
s1 = df.filter(col('Company')=='Dell').display()

laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
14,Dell,Inspiron 3567,Notebook,15.6,Full HD 1920x1080,Intel Core i3 6006U 2GHz,4GB,256GB SSD,AMD Radeon R5 M430,Windows 10,2.2kg,498.9
17,Dell,Inspiron 3567,Notebook,15.6,Full HD 1920x1080,Intel Core i7 7500U 2.7GHz,8GB,256GB SSD,AMD Radeon R5 M430,Windows 10,2.2kg,745.0
20,Dell,XPS 13,Ultrabook,13.3,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i5 8250U 1.6GHz,8GB,128GB SSD,Intel UHD Graphics 620,Windows 10,1.22kg,979.0
24,Dell,Inspiron 5379,2 in 1 Convertible,13.3,Full HD / Touchscreen 1920x1080,Intel Core i5 8250U 1.6GHz,8GB,256GB SSD,Intel UHD Graphics 620,Windows 10,1.62kg,819.0
26,Dell,Inspiron 3567,Notebook,15.6,1366x768,Intel Core i3 6006U 2GHz,4GB,1TB HDD,Intel HD Graphics 520,Windows 10,2.3kg,418.64
28,Dell,Inspiron 5570,Notebook,15.6,Full HD 1920x1080,Intel Core i5 8250U 1.6GHz,8GB,256GB SSD,AMD Radeon 530,Windows 10,2.2kg,800.0
29,Dell,Latitude 5590,Ultrabook,15.6,Full HD 1920x1080,Intel Core i7 8650U 1.9GHz,8GB,256GB SSD + 256GB SSD,Intel UHD Graphics 620,Windows 10,1.88kg,1298.0
34,Dell,XPS 13,Ultrabook,13.3,Touchscreen / Quad HD+ 3200x1800,Intel Core i7 8550U 1.8GHz,16GB,512GB SSD,Intel UHD Graphics 620,Windows 10,1.2kg,1869.0
38,Dell,Inspiron 5770,Notebook,17.3,IPS Panel Full HD 1920x1080,Intel Core i5 8250U 1.6GHz,8GB,128GB SSD + 1TB HDD,AMD Radeon 530,Windows 10,2.8kg,979.0
42,Dell,Inspiron 7577,Gaming,15.6,IPS Panel Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,16GB,256GB SSD + 1TB HDD,Nvidia GeForce GTX 1060,Windows 10,2.65kg,1499.0


In [0]:
#Calculate the average price of laptops for each company
s2 = df.groupBy(col("Company")).agg(avg('Price_euros').alias("Avg Price")).display()

Company,Avg Price
Razer,3346.1428571428573
Fujitsu,729.0
Huawei,1424.0
Xiaomi,1133.4625
HP,1067.774854014598
Dell,1186.0689898989892
Vero,217.425
Acer,626.7758252427185
Asus,1104.1693670886075
Lenovo,1086.3844444444444


In [0]:
#Find laptops priced above 1500 euros
s3 = df.filter(col("Price_euros")>1500).select('laptop_ID','Company', 'Product').display()

laptop_ID,Company,Product
4,Apple,MacBook Pro
5,Apple,MacBook Pro
7,Apple,MacBook Pro
13,Apple,MacBook Pro
16,Apple,MacBook Pro
18,Apple,MacBook Pro
34,Dell,XPS 13
59,MSI,GS73VR 7RG
67,Asus,ZenBook Pro
83,Apple,"MacBook 12"""""


In [0]:
#Count the number of laptops for each operating system
s4 = df.groupBy(col('OpSys')).count().display()

OpSys,count
Windows 10,1072
Linux,62
macOS,13
Chrome OS,27
Android,2
Windows 10 S,8
Windows 7,45
Mac OS X,8
No OS,66


In [0]:
#Find laptops with a specific screen resolution
s5 = df.filter(col('ScreenResolution')=='1920x1080').select('Laptop_ID').display()

Laptop_ID
671
788
800


In [0]:
#Display laptops priced above 1500 euros, sorted by price in descending order
s6 = df.filter(col('Price_euros')>1500).orderBy(col("Price_euros").desc()).select('Laptop_ID','Company','Product').show()

+---------+-------+------------------+
|Laptop_ID|Company|           Product|
+---------+-------+------------------+
|      200|  Razer|         Blade Pro|
|      839|  Razer|         Blade Pro|
|      617| Lenovo|      Thinkpad P51|
|      758|     HP|          Zbook 17|
|     1081|   Asus|        ROG G701VO|
|     1151|     HP|          ZBook 17|
|      243|   Asus| ROG G703VI-E5062T|
|      731|   Dell|      Alienware 17|
|      789|   Dell|      Alienware 17|
|     1249|  Razer|         Blade Pro|
|      752| Lenovo|     Thinkpad P51s|
|     1096| Lenovo|IdeaPad Y900-17ISK|
|      969|   Dell|      Alienware 17|
|      982|   Dell|      Alienware 17|
|      667|   Dell|      Alienware 17|
|      924|     HP|   Elitebook Folio|
|      851|   Dell|      Alienware 17|
|      208|   Dell|    Precision 7520|
|      537|   Dell|      Alienware 17|
|      252|   Asus|Rog G701VIK-BA060T|
+---------+-------+------------------+
only showing top 20 rows



In [0]:
#Find laptops with Intel CPUs and more than 16GB of RAM
s7 = df.filter((col('Gpu').rlike('Intel')) & (col('Ram')=='12GB')).select('Product').show()

+--------------------+
|             Product|
+--------------------+
|       Inspiron 5567|
|SP315-51 (i7-7500...|
|      Thinkpad T460s|
|         ZenBook Pro|
|      Thinkpad T460s|
|  Ideapad 320-15IKBR|
|       Inspiron 3567|
|       Inspiron 7579|
|       Inspiron 7378|
|      ThinkPad T460s|
+--------------------+



In [0]:
#Find the top 5 most expensive laptops
s8 = df.groupBy(col('Product')).agg(sum(col("Price_euros"))).orderBy(
    sum(col('Price_euros')).desc()).limit(10).show()

+------------------+------------------+
|           Product|  sum(Price_euros)|
+------------------+------------------+
|            XPS 13|           49847.0|
|      Alienware 17|          43716.58|
|       ThinkPad X1|          27024.64|
|         Blade Pro|           20595.0|
|       MacBook Pro|20055.129999999997|
|Legion Y520-15IKBN|           19837.0|
|            XPS 15|           19010.1|
|     EliteBook 840|17842.370000000003|
|     Inspiron 5570|16231.949999999999|
|     Inspiron 3567|          16197.94|
+------------------+------------------+



In [0]:
#Display laptops with Windows OS and a screen size greater than 13 inches
s9 = df.filter((col('OpSys')=='Windows 10') &
               (col('Inches')>13)).select('Laptop_ID').show()

+---------+
|Laptop_ID|
+---------+
|        6|
|        9|
|       10|
|       14|
|       17|
|       20|
|       22|
|       24|
|       25|
|       26|
|       28|
|       29|
|       30|
|       31|
|       32|
|       33|
|       34|
|       36|
|       38|
|       39|
+---------+
only showing top 20 rows

