<a href="https://colab.research.google.com/github/RVegh/estudos-engenharia-dados/blob/master/pyspark_basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# install the dependencies
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.1.3/spark-3.1.3-bin-hadoop3.2.tgz
!tar xf spark-3.1.3-bin-hadoop3.2.tgz
!pip -q install findspark

In [None]:
#Setting up variables and finding Spark with findspark module
import os
import findspark

# JDK location; presumably where the openjdk-11 apt package is installed on the Colab image.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
# Directory produced by extracting the Spark tarball (assumes the notebook's working directory is /content).
os.environ["SPARK_HOME"] = "/content/spark-3.1.3-bin-hadoop3.2"
# Extra arguments handed to spark-submit when PySpark starts: pull the Delta Lake
# package and register its SQL extension and catalog.
# Delta 0.7.x is built for Spark 3.0.x; Spark 3.1.x needs the Delta 1.0.x line.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages io.delta:delta-core_2.12:1.0.1 --conf spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog pyspark-shell'
# Make the pyspark package under SPARK_HOME importable from this kernel.
findspark.init()

In [None]:
#Creating SparkSession
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.functions import *

# Build the session named 'Spark Training'; getOrCreate() returns the
# already-running session if there is one instead of starting a second.
spark = (
    SparkSession.builder
    .appName('Spark Training')
    .getOrCreate()
)

In [None]:
path = '/content/bakery_sales.csv'

# A DataFrame could also be created with read.format(), the read.csv / read.json
# shortcuts, createDataFrame, or by supplying an explicit schema.
#
# Example using the csv shortcut:
#   df_bakery = (
#       spark.read.option("inferSchema", True)
#                 .option("header", True)
#                 .csv(path)
#   )

# Load the bakery sales CSV. No backslashes are needed: the enclosing
# parentheses already allow the chain to span several lines.
df_bakery = (
    spark.read.format("csv")
    .option("inferSchema", True)  # let Spark guess column types from the data
    .option("delimiter", ",")
    .option("header", True)       # first line holds the column names
    .load(path)
)


In [None]:
# Print the schema as a tree: one line per column with its inferred type and nullability
df_bakery.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- date: string (nullable = true)
 |-- time: string (nullable = true)
 |-- ticket_number: double (nullable = true)
 |-- article: string (nullable = true)
 |-- Quantity: double (nullable = true)
 |-- unit_price: string (nullable = true)



In [None]:
# Print the first rows of the DataFrame as a plain-text table
# (the display(df_bakery) alternative is left disabled)
df_bakery.show()

+---+----------+-----+-------------+--------------------+--------+----------+
|_c0|      date| time|ticket_number|             article|Quantity|unit_price|
+---+----------+-----+-------------+--------------------+--------+----------+
|  0|2021-01-02|08:38|     150040.0|            BAGUETTE|     1.0|    0,90 €|
|  1|2021-01-02|08:38|     150040.0|    PAIN AU CHOCOLAT|     3.0|    1,20 €|
|  4|2021-01-02|09:14|     150041.0|    PAIN AU CHOCOLAT|     2.0|    1,20 €|
|  5|2021-01-02|09:14|     150041.0|                PAIN|     1.0|    1,15 €|
|  8|2021-01-02|09:25|     150042.0|TRADITIONAL BAGUETTE|     5.0|    1,20 €|
| 11|2021-01-02|09:25|     150043.0|            BAGUETTE|     2.0|    0,90 €|
| 12|2021-01-02|09:25|     150043.0|           CROISSANT|     3.0|    1,10 €|
| 15|2021-01-02|09:27|     150044.0|             BANETTE|     1.0|    1,05 €|
| 18|2021-01-02|09:32|     150045.0|TRADITIONAL BAGUETTE|     3.0|    1,20 €|
| 19|2021-01-02|09:32|     150045.0|           CROISSANT|     6.

In [None]:
# Basic summary statistics for every column.
# describe() only returns a new DataFrame (the cell would otherwise display just
# its DataFrame[...] repr), so call show() to actually compute and print the values.
df_bakery.describe().show()

DataFrame[summary: string, _c0: string, date: string, time: string, ticket_number: string, article: string, Quantity: string, unit_price: string]

In [None]:
# Fetch the first 5 rows; head(n) returns them as a list of Row objects rather than printing a table
df_bakery.head(5)

[Row(_c0=0, date='2021-01-02', time='08:38', ticket_number=150040.0, article='BAGUETTE', Quantity=1.0, unit_price='0,90 €'),
 Row(_c0=1, date='2021-01-02', time='08:38', ticket_number=150040.0, article='PAIN AU CHOCOLAT', Quantity=3.0, unit_price='1,20 €'),
 Row(_c0=4, date='2021-01-02', time='09:14', ticket_number=150041.0, article='PAIN AU CHOCOLAT', Quantity=2.0, unit_price='1,20 €'),
 Row(_c0=5, date='2021-01-02', time='09:14', ticket_number=150041.0, article='PAIN', Quantity=1.0, unit_price='1,15 €'),
 Row(_c0=8, date='2021-01-02', time='09:25', ticket_number=150042.0, article='TRADITIONAL BAGUETTE', Quantity=5.0, unit_price='1,20 €')]

In [None]:
# Count the total number of rows in the DataFrame
df_bakery.count()

33951

In [None]:
# Select a subset of columns by name and print the first rows.
# Alternative form using Column objects instead of plain strings:
#   df_bakery.select(col('date'), col('time')).show()
df_bakery.select('date', 'time').show()

+----------+-----+
|      date| time|
+----------+-----+
|2021-01-02|08:38|
|2021-01-02|08:38|
|2021-01-02|09:14|
|2021-01-02|09:14|
|2021-01-02|09:25|
|2021-01-02|09:25|
|2021-01-02|09:25|
|2021-01-02|09:27|
|2021-01-02|09:32|
|2021-01-02|09:32|
|2021-01-02|09:37|
|2021-01-02|09:37|
|2021-01-02|09:37|
|2021-01-02|09:39|
|2021-01-02|09:40|
|2021-01-02|09:40|
|2021-01-02|09:41|
|2021-01-02|09:46|
|2021-01-02|09:48|
|2021-01-02|09:48|
+----------+-----+
only showing top 20 rows

+----------+-----+
|      date| time|
+----------+-----+
|2021-01-02|08:38|
|2021-01-02|08:38|
|2021-01-02|09:14|
|2021-01-02|09:14|
|2021-01-02|09:25|
|2021-01-02|09:25|
|2021-01-02|09:25|
|2021-01-02|09:27|
|2021-01-02|09:32|
|2021-01-02|09:32|
|2021-01-02|09:37|
|2021-01-02|09:37|
|2021-01-02|09:37|
|2021-01-02|09:39|
|2021-01-02|09:40|
|2021-01-02|09:40|
|2021-01-02|09:41|
|2021-01-02|09:46|
|2021-01-02|09:48|
|2021-01-02|09:48|
+----------+-----+
only showing top 20 rows



In [None]:
# Count how many distinct ticket_number values appear in the data
df_bakery.select('ticket_number').distinct().count()

19919