# Aula 3 - Consultas e Seleções

In [16]:
import pyspark
from pyspark.sql import SparkSession
# Create (or reuse, if one already exists) a Spark session whose master is
# "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [17]:
# Spark SQL: queries and selections.
# Sanity check: select a string literal and display the one-row result.
df = spark.sql("select 'OK' as Status")
df.show()

+------+
|Status|
+------+
|    OK|
+------+



In [18]:
# Importing data.
# The CSV is downloaded over HTTP and parsed with pandas. If the file were
# available locally, it could be read directly with Spark instead:
#   df = spark.read.csv('cereal.csv', sep=',', inferSchema=True, header=True)
import io

import pandas as pd
import requests

url = "https://raw.githubusercontent.com/SandraRojasZ/Pos_Tech_Data_Analytics/main/Base_de_Dados/cereal.csv"

# Always pass a timeout: without one, requests waits indefinitely when the
# server stops responding and the cell never finishes.
response = requests.get(url, timeout=30)
response.raise_for_status()  # Raise an exception for 4xx/5xx status codes

# Parse the downloaded CSV text into a pandas DataFrame.
data = response.text
df_pandas = pd.read_csv(io.StringIO(data))

In [19]:
# Convert the pandas DataFrame into a Spark DataFrame.
df = spark.createDataFrame(df_pandas)

# Report the size of the frame: row count, column count, and column names.
row_total = df.count()
column_names = df.columns
print('df.count :', row_total)
print('df.col ct :', len(column_names))
print('df.columns:', column_names)

df.count : 77
df.col ct : 16
df.columns: ['name', 'mfr', 'type', 'calories', 'protein', 'fat', 'sodium', 'fiber', 'carbo', 'sugars', 'potass', 'vitamins', 'shelf', 'weight', 'cups', 'rating']


In [20]:
# Preview the first three rows of the Spark DataFrame.
df.show(n=3)

+-----------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|             name|mfr|type|calories|protein|fat|sodium|fiber|carbo|sugars|potass|vitamins|shelf|weight|cups|   rating|
+-----------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|        100% Bran|  N|   C|      70|      4|  1|   130| 10.0|  5.0|     6|   280|      25|    3|   1.0|0.33|68.402973|
|100% Natural Bran|  Q|   C|     120|      3|  5|    15|  2.0|  8.0|     8|   135|       0|    3|   1.0| 1.0|33.983679|
|         All-Bran|  K|   C|      70|      4|  1|   260|  9.0|  7.0|     5|   320|      25|    3|   1.0|0.33|59.425505|
+-----------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
only showing top 3 rows



In [21]:
# Manipulating data with Spark SQL.
# Register df as a temporary view named "cereal" so that it can be referenced
# by name inside spark.sql(...) queries.
df.createOrReplaceTempView(name="cereal")

In [27]:
# Count the rows of the "cereal" view where mfr = 'G', using SQL's COUNT(*).
cereal = spark.sql("SELECT COUNT(*) AS total FROM cereal WHERE mfr = 'G' ")
# The aggregate produces a single row holding the total, which show() prints.
# (cereal.count() would instead count the rows of this aggregated result.)
cereal.show(n=3)

+-----+
|total|
+-----+
|   22|
+-----+



In [28]:
# Same filter as the SQL query, expressed with the DataFrame API.
# Build the filtered frame once and reuse it for the preview and the count.
mfr_g_rows = df.filter(df['mfr'] == 'G')
mfr_g_rows.show(n=3)
# Last expression of the cell, so the row count is echoed as the cell output.
mfr_g_rows.count()

+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|                name|mfr|type|calories|protein|fat|sodium|fiber|carbo|sugars|potass|vitamins|shelf|weight|cups|   rating|
+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|Apple Cinnamon Ch...|  G|   C|     110|      2|  2|   180|  1.5| 10.5|    10|    70|      25|    1|   1.0|0.75|29.509541|
|             Basic 4|  G|   C|     130|      3|  2|   210|  2.0| 18.0|     8|   100|      25|    3|  1.33|0.75|37.038562|
|            Cheerios|  G|   C|     110|      6|  2|   290|  2.0| 17.0|     1|   105|      25|    1|   1.0|1.25|50.764999|
+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
only showing top 3 rows



22

In [30]:
# Show every column of the table: printSchema() lists each column's name,
# data type, and nullability.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- mfr: string (nullable = true)
 |-- type: string (nullable = true)
 |-- calories: long (nullable = true)
 |-- protein: long (nullable = true)
 |-- fat: long (nullable = true)
 |-- sodium: long (nullable = true)
 |-- fiber: double (nullable = true)
 |-- carbo: double (nullable = true)
 |-- sugars: long (nullable = true)
 |-- potass: long (nullable = true)
 |-- vitamins: long (nullable = true)
 |-- shelf: long (nullable = true)
 |-- weight: double (nullable = true)
 |-- cups: double (nullable = true)
 |-- rating: double (nullable = true)



In [31]:
# Register df as the temporary view "cereal" again. The same call already
# appears in an earlier cell and df is not reassigned in between (in the cells
# shown), so this is redundant; it is harmless because the call replaces any
# existing view with that name.
df.createOrReplaceTempView("cereal")

In [33]:
# SELECT in Spark SQL.
# DISTINCT removes duplicates: each (name, type, mfr) combination is returned
# only once.
cereal = spark.sql("SELECT DISTINCT name, type, mfr FROM cereal")
cereal.show(n=3)

+-------------------+----+---+
|               name|type|mfr|
+-------------------+----+---+
|Frosted Mini-Wheats|   C|  K|
|      Count Chocula|   C|  G|
|            Crispix|   C|  K|
+-------------------+----+---+
only showing top 3 rows



In [37]:
# DISTINCT again, now over fewer columns: one row per (type, mfr) combination.
cereal = spark.sql("SELECT DISTINCT type, mfr FROM cereal")
cereal.show()
# Last expression of the cell: echoes how many distinct combinations exist.
cereal.count()

+----+---+
|type|mfr|
+----+---+
|   C|  P|
|   C|  Q|
|   C|  N|
|   C|  R|
|   H|  N|
|   C|  G|
|   C|  K|
|   H|  Q|
|   H|  A|
+----+---+



9

In [42]:
# WHERE in Spark SQL: keep only the rows with mfr = 'K' and calories >= 100.
cereal = spark.sql('''SELECT * FROM cereal WHERE mfr = 'K' AND calories >= 100 ''')
# No bare cereal.count() before show(): only the last expression of a cell is
# echoed, so a count placed here would run a full Spark job and discard the
# number. To see the row count, make cereal.count() the last line of a cell
# or wrap it in print().
cereal.show()

+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|                name|mfr|type|calories|protein|fat|sodium|fiber|carbo|sugars|potass|vitamins|shelf|weight|cups|   rating|
+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|         Apple Jacks|  K|   C|     110|      2|  0|   125|  1.0| 11.0|    14|    30|      25|    2|   1.0| 1.0|33.174094|
|         Corn Flakes|  K|   C|     100|      2|  0|   290|  1.0| 21.0|     2|    35|      25|    1|   1.0| 1.0|45.863324|
|           Corn Pops|  K|   C|     110|      1|  0|    90|  1.0| 13.0|    12|    20|      25|    2|   1.0| 1.0|35.782791|
|  Cracklin' Oat Bran|  K|   C|     110|      3|  3|   140|  4.0| 10.0|     7|   160|      25|    3|   1.0| 0.5|40.448772|
|             Crispix|  K|   C|     110|      2|  0|   220|  1.0| 21.0|     3|    30|      25|    3|   1.0| 1.0|46.895644|
|         Froot 

In [49]:
# GROUP BY in Spark SQL.
# COUNT(*) and SUM(calories) are SQL aggregate functions: they are evaluated
# once per (mfr, type) group, giving the number of rows and the summed
# calories of each group.
cereal = spark.sql(''' SELECT mfr, \
                              type, \
                              COUNT(*) AS total,
                              SUM(calories) AS Total_Calories
                        FROM cereal
                        GROUP BY \
                              mfr, \
                              type ''')
# DataFrame.count() is a different thing from SQL's COUNT(*): it is a Spark
# DataFrame action that returns the number of rows of the result (here, the
# number of groups). It is not called before show() because only the last
# expression of a cell is echoed, so its value would be computed and thrown
# away; use print(cereal.count()) to display it.
cereal.show()

+---+----+-----+--------------+
|mfr|type|total|Total_Calories|
+---+----+-----+--------------+
|  P|   C|    9|           980|
|  K|   C|   23|          2500|
|  G|   C|   22|          2450|
|  Q|   C|    7|           660|
|  R|   C|    8|           920|
|  N|   H|    1|           100|
|  N|   C|    5|           420|
|  A|   H|    1|           100|
|  Q|   H|    1|           100|
+---+----+-----+--------------+



In [48]:
# List each distinct value found in the `type` column.
distinct_type_sql = ''' SELECT DISTINCT type
                       FROM cereal '''
cereal = spark.sql(distinct_type_sql)
cereal.show()

+----+
|type|
+----+
|   C|
|   H|
+----+



In [53]:
# CASE WHEN in Spark SQL.
# type_new recodes the `type` column: 'C' becomes 'A', 'H' becomes 'B', and
# any other value becomes 'C'. Because type_new is computed only from `type`,
# which is a grouping column, it can be selected without being added to the
# GROUP BY clause.
cereal = spark.sql(''' SELECT mfr, \
                              type, \
                              (case
                              -- alterando as letras
                                    when type = 'C' then 'A'
                                    when type = 'H' then 'B'
                                    else 'C'
                              end) as type_new,
                              COUNT(*) AS total,
                              SUM(calories) AS Total_Calories
                        FROM cereal
                        GROUP BY \
                              mfr, \
                              type ''')
# DataFrame.count() is a Spark DataFrame action (not SQL's COUNT(*)) that
# returns the number of rows of the result. It is not called before show()
# because only the last expression of a cell is echoed, so its value would be
# computed and thrown away; use print(cereal.count()) to display it.
cereal.show()

+---+----+--------+-----+--------------+
|mfr|type|type_new|total|Total_Calories|
+---+----+--------+-----+--------------+
|  P|   C|       A|    9|           980|
|  K|   C|       A|   23|          2500|
|  G|   C|       A|   22|          2450|
|  Q|   C|       A|    7|           660|
|  R|   C|       A|    8|           920|
|  N|   H|       B|    1|           100|
|  N|   C|       A|    5|           420|
|  A|   H|       B|    1|           100|
|  Q|   H|       B|    1|           100|
+---+----+--------+-----+--------------+

