# Spark DataFrame Basics

In [1]:
# Testing pyspark installation
import findspark
# init() is called before `import pyspark` — presumably so that pyspark can be
# located on this machine; keep this ordering. TODO confirm against findspark docs.
findspark.init()
import pyspark
# As the cell's last expression, find() displays the Spark installation
# directory that was detected (see the output below).
findspark.find()

'C:\\Program Files\\Spark\\spark-3.2.0-bin-hadoop3.2'

In [2]:
# Initiate Spark Context
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Application name 'Basics', running against the 'local' master.
conf = SparkConf().setAppName('Basics').setMaster('local')

# getOrCreate() reuses a context that is already running, so re-running this
# cell does not fail with "Cannot run multiple SparkContexts at once" the way
# calling the SparkContext constructor a second time does.
sc = SparkContext.getOrCreate(conf=conf)

# The builder attaches to the context obtained above (or returns the session
# that already exists), which keeps the cell idempotent.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [3]:
# Alternative: start a session without an explicit SparkConf (use instead of the cell above)
# spark = SparkSession.builder.appName('Basics').getOrCreate()

In [23]:
# Forward slashes replace the backslash form, whose "\c" and "\m" are invalid
# escape sequences in a normal string literal, and keep the relative path
# usable on non-Windows machines as well.
car_models_path = 'Datasets/cars/modelo_carro.csv'

# Comma-delimited file whose first line holds the column names. No schema is
# supplied here, so the columns are read as strings (see printSchema below).
car_models_df = (
    spark.read
    .option("delimiter", ",")
    .option("header", "true")
    .csv(car_models_path)
)

In [24]:
# Preview the loaded CSV as a text table (the first 20 rows are printed).
car_models_df.show()

+--------+-------------+--------+---------+
|id_carro| modelo_carro|   preco|cod_marca|
+--------+-------------+--------+---------+
|       1|       Avalon|78401.95|       54|
|       2|          RDX|95987.38|        1|
|       3|         Golf|61274.55|       55|
|       4|           EX|84981.12|       23|
|       5|       Escort|77466.89|       17|
|       6|   Expedition|84698.71|       17|
|       7|      Voyager|95567.75|       42|
|       8|        Civic|84749.22|       20|
|       9|     Defender|98600.79|       29|
|      10| V8 Vantage S|94791.61|        2|
|      11|          C70|97874.76|       56|
|      12|G-Series 1500|71638.24|       10|
|      13|       Legacy|95850.12|       52|
|      14|          DB9|86707.30|        2|
|      15|     Mulsanne|70453.70|        6|
|      16|           RX|46752.60|       30|
|      17|       Rabbit|78048.08|       55|
|      18|            Q|65193.95|       23|
|      19|          S60|65396.98|       56|
|      20|        Envoy|68994.97

In [25]:
# Inspect the column types: the reader was given no schema, and all four
# columns come back as nullable strings.
car_models_df.printSchema()

root
 |-- id_carro: string (nullable = true)
 |-- modelo_carro: string (nullable = true)
 |-- preco: string (nullable = true)
 |-- cod_marca: string (nullable = true)



In [7]:
# Column names as a plain Python list of strings.
car_models_df.columns

['id_carro', 'modelo_carro', 'preco', 'cod_marca']

In [8]:
# describe() only builds a summary DataFrame: the output below is that
# DataFrame's repr, not the statistics themselves. Chain .show() to print them.
car_models_df.describe()

DataFrame[summary: string, id_carro: string, modelo_carro: string, preco: string, cod_marca: string]

In [9]:
from pyspark.sql.types import (StructField, StringType, 
                               IntegerType, StructType, FloatType)

In [26]:
# Explicit column definitions as (name, Spark type) pairs; every field is
# declared nullable (third StructField argument is True).
data_schema = [
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ('id_carro', IntegerType()),
        ('modelo_carro', StringType()),
        ('preco', FloatType()),
        ('cod_marca', IntegerType()),
    )
]

In [27]:
# Wrap the list of fields in a StructType; this object is what gets passed
# as `schema=` to the CSV reader in the next cell.
final_struc = StructType(fields=data_schema)

In [28]:
# Re-read the same CSV with the explicit schema so id_carro / preco / cod_marca
# get numeric types instead of strings.
# header=True is required: the file's first line holds the column names, and
# without this option that line is parsed as a data record under the schema.
df = spark.read.csv(car_models_path, schema=final_struc, header=True)

In [29]:
# Confirm the explicit schema took effect: integer / string / float / integer.
df.printSchema()

root
 |-- id_carro: integer (nullable = true)
 |-- modelo_carro: string (nullable = true)
 |-- preco: float (nullable = true)
 |-- cod_marca: integer (nullable = true)



In [30]:
# select() with a column name returns a DataFrame (here with that one column).
type(car_models_df.select('cod_marca'))

pyspark.sql.dataframe.DataFrame

In [31]:
# Attribute access on the DataFrame yields a Column object, not a DataFrame.
type(car_models_df.cod_marca)

pyspark.sql.column.Column

In [32]:
# head(2) is indexable; its first element is a Row object.
type(car_models_df.head(2)[0])

pyspark.sql.types.Row

In [33]:
# Project only the price and brand-code columns and print the first rows.
car_models_df.select('preco', 'cod_marca').show()

+--------+---------+
|   preco|cod_marca|
+--------+---------+
|78401.95|       54|
|95987.38|        1|
|61274.55|       55|
|84981.12|       23|
|77466.89|       17|
|84698.71|       17|
|95567.75|       42|
|84749.22|       20|
|98600.79|       29|
|94791.61|        2|
|97874.76|       56|
|71638.24|       10|
|95850.12|       52|
|86707.30|        2|
|70453.70|        6|
|46752.60|       30|
|78048.08|       55|
|65193.95|       23|
|65396.98|       56|
|68994.97|       18|
+--------+---------+
only showing top 20 rows



In [34]:
# Display a derived column holding twice the price. The result is not
# assigned, so car_models_df itself keeps its original four columns.
(
    car_models_df
    .withColumn('double_preco', car_models_df.preco * 2)
    .show()
)

+--------+-------------+--------+---------+------------+
|id_carro| modelo_carro|   preco|cod_marca|double_preco|
+--------+-------------+--------+---------+------------+
|       1|       Avalon|78401.95|       54|    156803.9|
|       2|          RDX|95987.38|        1|   191974.76|
|       3|         Golf|61274.55|       55|    122549.1|
|       4|           EX|84981.12|       23|   169962.24|
|       5|       Escort|77466.89|       17|   154933.78|
|       6|   Expedition|84698.71|       17|   169397.42|
|       7|      Voyager|95567.75|       42|    191135.5|
|       8|        Civic|84749.22|       20|   169498.44|
|       9|     Defender|98600.79|       29|   197201.58|
|      10| V8 Vantage S|94791.61|        2|   189583.22|
|      11|          C70|97874.76|       56|   195749.52|
|      12|G-Series 1500|71638.24|       10|   143276.48|
|      13|       Legacy|95850.12|       52|   191700.24|
|      14|          DB9|86707.30|        2|    173414.6|
|      15|     Mulsanne|70453.7

In [35]:
# Show the original DataFrame again: the earlier withColumn call did not
# modify it, so there is no double_preco column in this output.
car_models_df.show()

+--------+-------------+--------+---------+
|id_carro| modelo_carro|   preco|cod_marca|
+--------+-------------+--------+---------+
|       1|       Avalon|78401.95|       54|
|       2|          RDX|95987.38|        1|
|       3|         Golf|61274.55|       55|
|       4|           EX|84981.12|       23|
|       5|       Escort|77466.89|       17|
|       6|   Expedition|84698.71|       17|
|       7|      Voyager|95567.75|       42|
|       8|        Civic|84749.22|       20|
|       9|     Defender|98600.79|       29|
|      10| V8 Vantage S|94791.61|        2|
|      11|          C70|97874.76|       56|
|      12|G-Series 1500|71638.24|       10|
|      13|       Legacy|95850.12|       52|
|      14|          DB9|86707.30|        2|
|      15|     Mulsanne|70453.70|        6|
|      16|           RX|46752.60|       30|
|      17|       Rabbit|78048.08|       55|
|      18|            Q|65193.95|       23|
|      19|          S60|65396.98|       56|
|      20|        Envoy|68994.97

In [36]:
# Display the DataFrame with 'preco' renamed to 'my_new_preco'. The renamed
# result is only shown, not assigned back to car_models_df.
car_models_df.withColumnRenamed('preco', 'my_new_preco').show()

+--------+-------------+------------+---------+
|id_carro| modelo_carro|my_new_preco|cod_marca|
+--------+-------------+------------+---------+
|       1|       Avalon|    78401.95|       54|
|       2|          RDX|    95987.38|        1|
|       3|         Golf|    61274.55|       55|
|       4|           EX|    84981.12|       23|
|       5|       Escort|    77466.89|       17|
|       6|   Expedition|    84698.71|       17|
|       7|      Voyager|    95567.75|       42|
|       8|        Civic|    84749.22|       20|
|       9|     Defender|    98600.79|       29|
|      10| V8 Vantage S|    94791.61|        2|
|      11|          C70|    97874.76|       56|
|      12|G-Series 1500|    71638.24|       10|
|      13|       Legacy|    95850.12|       52|
|      14|          DB9|    86707.30|        2|
|      15|     Mulsanne|    70453.70|        6|
|      16|           RX|    46752.60|       30|
|      17|       Rabbit|    78048.08|       55|
|      18|            Q|    65193.95|   

In [37]:
# Register the DataFrame under the name 'car' so the spark.sql() queries in
# the following cells can refer to it as a table.
car_models_df.createOrReplaceTempView('car')

In [38]:
# Query the 'car' view with SQL; the result is bound to `results` and
# displayed in a later cell.
results = spark.sql("SELECT * FROM car")

In [47]:
# Print the SQL query result; it has the same four columns as car_models_df.
results.show()

+--------+-------------+--------+---------+
|id_carro| modelo_carro|   preco|cod_marca|
+--------+-------------+--------+---------+
|       1|       Avalon|78401.95|       54|
|       2|          RDX|95987.38|        1|
|       3|         Golf|61274.55|       55|
|       4|           EX|84981.12|       23|
|       5|       Escort|77466.89|       17|
|       6|   Expedition|84698.71|       17|
|       7|      Voyager|95567.75|       42|
|       8|        Civic|84749.22|       20|
|       9|     Defender|98600.79|       29|
|      10| V8 Vantage S|94791.61|        2|
|      11|          C70|97874.76|       56|
|      12|G-Series 1500|71638.24|       10|
|      13|       Legacy|95850.12|       52|
|      14|          DB9|86707.30|        2|
|      15|     Mulsanne|70453.70|        6|
|      16|           RX|46752.60|       30|
|      17|       Rabbit|78048.08|       55|
|      18|            Q|65193.95|       23|
|      19|          S60|65396.98|       56|
|      20|        Envoy|68994.97

In [40]:
# Filter the 'car' view to rows whose cod_marca equals 15.
# NOTE(review): cod_marca is a string column in car_models_df (see its
# printSchema output); the comparison with the integer literal still matches,
# as the result below shows.
new_results = spark.sql("SELECT * FROM car WHERE cod_marca=15")

In [41]:
# Print the filtered result; a single row matches in this dataset.
new_results.show()

+--------+------------+--------+---------+
|id_carro|modelo_carro|   preco|cod_marca|
+--------+------------+--------+---------+
|     319|      Vision|80349.11|       15|
+--------+------------+--------+---------+

