# Getting Started with Spark

### Reading the Titanic dataset

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

In [2]:
# Start the Spark session for this notebook (or reuse one that is already
# running) under the application name "TitanicData".
spark = (
    SparkSession
    .builder
    .appName("TitanicData")
    .getOrCreate()
)

24/01/31 09:25:00 WARN Utils: Your hostname, Neylsons-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.3.25 instead (on interface en0)
24/01/31 09:25:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/31 09:25:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Set the driver's log level to ERROR so that WARN-level messages, like the
# ones printed when the session was created, stop cluttering cell outputs.
spark.sparkContext.setLogLevel("ERROR")

In [5]:
# Load the Titanic CSV. The file is semicolon-separated and has a header row;
# column types are inferred from the data instead of being declared up front.
titanic = spark.read.csv(
    'data/titanic/titanic.csv',
    sep=";",
    header=True,
    inferSchema=True,
)

In [6]:
# Print the column names, data types and nullability of the loaded DataFrame
# (types come from `inferSchema=True` on the read above).
titanic.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [7]:
# Male passengers strictly older than 21. Rows with a missing Age or Sex are
# dropped as well, because a null comparison never evaluates to true.
filtered = titanic.where(
    (f.col("Age") > 21) & (f.col("Sex") == "male")
)

In [8]:
# Preview the filtered rows: first 20 rows, long string values truncated
# (these are the defaults of `show`, spelled out for the reader).
filtered.show(n=20, truncate=True)

+-----------+--------+------+--------------------+----+----+-----+-----+----------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name| Sex| Age|SibSp|Parch|    Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+----+----+-----+-----+----------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|male|22.0|    1|    0| A/5 21171|   7.25| null|       S|
|          5|       0|     3|Allen, Mr. Willia...|male|35.0|    0|    0|    373450|   8.05| null|       S|
|          7|       0|     1|McCarthy, Mr. Tim...|male|54.0|    0|    0|     17463|51.8625|  E46|       S|
|         14|       0|     3|Andersson, Mr. An...|male|39.0|    1|    5|    347082| 31.275| null|       S|
|         21|       0|     2|Fynney, Mr. Joseph J|male|35.0|    0|    0|    239865|   26.0| null|       S|
|         22|       1|     2|Beesley, Mr. Lawr...|male|34.0|    0|    0|    248698|   13.0|  D56|       S|
|         24|       1|     1|Sloper, 

In [9]:
# DataFrame-API version of the question "how many men over 21 survived in
# each passenger class?". `Survived` is 0/1, so summing it counts survivors.
queryp = (
    titanic
    .where((f.col("Sex") == "male") & (f.col("Age") > 21))
    .groupBy("Pclass")
    .agg(f.sum("Survived").alias("Survivors"))
)

In [10]:
# Register the DataFrame as a temporary view so it can be referenced by name
# from Spark SQL.
titanic.createOrReplaceTempView("titanic")

# SQL version of the survivors-per-class query: male passengers older than 21,
# grouped by passenger class, summing the 0/1 `Survived` flag.
querysql = spark.sql(
    """
    SELECT 
        Pclass,
        sum(Survived) as Survivors
    FROM titanic
    WHERE
        Sex = 'male'
        AND Age > 21
    GROUP BY Pclass
"""
)

In [11]:
# Print the physical plan of the DataFrame-API query in "formatted" mode
# (operator tree first, then per-operator details).
queryp.explain(mode="formatted")

== Physical Plan ==
AdaptiveSparkPlan (7)
+- HashAggregate (6)
   +- Exchange (5)
      +- HashAggregate (4)
         +- Project (3)
            +- Filter (2)
               +- Scan csv  (1)


(1) Scan csv 
Output [4]: [Survived#18, Pclass#19, Sex#21, Age#22]
Batched: false
Location: InMemoryFileIndex [file:/Users/neylsoncrepalde/Bigdata-on-Kubernetes/Chapter 5/data/titanic/titanic.csv]
PushedFilters: [IsNotNull(Sex), IsNotNull(Age), EqualTo(Sex,male), GreaterThan(Age,21.0)]
ReadSchema: struct<Survived:int,Pclass:int,Sex:string,Age:double>

(2) Filter
Input [4]: [Survived#18, Pclass#19, Sex#21, Age#22]
Condition : (((isnotnull(Sex#21) AND isnotnull(Age#22)) AND (Sex#21 = male)) AND (Age#22 > 21.0))

(3) Project
Output [2]: [Survived#18, Pclass#19]
Input [4]: [Survived#18, Pclass#19, Sex#21, Age#22]

(4) HashAggregate
Input [2]: [Survived#18, Pclass#19]
Keys [1]: [Pclass#19]
Functions [1]: [partial_sum(Survived#18)]
Aggregate Attributes [1]: [sum#123L]
Results [2]: [Pclass#19, sum#124L]

In [12]:
# Print the physical plan of the SQL query in "formatted" mode so it can be
# compared operator by operator with the DataFrame-API plan.
querysql.explain(mode="formatted")

== Physical Plan ==
AdaptiveSparkPlan (7)
+- HashAggregate (6)
   +- Exchange (5)
      +- HashAggregate (4)
         +- Project (3)
            +- Filter (2)
               +- Scan csv  (1)


(1) Scan csv 
Output [4]: [Survived#18, Pclass#19, Sex#21, Age#22]
Batched: false
Location: InMemoryFileIndex [file:/Users/neylsoncrepalde/Bigdata-on-Kubernetes/Chapter 5/data/titanic/titanic.csv]
PushedFilters: [IsNotNull(Sex), IsNotNull(Age), EqualTo(Sex,male), GreaterThan(Age,21.0)]
ReadSchema: struct<Survived:int,Pclass:int,Sex:string,Age:double>

(2) Filter
Input [4]: [Survived#18, Pclass#19, Sex#21, Age#22]
Condition : (((isnotnull(Sex#21) AND isnotnull(Age#22)) AND (Sex#21 = male)) AND (Age#22 > 21.0))

(3) Project
Output [2]: [Survived#18, Pclass#19]
Input [4]: [Survived#18, Pclass#19, Sex#21, Age#22]

(4) HashAggregate
Input [2]: [Survived#18, Pclass#19]
Keys [1]: [Pclass#19]
Functions [1]: [partial_sum(Survived#18)]
Aggregate Attributes [1]: [sum#125L]
Results [2]: [Pclass#19, sum#126L]

In [13]:
# Run the DataFrame-API query and print the survivor count per passenger class
# (default display settings written out explicitly).
queryp.show(n=20, truncate=True)

+------+---------+
|Pclass|Survivors|
+------+---------+
|     1|       36|
|     3|       22|
|     2|        5|
+------+---------+



In [14]:
# Run the SQL query and print the survivor count per passenger class
# (default display settings written out explicitly).
querysql.show(n=20, truncate=True)

+------+---------+
|Pclass|Survivors|
+------+---------+
|     1|       36|
|     3|       22|
|     2|        5|
+------+---------+



In [15]:
# Preview the unfiltered dataset: first 20 rows, long string values truncated
# (the defaults of `show`, spelled out for the reader).
titanic.show(n=20, truncate=True)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      