# Introduction to Spark
* Spark is available either as a standalone installation or packaged with other offerings such as Hadoop
* Please follow the instructions to install it for your environment
* These examples are run on the Databricks managed platform (Databricks was founded by the original creators of Spark), so no extra installation is necessary there
* Some commands, like display(), are convenience functions that only exist on Databricks; you can always substitute them with show()
### Pre-requisites
* https://spark.apache.org/downloads.html
* https://www.tutorialspoint.com/apache_spark/apache_spark_installation.htm
* pip install pyspark
### Example References
* https://sparkbyexamples.com/pyspark-tutorial/

In [0]:
# `spark` is the session object used throughout this notebook as the entry point to Spark
# functionality (DataFrames, SQL, reading and writing tables).
# The lower-level Spark Context, which connects the application to the cluster, is reachable
# from it as `spark.sparkContext`.
# NOTE(review): no cell here creates `spark`; presumably the platform provides it - confirm.
spark

### Spark Data types
* https://spark.apache.org/docs/latest/sql-ref-datatypes.html

In [0]:
# dbutils is a Databricks utility object; it is not defined in this notebook.
# Remove /tmp/ch1 so that the write cells below start from an empty directory.
# NOTE(review): the second argument (True) presumably requests a recursive delete - confirm.
dbutils.fs.rm('/tmp/ch1', True)

In [0]:
# Build a small demo DataFrame of (State, Name, Age) rows.
columns = ["State", "Name", "Age"]
data = [
    ("TX", "Jack", 25),
    ("NV", "Jane", 66),
    ("CO", "Bill", 79),
    ("CA", "Tom", 53),
    ("WY", "Shawn", 45),
]

# createDataFrame builds the DataFrame directly from local Python rows and infers the column
# types; it avoids going through the low-level RDD API (sparkContext.parallelize).
age_df = spark.createDataFrame(data, columns)
age_df.printSchema()
display(age_df)

State,Name,Age
TX,Jack,25
NV,Jane,66
CO,Bill,79
CA,Tom,53
WY,Shawn,45


### Persist dataframe

In [0]:
# Persist the DataFrame as parquet files under /tmp/ch1/demographic.
# mode('overwrite') keeps this cell re-runnable: without an explicit mode the write fails
# when the target path already exists from a previous run.
age_df.write.format('parquet').mode('overwrite').save('/tmp/ch1/demographic')

### Create external table

In [0]:
%sql
-- Start from a clean slate: drop the ch1 database (and any tables in it) if it exists.
DROP DATABASE IF EXISTS ch1 CASCADE;

CREATE DATABASE IF NOT EXISTS ch1;

-- Define ch1.demographic on top of the parquet files at the given location.
-- No column list is given here.
CREATE TABLE IF NOT EXISTS ch1.demographic
USING parquet 
LOCATION '/tmp/ch1/demographic';

### Read data from table

In [0]:
# Load the ch1.demographic table back into a DataFrame and render it.
demographic_table = 'ch1.demographic'
df = spark.read.table(demographic_table)
display(df)

State,Name,Age
WY,Shawn,45
CA,Tom,53
NV,Jane,66
TX,Jack,25
CO,Bill,79


In [0]:
%sql
-- Count the rows in the demographic table.
SELECT count(*)
FROM ch1.demographic

count(1)
5


### Analyze data

In [0]:
# Compute the describe() statistics for the table and print them as plain text.
stats_df = df.describe()
stats_df.show()

In [0]:
# Compute the summary() statistics for the table and render them with display().
summary_df = df.summary()
display(summary_df)

summary,State,Name,Age
count,5,5,5.0
mean,,,53.6
stddev,,,20.5621010599598
min,CA,Bill,25.0
25%,,,45.0
50%,,,53.0
75%,,,66.0
max,WY,Tom,79.0


### Transformations

In [0]:
# Build a second demo DataFrame mapping each person's name to a (fake) SSN.
columns = ["FullName", "SSN"]
data = [
    ("Jack", '011-123-2345'),
    ("Jane", '022-123-2345'),
    ("Bill", '033-123-2345'),
    ("Tom", '044-123-2345'),
    ("Shawn", '055-123-2345'),
]

# createDataFrame builds the DataFrame directly from local Python rows; it avoids going
# through the low-level RDD API (sparkContext.parallelize).
identity_df = spark.createDataFrame(data, columns)

# Drop any table left over from a previous run so saveAsTable does not fail.
spark.sql("DROP TABLE IF EXISTS ch1.identity")
identity_df.write.format('parquet').saveAsTable('ch1.identity')

# Re-read from the table so later cells work against the persisted data.
identity_df = spark.sql("SELECT * FROM ch1.identity")
display(identity_df)

FullName,SSN
Shawn,055-123-2345
Bill,033-123-2345
Jack,011-123-2345
Tom,044-123-2345
Jane,022-123-2345


#### Filters

In [0]:
# NOTE: this star import is kept because later cells use names it brings in (lit, count, avg, ...).
# It also brings in sum and max, which later cells call as Spark column functions.
from pyspark.sql.functions import *

# Project a single column.
age_df.select("Name").show()

# Two equivalent ways to keep only the rows whose Name starts with 'J':
# 1) attribute-style column reference with filter()
name_starts_with_j = age_df.Name.like('J%')
age_df.filter(name_starts_with_j).show()

# 2) col() column reference with where()
age_df.where(col('Name').like('J%')).show()

#### Add/Drop columns

In [0]:
from pyspark.sql.functions import lit

# Add a constant-valued column. Keep the DataFrame and the display as separate steps:
# show() only prints and returns None, so chaining it onto the assignment would leave
# new_df set to None instead of the new DataFrame.
new_df = age_df.withColumn('newField1', lit('X'))
new_df.show()

In [0]:
from pyspark.sql.functions import col, when

# Conditional column: 'J' for the row named Jane, 'Other' for everyone else.
jane_marker = when(col('Name') == 'Jane', 'J').otherwise('Other')

new_df = age_df.withColumn('newField2', jane_marker)
new_df.show()

In [0]:
# Drop several columns in one call and print what is left.
trimmed_df = new_df.drop('newField2', 'Age')
trimmed_df.show()

#### Aggregates

In [0]:
# Use an explicit namespace for the Spark column functions. Bare names such as sum() and max()
# would otherwise only resolve to the Spark versions if a star import from another cell had
# already replaced the Python builtins of the same name.
from pyspark.sql import functions as F

# Simple counts.
age_df.select(F.count("State").alias('NumStates')).show()
age_df.select(F.countDistinct("Name", "Age")).show()

# Numeric aggregates over the Age column.
age_df.select(F.avg("Age")).show()
age_df.select(F.stddev("Age"), F.sum("Age"), F.max("Age")).show(truncate=False)

# Grouped aggregate and ordering.
age_df.groupBy('State').max('Age').show()
age_df.orderBy("Name", ascending=False).show()

#### Joins

In [0]:
# Join on Name == FullName (no join type is specified).
# Keep the DataFrame and the display as separate steps: show() only prints and returns None,
# so chaining it onto the assignment would leave join_df set to None.
join_df = age_df.join(identity_df, age_df.Name == identity_df.FullName)
join_df.show()