# Setup

In [2]:
!pip install findspark

Collecting findspark
  Downloading findspark-1.4.2-py2.py3-none-any.whl (4.2 kB)
Installing collected packages: findspark
Successfully installed findspark-1.4.2


In [3]:
import findspark
# Run init() before the pyspark imports in the next cell.
# NOTE(review): init() is expected to locate the local Spark installation
# (e.g. via SPARK_HOME) and make pyspark importable; confirm for your environment.
findspark.init()

In [33]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [30]:
# Report the version of the `python` executable found on the shell PATH.
!python --version

Python 3.7.10


In [34]:
# Print the Spark version and build information of the installed distribution.
!pyspark --version

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 3.0.2
      /_/
                        
Using Scala version 2.12.10, Eclipse OpenJ9 VM, 1.8.0_252
Branch HEAD
Compiled by user centos on 2021-02-16T04:53:13Z
Revision 648457905c4ea7d00e3d88048c63f360045f0714
Url https://gitbox.apache.org/repos/asf/spark.git
Type --help for more information.


# Spark Context and Spark Session

### Creating the spark session and context

In [8]:
# Build (or reuse) the SparkSession first and take the SparkContext from it.
# A bare `SparkContext()` raises "Cannot run multiple SparkContexts at once"
# when this cell is re-run in a live kernel, and the application name set on
# the builder is not applied to a context that is already running.
spark = SparkSession \
    .builder \
    .appName("Python Spark DataFrames basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

# Entry point for the RDD API: the context owned by `spark`.
sc = spark.sparkContext

### Inspect the Spark session

In [9]:
# Evaluate the session object so the notebook displays it.
spark

# RDDs

### Create an RDD

In [10]:
# Source collection: the integers 1..29 (the end of range() is exclusive).
data = range(1, 30)

# Show the first element and the number of elements, one per line.
print(data[0], len(data), sep="\n")

# Distribute the collection across 4 partitions.
xrangeRDD = sc.parallelize(data, numSlices=4)

# Last expression of the cell: display the RDD's repr.
xrangeRDD

1
29


PythonRDD[1] at RDD at PythonRDD.scala:53

### Transformations

In [12]:
# Shift every element down by one.
subRDD = xrangeRDD.map(lambda value: value - 1)
# Keep only the shifted values below 10.
filteredRDD = subRDD.filter(lambda value: value < 10)

### Actions

In [15]:
# Run two actions on filteredRDD: fetch its elements, then count them.
print(filteredRDD.collect(), filteredRDD.count(), sep="\n")

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
10


In [16]:
# Collect subRDD to show every element after the x-1 mapping (no filter applied).
print(subRDD.collect())

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]


In [17]:
# Collect the source RDD: it still holds the original values 1..29.
print(xrangeRDD.collect())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]


### Caching data

In [18]:
import time

# Mark the RDD for caching. Nothing is materialized yet: the partitions are
# computed and stored by the first action that runs on it.
testData = sc.parallelize(range(1, 50000), 4)
testData.cache()

# time.perf_counter() is the clock intended for measuring short durations;
# time.time() is a wall clock and can jump if the system time is adjusted.
t1 = time.perf_counter()
# First action: computes the partitions and populates the cache.
count1 = testData.count()
dt1 = time.perf_counter() - t1
print("dt1:", dt1)

t2 = time.perf_counter()
# Second action: expected to be served from the cached partitions.
count2 = testData.count()
dt2 = time.perf_counter() - t2
print("dt2:", dt2)

dt1: 0.7026107311248779
dt2: 0.2304394245147705


# DataFrames and SparkSQL

In [19]:
# Spark SQL needs an active SparkSession. One was created earlier in this
# notebook; display it to confirm it is still available.
spark

### Create your first DataFrame

In [20]:
# Download the data first into a local `people.json` file
!curl https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-BD0225EN-SkillsNetwork/labs/data/people.json >> people.json

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100    73  100    73    0     0    378      0 --:--:-- --:--:-- --:--:--   378


In [21]:
# Load `people.json` into a Spark DataFrame and mark it for caching.
df = (
    spark.read
    .json("people.json")
    .cache()
)

In [22]:
# Print the DataFrame's rows as a text table.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [23]:
# Print each column's name, type and nullability.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [24]:
# Register the DataFrame as a SQL temporary view named "people" so it can be
# queried with spark.sql(...) (see the course notes for more details).
# createOrReplaceTempView keeps this cell re-runnable: createTempView raises an
# AnalysisException when a view with the same name already exists.
df.createOrReplaceTempView("people")

### Explore the data using DataFrame functions and SparkSQL

In [26]:
# The three statements below are equivalent ways to select the `name` column.
df.select("name").show() # DataFrame API, column given by name
df.select(df["name"]).show() # DataFrame API, column given as df["name"]
spark.sql("SELECT name FROM people").show() # Spark SQL against the `people` view

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
+-------+

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
+-------+

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
+-------+



In [27]:
# Perform basic filtering: the same `age > 21` predicate, first with the
# DataFrame API and then with Spark SQL.
df.filter(df["age"] > 21).show()
spark.sql("SELECT * FROM people WHERE age > 21").show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+



In [28]:
# Perform basic aggregation: count the rows per `age`, first with the
# DataFrame API and then with Spark SQL.
df.groupby("age").count().show()
spark.sql("SELECT age, COUNT(*) as count FROM people GROUP BY age").show()

+----+-----+
| age|count|
+----+-----+
|  19|    1|
|null|    1|
|  30|    1|
+----+-----+

+----+-----+
| age|count|
+----+-----+
|  19|    1|
|null|    1|
|  30|    1|
+----+-----+



# End Spark Session

In [29]:
# Shut down the Spark session; `spark` cannot be used for further work after this.
spark.stop()