In [1]:
import sys
print("Python version")
print(sys.version)

import pyspark
print("PySpark version")
print(pyspark.__version__)


Python version
3.11.6 | packaged by conda-forge | (main, Oct  3 2023, 11:57:02) [GCC 12.3.0]
PySpark version
3.5.0


## 1. Spark Configuration

In [2]:
from pyspark import SparkConf

master = "local[*]"
app_name = "Introduction to Apache Spark"
spark_conf = SparkConf().setMaster(master).setAppName(app_name)

## 2. SparkContext and SparkSession

The class `pyspark.SparkContext` creates a client which connects to a Spark cluster.

In [3]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
import warnings

# Suppress every Python warning from this point on.
warnings.filterwarnings('ignore')

# Method 1: Using SparkSession (kept commented out as an alternative:
# build a session from spark_conf, then take its SparkContext)
# spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()
# sc = spark.sparkContext
# sc.setLogLevel('ERROR')

# Method 2: Getting or instantiating a SparkContext directly from spark_conf
sc = SparkContext.getOrCreate(spark_conf)
# Restrict Spark's log output to the ERROR level.
sc.setLogLevel('ERROR')

## 3. Create an RDD object

The `SparkContext` client can be used to create RDD objects. The class provides two methods for creating them directly:
* `parallelize()`
* `textFile()`

### 3.1. parallelize()

- `parallelize()` distributes a local **python collection** to form an RDD. Common built-in python collections include `list`, `tuple` or `set`.
- By default, the number of partitions equals the number of threads available to Spark. `parallelize()` accepts an optional second argument to set the number of partitions manually.


In [5]:
## FROM LIST
# Local collection 0..19 that will be distributed as an RDD
data = list(range(20))
print(f'raw list: {data}\n')

# With a single argument, Spark picks the number of partitions itself
rdd = sc.parallelize(data)
print('Default partitions: ', rdd.getNumPartitions())

# The optional second argument requests an explicit number of partitions
rdd = sc.parallelize(data, 5)
print('Manual partitions: ', rdd.getNumPartitions(), '\n')

# collect() returns every element to the driver so it can be printed
print(f'rdd data: {rdd.collect()}')

raw list: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

Default partitions:  8
Manual partitions:  5 

rdd data: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]


In [8]:
## FROM LIST OF TUPLE
# Each tuple becomes a single RDD element; the tuples themselves are not flattened.
data = [('cat', 'dog', 'fish'), ('orange', 'apple')]
print(f'raw list of tuple: {data}')
rdd = sc.parallelize(data)
print(f'rdd data: {rdd.collect()}\n')

## FROM A SET
# The literal repeats 'cat' and 'dog': a Python set keeps only distinct values,
# so just three elements reach the RDD, in the set's own iteration order.
data = {'cat', 'dog', 'fish', 'cat', 'dog', 'dog'}
print(f'raw set: {data}')
rdd = sc.parallelize(data)
print(f'rdd data: {rdd.collect()}')

raw list of tuple: [('cat', 'dog', 'fish'), ('orange', 'apple')]
rdd data: [('cat', 'dog', 'fish'), ('orange', 'apple')]

raw set: {'cat', 'fish', 'dog'}
rdd data: ['cat', 'fish', 'dog']


### 3.2. textFile()

- The `textFile()` function reads a text file and returns it as an **RDD of strings**.
- Usually, you will need to apply some **map** functions to transform each element of the RDD into a data structure/type that is suitable for data analysis.
- When using `textFile()`, each line of the text file becomes an element in the resulting RDD.
- The example below reads a `.txt` file; the same function can be used to read a `.csv` file.

In [9]:
# Load the text file with the partition count chosen by Spark
rdd = sc.textFile('harden.txt')
print('Default partitions: ', rdd.getNumPartitions())

# Load the same file again, this time passing 5 as the partition argument
rdd = sc.textFile('harden.txt', 5)

# Check the partition count of the reloaded RDD
print('Manual partitions: ', rdd.getNumPartitions(), "\n")

print("Here is the data: ")
# Final expression of the cell: shows the first 10 lines of the file
rdd.take(10)

Default partitions:  2
Manual partitions:  5 

Here is the data: 


['James Edward Harden Jr. (born August 26, 1989) is an American professional basketball player for the Los Angeles Clippers of the National Basketball Association (NBA)',
 'He is widely regarded as one of the greatest scorers and shooting guards in NBA history.',
 "In 2021, Harden was honored as one of the league's top 75 players by being named to the NBA 75th Anniversary Team.",
 'Harden played college basketball for the Arizona State Sun Devils, where he was named a consensus All-American and Pac-10 Player of the Year in 2009.',
 '',
 'Harden was selected with the third overall pick in the 2009 NBA draft by the Oklahoma City Thunder.',
 '',
 '',
 'In 2012, he was named NBA Sixth Man of the Year and helped Oklahoma City reach the NBA Finals, where they lost to the Miami Heat in five games.',
 'After the Thunder refused to offer him a max contract extension after lost to Miami, Harden was unwilling to take a pay cut and was subsequently traded to the Houston Rockets before the 2012–13 

## 4. RDD Operations
RDDs support two types of operations: 
- Transformations (operations on RDDs that return a new RDD, such as `map()` and `filter()`)
- Actions (operations that return a value rather than a new RDD, such as `count()` and `first()`)

**Element-wise transformations:**

* `map()`
* `mapValues()`
* `flatMap()`
* `flatMapValues()`

### 4.1. map

The `map()` method applies a function to each element of the RDD. Each element has to be a valid input to the function. The returned RDD has the function outputs as its new elements.

In [10]:
# Read data into RDD

rdd = sc.textFile('nba.csv')
print('Default partitions: ',rdd.getNumPartitions())
rdd.take(5)

Default partitions:  2


['Name,Shoot,Dribble,Dunk,Defence',
 'James Harden,90,85,80,65',
 'Kevin Durant,95,80,90,85',
 'Stephen Curry,99,85,70,80',
 'Lebron James,85,80,99,95']

In [11]:
## Remove header

def removeHeaderRow(rdd):
    """Return ``rdd`` without its header line.

    The first element of ``rdd`` is treated as the header, and every element
    equal to it is filtered out of the returned RDD.
    """
    header_line = rdd.first()

    def is_data_row(row):
        # Keep only rows that differ from the header text
        return row != header_line

    return rdd.filter(is_data_row)
    
rdd_1 = removeHeaderRow(rdd)
rdd_1.take(5)

['James Harden,90,85,80,65',
 'Kevin Durant,95,80,90,85',
 'Stephen Curry,99,85,70,80',
 'Lebron James,85,80,99,95',
 'Kyre Irving,88,99,80,80']

In [12]:
## Split NBA player name with his rating

def parseRecord(line):
    """Split one comma-separated line into ``(name, ratings)``.

    The text before the first comma is the name; the remaining fields are
    returned as a list of strings (empty when the line has no comma).
    """
    name, *ratings = line.split(',')
    return (name, ratings)
    
rdd_2 = rdd_1.map(parseRecord)
rdd_2.take(5)

[('James Harden', ['90', '85', '80', '65']),
 ('Kevin Durant', ['95', '80', '90', '85']),
 ('Stephen Curry', ['99', '85', '70', '80']),
 ('Lebron James', ['85', '80', '99', '95']),
 ('Kyre Irving', ['88', '99', '80', '80'])]

In [13]:
## Convert string values to numeric values

# Keep the name (first item) as-is and turn every rating string into a float
rdd_3 = rdd_2.map(lambda record: (record[0], [float(value) for value in record[1]]))
rdd_3.take(5)

[('James Harden', [90.0, 85.0, 80.0, 65.0]),
 ('Kevin Durant', [95.0, 80.0, 90.0, 85.0]),
 ('Stephen Curry', [99.0, 85.0, 70.0, 80.0]),
 ('Lebron James', [85.0, 80.0, 99.0, 95.0]),
 ('Kyre Irving', [88.0, 99.0, 80.0, 80.0])]

### 4.2 `mapValues`

The `mapValues` function requires that each element in the RDD has a **key/value** pair structure, for example, a tuple of 2 items, or a list of 2 items. The `mapValues` function applies a function to each of the element values. The element key will remain unchanged.

In [14]:
import numpy as np

# np.mean returns a NumPy scalar (np.float64). Converting it to a built-in
# float keeps the RDD values plain Python objects, so the displayed result
# stays `80.0` instead of `np.float64(80.0)` under NumPy >= 2.
rdd_4 = rdd_3.mapValues(lambda ratings: float(np.mean(ratings)))
rdd_4.take(5)

[('James Harden', 80.0),
 ('Kevin Durant', 87.5),
 ('Stephen Curry', 83.5),
 ('Lebron James', 89.75),
 ('Kyre Irving', 86.75)]

### 4.3 `flatMap`

This function first applies a function to each element of an RDD and then flattens the results. We can also use it with the identity function simply to flatten the elements of an RDD, without any extra operation on each element.


In [15]:
data = [
    ('Stephen Curry', 'Klay Thompson'),
    ('Paul George', 'James Harden', 'Russel Westbrook', 'Kawhi Leonard'),
    ('Giannis Antetokuonmpo', 'Damian Lilliard'),
]
# One RDD element per tuple of player names
rdd_5 = sc.parallelize(data)
print(f'Before flatMap: \n{rdd_5.collect()}\n')

# The identity function leaves each tuple unchanged; flatMap then unrolls the
# tuples so that every name becomes its own element.
rdd_5 = rdd_5.flatMap(lambda names: names)
print(f'After flatMap: \n{rdd_5.collect()}\n')

Before flatMap: 
[('Stephen Curry', 'Klay Thompson'), ('Paul George', 'James Harden', 'Russel Westbrook', 'Kawhi Leonard'), ('Giannis Antetokuonmpo', 'Damian Lilliard')]

After flatMap: 
['Stephen Curry', 'Klay Thompson', 'Paul George', 'James Harden', 'Russel Westbrook', 'Kawhi Leonard', 'Giannis Antetokuonmpo', 'Damian Lilliard']



### 4.5 `flatMapValues`

The `flatMapValues` function requires that each element in the RDD has a **key/value** pair structure. It applies a function to each **element value** of the RDD object and then flattens the results.

In [16]:
data = [
    ["James Harden", (90.0, 85.0, 80.0, 65.0)],
    ["Kevin Durant", (95.0, 80.0, 90.0, 85.0)],
    ["Stephen Curry", (99.0, 85.0, 70.0, 80.0)]
]
rdd_6 = sc.parallelize(data)
rdd_6.collect()

[['James Harden', (90.0, 85.0, 80.0, 65.0)],
 ['Kevin Durant', (95.0, 80.0, 90.0, 85.0)],
 ['Stephen Curry', (99.0, 85.0, 70.0, 80.0)]]

In [17]:
# Pair every rating with its skill name; flatMapValues then emits one
# (player, (skill, rating)) element per pair.
rdd_6 = rdd_6.flatMapValues(
    lambda ratings: list(zip(["Shoot", "Dribble", "Dunk", "Defence"], ratings))
)
rdd_6.collect()

[('James Harden', ('Shoot', 90.0)),
 ('James Harden', ('Dribble', 85.0)),
 ('James Harden', ('Dunk', 80.0)),
 ('James Harden', ('Defence', 65.0)),
 ('Kevin Durant', ('Shoot', 95.0)),
 ('Kevin Durant', ('Dribble', 80.0)),
 ('Kevin Durant', ('Dunk', 90.0)),
 ('Kevin Durant', ('Defence', 85.0)),
 ('Stephen Curry', ('Shoot', 99.0)),
 ('Stephen Curry', ('Dribble', 85.0)),
 ('Stephen Curry', ('Dunk', 70.0)),
 ('Stephen Curry', ('Defence', 80.0))]

In [18]:
# Unpack the element values: (player, (skill, rating)) -> [player, skill, rating]
rdd_6 = rdd_6.map(lambda pair: [pair[0], *pair[1]])
rdd_6.collect()

[['James Harden', 'Shoot', 90.0],
 ['James Harden', 'Dribble', 85.0],
 ['James Harden', 'Dunk', 80.0],
 ['James Harden', 'Defence', 65.0],
 ['Kevin Durant', 'Shoot', 95.0],
 ['Kevin Durant', 'Dribble', 80.0],
 ['Kevin Durant', 'Dunk', 90.0],
 ['Kevin Durant', 'Defence', 85.0],
 ['Stephen Curry', 'Shoot', 99.0],
 ['Stephen Curry', 'Dribble', 85.0],
 ['Stephen Curry', 'Dunk', 70.0],
 ['Stephen Curry', 'Defence', 80.0]]

### 4.6. Accumulator

Accumulators provide a simple syntax for aggregating values from worker nodes back to the driver program.
They are only “added” to through an associative and commutative operation and can therefore be efficiently supported in parallel. 

In [19]:
## This code will produce an unexpected result

rdd_7 = sc.textFile('harden.txt', 3)
blank_lines = 0 # global variable

def extract_blank_lines(line):
    """Split ``line`` on single spaces, counting empty lines along the way.

    An empty ``line`` increases the module-level ``blank_lines`` counter by
    one. The list produced by ``line.split(" ")`` is returned in every case.

    Note: when this function runs inside a Spark transformation, the increment
    is applied to the worker's copy of ``blank_lines``, so the driver-side
    value printed by the notebook stays 0.
    """
    global blank_lines
    words = line.split(" ")
    if line == "":
        blank_lines = blank_lines + 1
    return words
        
rdd_7 = rdd_7.flatMap(extract_blank_lines)
rdd_7.collect()

print("Blank lines: %d" %blank_lines)

Blank lines: 0


In [20]:
rdd_7 = sc.textFile('harden.txt', 3)
# Create an Accumulator[int] initialized to 0
blank_lines = sc.accumulator(0)


def extract_blank_lines(line):
    """Split ``line`` on single spaces; add 1 to ``blank_lines`` when it is empty."""
    if line == "":
        # Accumulator.add is what `+=` calls internally; using it directly
        # removes the need for a `global` declaration.
        blank_lines.add(1)
    return line.split(" ")


rdd_7 = rdd_7.flatMap(extract_blank_lines)
# The accumulator is only updated once an action (collect) executes the flatMap
rdd_7.collect()

print("Blank lines: %d" % blank_lines.value)

Blank lines: 3


In [21]:
city_words = {'Oklahoma', 'Los Angeles', 'Miami', 'New York'}
city_words_count = sc.accumulator(0)

# Broadcast the lookup set so the workers read a shared read-only copy
broadcast_city_words = sc.broadcast(city_words)

rdd_8 = sc.textFile('harden.txt', 3)


def check_city_words(line):
    """Add 1 to ``city_words_count`` for every word of ``line`` that is a city word.

    ``line`` is split on single spaces and each token is looked up in the
    broadcast set. Tokens are compared as-is, so multi-word entries such as
    'Los Angeles' and tokens carrying punctuation (e.g. 'Miami,') never match.

    Returns None: the function exists only for its side effect on the
    accumulator, so it must not be used as a ``filter`` predicate.
    """
    # using the broadcast variable in the function that runs on the cluster
    known_cities = broadcast_city_words.value
    for word in line.split(" "):
        if word in known_cities:
            city_words_count.add(1)


# foreach is the action intended for side-effect-only functions. The original
# filter(check_city_words) treated the None return value as "drop every line",
# producing an always-empty RDD, and updated the accumulator from inside a
# transformation, where re-executed tasks may apply their updates more than once.
rdd_8.foreach(check_city_words)

print("City words: %d" % city_words_count.value)

City words: 3


## 5. Visualisation of parallel execution in Spark

In [22]:
# Stop the SparkContext
sc.stop()

# Start new SparkContext
sc = SparkContext.getOrCreate(spark_conf)

In [23]:
# Rebuild the earlier pipeline on a fresh SparkContext: read the file nba.csv
# (not mtcars.csv), this time passing 5 as the partition argument
rdd = sc.textFile('nba.csv',5)

# Drop the header row: take the first line and filter out every line equal to it
header = rdd.first()
rdd_1 = rdd.filter(lambda x: x != header)

# parseRecord is already implemented
rdd_2 = rdd_1.map(parseRecord)
# Convert each record's rating strings to floats, keeping the name unchanged
rdd_3 = rdd_2.map(lambda x: (x[0], list(map(float, x[1]))))

# Verify the number of partitions
# (the label says "Default", but the count reported is the 5 passed to textFile above)
print('Default partitions: ',rdd.getNumPartitions())

# Open localhost:4040

Default partitions:  5


In [24]:
# Perform collect() action to execute the job
rdd_3.collect()

[('James Harden', [90.0, 85.0, 80.0, 65.0]),
 ('Kevin Durant', [95.0, 80.0, 90.0, 85.0]),
 ('Stephen Curry', [99.0, 85.0, 70.0, 80.0]),
 ('Lebron James', [85.0, 80.0, 99.0, 95.0]),
 ('Kyre Irving', [88.0, 99.0, 80.0, 80.0]),
 ('Kyle Thompson', [95.0, 80.0, 80.0, 85.0]),
 ('Luka Doncic', [88.0, 80.0, 85.0, 80.0]),
 ('Vucevic', [75.0, 75.0, 85.0, 90.0])]