# Creating RDDs

In [None]:
#The os module provides a way of using operating system-dependent functionality, such as reading or setting environment variables.
import os
#os.getenv('PYSPARK_SUBMIT_ARGS'): Retrieves the value of the environment variable PYSPARK_SUBMIT_ARGS.
# If the environment variable is set, it returns its value as a string.
# If the environment variable is not set, it returns None.
os.getenv('PYSPARK_SUBMIT_ARGS')

In [None]:
# Environment + Spark bootstrap for the rest of the notebook.
#
# JAVA_HOME tells Spark which JDK to launch. /usr/libexec/java_home is the macOS helper that prints
# the install path of a given JDK version (here 1.8).
# NOTE: a notebook line such as `!export JAVA_HOME=...` runs in a throw-away subshell, so the variable
# never reaches this kernel or the JVM that PySpark starts from it. Writing to os.environ is what
# actually takes effect.
import os
import subprocess

try:
    os.environ['JAVA_HOME'] = subprocess.check_output(
        ['/usr/libexec/java_home', '-v', '1.8'], text=True
    ).strip()
except (OSError, subprocess.CalledProcessError) as err:
    # Helper missing (not macOS) or no 1.8 JDK installed: keep whatever JAVA_HOME is already set.
    print(f"JAVA_HOME left unchanged: {err}")

from pyspark import SparkConf, SparkContext

# PYSPARK_PYTHON selects the Python interpreter PySpark should use.
# It must be set before the SparkContext is created, which is why it comes before the session below.
os.environ['PYSPARK_PYTHON'] = '/usr/local/Cellar/python@3.10/3.10.9/bin/python3'

# SparkSession is the entry point for Spark SQL / DataFrames.
#   .master("local")     -> run Spark locally on this machine (fine for testing / small data)
#   .appName("rdd_demo") -> name of the Spark application
#   .getOrCreate()       -> reuse an existing session, or create one if none exists
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local").appName("rdd_demo").getOrCreate()

#sc.stop()
# getOrCreate() returns the context that the session above already started; `conf` is only applied
# when no SparkContext is active (e.g. after sc.stop()).
conf = SparkConf().setMaster("local").setAppName("MinTemperatures")
sc = SparkContext.getOrCreate(conf)

24/07/05 09:27:23 WARN Utils: Your hostname, Akashs-MacBook-Air-2.local resolves to a loopback address: 127.0.0.1; using 192.168.55.206 instead (on interface en0)
24/07/05 09:27:23 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/07/05 09:27:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
# Purpose: This function processes a line of text.
# lines.lower(): Converts the entire line to lowercase to ensure case insensitivity.
# lines.split(" "): Splits the line into words based on spaces. This returns a list of words from the line.
# return lines: Returns the list of words.
def Func(lines):
    """Lower-case one line of text and break it into tokens on single spaces.

    Returns the list produced by ``str.split(" ")``, so consecutive spaces
    (or an empty line) produce empty-string tokens.
    """
    return lines.lower().split(" ")

#sc.stop()
# Build a local configuration named "wordcount".
# NOTE(review): `conf` is never passed to getOrCreate() below, so it has no effect here;
# getOrCreate() simply returns the SparkContext that is already running (or creates a default one).
# To apply `conf`, stop the running context first (sc.stop()) and pass it: SparkContext.getOrCreate(conf).
conf = SparkConf().setMaster("local").setAppName("wordcount")
sc = SparkContext.getOrCreate()
# spth: path of the text file to analyse.
# sc.textFile(spth) loads it as an RDD in which every element is one line of the file.
spth="/Users/aakash/Downloads/spark-code/data/sherlock_holmes.txt"
input_file = sc.textFile(spth)

# Func returns a list of lower-cased tokens per line; flatMap flattens those lists into one RDD of tokens.
rdd1 = input_file.flatMap(Func)

# Pair every token with a count of 1 and show the first five pairs.
rdd1.map(lambda x: (x,1)).take(5)

                                                                                

[('a', 1), ('scandal', 1), ('in', 1), ('bohemia', 1), ('i.', 1)]

## count

In [None]:
rdd1.count()

11427

In [None]:
rdd1.countApprox(1, 0.7)

11427

In [None]:
rdd1.countByValue()

defaultdict(int,
            {'a': 288,
             'scandal': 4,
             'in': 188,
             'bohemia': 2,
             'i.': 2,
             'to': 299,
             'sherlock': 14,
             'holmes': 27,
             'she': 67,
             'is': 114,
             'always': 7,
             'the': 556,
             'woman.': 3,
             'i': 280,
             'have': 88,
             'seldom': 3,
             'heard': 11,
             'him': 16,
             'mention': 1,
             'her': 40,
             'under': 10,
             'any': 10,
             'other': 12,
             'name.': 1,
             'his': 137,
             'eyes': 9,
             'eclipses': 1,
             'and': 305,
             'predominates': 1,
             'whole': 6,
             'of': 302,
             'sex.': 1,
             'it': 137,
             'was': 140,
             'not': 73,
             'that': 137,
             'he': 129,
             'felt': 2,
             'emotion': 2

In [None]:
# All three names below hold the same path; only spth_1 is used in this cell.
spth="/Users/aakash/Downloads/spark-code/data/sherlock_holmes.txt"
spth_1="/Users/aakash/Downloads/spark-code/data/sherlock_holmes.txt"
spth_2="/Users/aakash/Downloads/spark-code/data/sherlock_holmes.txt"
# The second argument (minPartitions) asks Spark to split the file into 6 partitions.
input_file = sc.textFile(spth_1,6)

# input_file.pipe('wc -l'):

# pipe: RDD transformation that runs a Unix shell command over the RDD. The elements of a partition are
# written to the command's standard input, and each line the command prints becomes an element of the new RDD.
# 'wc -l': Unix command that counts the lines of its input.
# Behaviour: the command is started once per partition, so each partition contributes its own line count.

# collect(): action that triggers the computation and returns all elements of the result to the driver as a list.
input_file.pipe('wc -l').collect()
#input_file.pipe('head').take(5)
### Why more than one output? `wc -l` runs separately on each of the 6 partitions, so the result holds one
### count per partition; added together the counts equal input_file.count().

                                                                                

['      79', '     165', '      90', '      92', '     122', '      88']

In [None]:
input_file.count()

                                                                                

636

In [None]:
print(type(input_file))
print(input_file.count())
print(input_file.getNumPartitions())
print(sc.defaultParallelism)

<class 'pyspark.rdd.RDD'>




636
6
1




## saving file

In [None]:
input_file.saveAsTextFile('/Users/aakash/Downloads/spark-code/data/wc3.txt')
## check outputs

## glom

In [None]:
sc.parallelize(["Hello"," World", "today",'is','an','awesome','day'],3)

ParallelCollectionRDD[4] at readRDDFromFile at PythonRDD.scala:289

In [None]:
sc.parallelize(["Hello"," World", "today",'is','an','awesome','day'],3).glom().collect()

[['Hello', ' World'], ['today', 'is'], ['an', 'awesome', 'day']]

## controlling partitioning

In [None]:
print(input_file.getNumPartitions())
print(input_file.coalesce(2).getNumPartitions())
#### no shuffle is performed through coalesce

6
2


In [None]:
# Partition count before repartitioning.
print(input_file.getNumPartitions())
# repartition(5) returns a NEW RDD with 5 partitions; input_file itself is left untouched.
print(input_file.repartition(5).getNumPartitions())
# Rebind the name so that the following lines (and later cells) use the 5-partition RDD.
input_file=input_file.repartition(5)
print(input_file.getNumPartitions())

#### repartition performs a shuffle
#### RDDs are immutable: the new partitioning is only kept if the returned RDD is assigned to a variable

1
5
5


In [None]:
# Load the retail CSV data as a DataFrame: the first row is the header and column types are inferred.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("/Users/aakash/Downloads/spark-code/data/retail-data/all")
)
# RDD of Row objects behind the DataFrame, coalesced to at most 10 partitions.
rdd = df.coalesce(10).rdd
print(df.count())
# printSchema() writes the schema to stdout itself and returns None, so wrapping it in print()
# would add a stray "None" line to the output.
df.printSchema()

[Stage 4:>                                                          (0 + 1) / 1]

541909
root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)

None


                                                                                

In [None]:
'''
Annotated walk-through of the pipeline used below:

keyedRDD
  .partitionBy(3, partitionFunc)  # redistributes the (key, row) pairs over 3 partitions
  .map(lambda x: x[0])            # keeps only the key of each pair
  .glom()                         # turns every partition into one list of its keys
  .map(lambda x: len(set(x)))     # number of unique keys in each list
  .count()                        # counts the elements, i.e. the 3 per-partition lists
'''

import random
def partitionFunc(key):
    """Choose a partition index for ``key``.

    Keys 17850 and 12583 always go to partition 0; every other key is sent
    to partition 1 or 2, picked at random on each call.
    """
    pinned_keys = (17850, 12583)
    if key in pinned_keys:
        return 0
    return random.randint(1, 2)

# Key every row by column 6, spread the pairs over `how_many_parts` partitions with partitionFunc,
# then report how many distinct keys ended up in each partition.
how_many_parts = 3
keyedRDD = rdd.keyBy(lambda record: record[6])
(
    keyedRDD
    .partitionBy(how_many_parts, partitionFunc)
    .map(lambda pair: pair[0])
    .glom()
    .map(lambda keys: len(set(keys)))
    .take(how_many_parts)
)
### This custom partitioning logic is only available at RDD level

[2, 4302, 4313]

In [None]:
rdd.take(1)

[Row(InvoiceNo='536365', StockCode='85123A', Description='WHITE HANGING HEART T-LIGHT HOLDER', Quantity=6, InvoiceDate='12/1/2010 8:26', UnitPrice=2.55, CustomerID=17850, Country='United Kingdom')]

In [None]:
import random
def partitionFunc(key):
    """Choose a partition index for ``key``.

    Keys 17850 and 12583 always go to partition 0; every other key is sent
    to partition 1 or 2, picked at random on each call.
    """
    pinned_keys = (17850, 12583)
    if key in pinned_keys:
        return 0
    return random.randint(1, 2)

#keyedRDD = rdd.keyBy(lambda row: row[6])
# keyedRDD comes from the earlier cell; show the first five (key, row) pairs after custom partitioning.
keyedRDD.partitionBy(3, partitionFunc).take(5)

                                                                                

[(17850,
  Row(InvoiceNo='536365', StockCode='85123A', Description='WHITE HANGING HEART T-LIGHT HOLDER', Quantity=6, InvoiceDate='12/1/2010 8:26', UnitPrice=2.55, CustomerID=17850, Country='United Kingdom')),
 (17850,
  Row(InvoiceNo='536365', StockCode='71053', Description='WHITE METAL LANTERN', Quantity=6, InvoiceDate='12/1/2010 8:26', UnitPrice=3.39, CustomerID=17850, Country='United Kingdom')),
 (17850,
  Row(InvoiceNo='536365', StockCode='84406B', Description='CREAM CUPID HEARTS COAT HANGER', Quantity=8, InvoiceDate='12/1/2010 8:26', UnitPrice=2.75, CustomerID=17850, Country='United Kingdom')),
 (17850,
  Row(InvoiceNo='536365', StockCode='84029G', Description='KNITTED UNION FLAG HOT WATER BOTTLE', Quantity=6, InvoiceDate='12/1/2010 8:26', UnitPrice=3.39, CustomerID=17850, Country='United Kingdom')),
 (17850,
  Row(InvoiceNo='536365', StockCode='84029E', Description='RED WOOLLY HOTTIE WHITE HEART.', Quantity=6, InvoiceDate='12/1/2010 8:26', UnitPrice=3.39, CustomerID=17850, Countr

In [None]:
# Step-by-step reading of the chain below:
#
# 1. partitionBy(3, partitionFunc)
#      Redistributes the (key, value) pairs of keyedRDD over 3 partitions. partitionFunc receives the
#      key of each pair and returns the index (0, 1 or 2) of the partition the pair should go to.
# 2. map(lambda pair: pair[0])
#      Drops the value of every pair and keeps only its key.
# 3. glom()
#      Turns each partition into a single list holding all elements of that partition.
# 4. map(lambda keys: len(set(keys)))
#      `keys` is the list of keys of one partition; set() removes duplicates and len() counts
#      what is left, i.e. the number of unique keys in that partition.
# 5. take(3)
#      Fetches the first 3 elements of the result: one unique-key count per partition.
(
    keyedRDD
    .partitionBy(3, partitionFunc)
    .map(lambda pair: pair[0])
    .glom()
    .map(lambda keys: len(set(keys)))
    .take(3)
)


[2, 4299, 4296]

## Key value pairs

In [None]:
myCollection = "Spark The Definitive Guide : Big Data Processing Made Simple"\
  .split(" ")
words = spark.sparkContext.parallelize(myCollection, 2)
words.take(5)
### using map
#words.map(lambda word: (word.lower(),1)).take(5)

['Spark', 'The', 'Definitive', 'Guide', ':']

In [None]:
### using keyBy()
keyword=words.keyBy(lambda word: hash(word))
keyword.take(4)

[(-7604683196601698838, 'Spark'),
 (1381986379548814910, 'The'),
 (-4122473907452972367, 'Definitive'),
 (6601866536240625759, 'Guide')]

In [None]:
## using mapValues
keyword.mapValues(lambda x: x.upper()).collect()

[(-7604683196601698838, 'SPARK'),
 (1381986379548814910, 'THE'),
 (-4122473907452972367, 'DEFINITIVE'),
 (6601866536240625759, 'GUIDE'),
 (1827294539281609183, ':'),
 (-7752043373178716433, 'BIG'),
 (-151788679466617941, 'DATA'),
 (4204150861945194593, 'PROCESSING'),
 (146797214411928478, 'MADE'),
 (-4607150138042100501, 'SIMPLE')]

In [None]:
## extracting keys and/or values
print(keyword.keys().collect())
print(keyword.values().collect())

[-7604683196601698838, 1381986379548814910, -4122473907452972367, 6601866536240625759, 1827294539281609183, -7752043373178716433, -151788679466617941, 4204150861945194593, 146797214411928478, -4607150138042100501]
['Spark', 'The', 'Definitive', 'Guide', ':', 'Big', 'Data', 'Processing', 'Made', 'Simple']


## Aggregations

In [None]:
def addFunc(left, right):
    """Combine two values with ``+`` and return the result."""
    total = left + right
    return total

def maxFunc(left, right):
    """Return the larger of ``left`` and ``right``.

    The original body computed ``max(left, right)`` but never returned it,
    so every call produced ``None``.
    """
    return max(left, right)

In [None]:
# Reload the text file, asking for 6 partitions (second argument of textFile) so the work is split up.
spth_2 = "/Users/aakash/Downloads/spark-code/data/sherlock_holmes.txt"
input_file = sc.textFile(spth_2, 6)
# Split every line on single spaces; flatMap flattens the per-line lists into one RDD of tokens.
# (Despite the variable names, the elements are whole words, not characters.)
chars = input_file.flatMap(lambda line: line.split(" "))
# Pair each token with 1 -- the (key, value) shape that key-based counting expects.
KVcharacters = chars.map(lambda token: (token, 1))
# countByKey() is an action: it returns a dictionary mapping each key to its number of occurrences.
KVcharacters.countByKey()



defaultdict(int,
            {'A': 12,
             'SCANDAL': 1,
             'IN': 1,
             'BOHEMIA': 1,
             'I.': 2,
             'To': 4,
             'Sherlock': 13,
             'Holmes': 27,
             'she': 54,
             'is': 113,
             'always': 7,
             'the': 525,
             'woman.': 3,
             'I': 280,
             'have': 88,
             'seldom': 2,
             'heard': 11,
             'him': 16,
             'mention': 1,
             'her': 40,
             'under': 10,
             'any': 10,
             'other': 12,
             'name.': 1,
             'In': 6,
             'his': 131,
             'eyes': 9,
             'eclipses': 1,
             'and': 289,
             'predominates': 1,
             'whole': 6,
             'of': 301,
             'sex.': 1,
             'It': 37,
             'was': 139,
             'not': 73,
             'that': 131,
             'he': 101,
             'felt': 2,
         

In [None]:
### groupByKey
# Word count via groupByKey: gather all the 1s of a word, sum them, then flip each pair to
# (count, word) so that sorting by key in descending order puts the most frequent words first.
spth = "/Users/aakash/Downloads/spark-code/data/sherlock_holmes.txt"
input_file = sc.textFile(spth)
rdd1 = input_file.flatMap(Func)
rdd2 = (
    rdd1.map(lambda word: (word, 1))
    .groupByKey()
    .mapValues(sum)
    .map(lambda kv: (kv[1], kv[0]))
    .sortByKey(ascending=False)
)

rdd2.take(5)

[(556, 'the'), (321, ''), (305, 'and'), (302, 'of'), (299, 'to')]

In [None]:
KVcharacters=rdd1.map(lambda x: (x,1))
KVcharacters.take(5)

[('a', 1), ('scandal', 1), ('in', 1), ('bohemia', 1), ('i.', 1)]

In [None]:
### reduceByKey
KVcharacters.reduceByKey(lambda x,y: x+y).collect()

[('a', 288),
 ('scandal', 4),
 ('in', 188),
 ('bohemia', 2),
 ('i.', 2),
 ('to', 299),
 ('sherlock', 14),
 ('holmes', 27),
 ('she', 67),
 ('is', 114),
 ('always', 7),
 ('the', 556),
 ('woman.', 3),
 ('i', 280),
 ('have', 88),
 ('seldom', 3),
 ('heard', 11),
 ('him', 16),
 ('mention', 1),
 ('her', 40),
 ('under', 10),
 ('any', 10),
 ('other', 12),
 ('name.', 1),
 ('his', 137),
 ('eyes', 9),
 ('eclipses', 1),
 ('and', 305),
 ('predominates', 1),
 ('whole', 6),
 ('of', 302),
 ('sex.', 1),
 ('it', 137),
 ('was', 140),
 ('not', 73),
 ('that', 137),
 ('he', 129),
 ('felt', 2),
 ('emotion', 2),
 ('akin', 2),
 ('love', 6),
 ('for', 69),
 ('irene', 10),
 ('adler.', 2),
 ('all', 32),
 ('emotions,', 1),
 ('one', 29),
 ('particularly,', 1),
 ('were', 31),
 ('abhorrent', 1),
 ('cold,', 1),
 ('precise', 1),
 ('but', 56),
 ('admirably', 2),
 ('balanced', 1),
 ('mind.', 1),
 ('was,', 1),
 ('take', 7),
 ('it,', 11),
 ('most', 16),
 ('perfect', 1),
 ('reasoning', 2),
 ('observing', 2),
 ('machine', 1),


In [None]:
### aggregateByKey
KVcharacters.aggregateByKey(0, addFunc, addFunc).collect()

[('a', 288),
 ('scandal', 4),
 ('in', 188),
 ('bohemia', 2),
 ('i.', 2),
 ('to', 299),
 ('sherlock', 14),
 ('holmes', 27),
 ('she', 67),
 ('is', 114),
 ('always', 7),
 ('the', 556),
 ('woman.', 3),
 ('i', 280),
 ('have', 88),
 ('seldom', 3),
 ('heard', 11),
 ('him', 16),
 ('mention', 1),
 ('her', 40),
 ('under', 10),
 ('any', 10),
 ('other', 12),
 ('name.', 1),
 ('his', 137),
 ('eyes', 9),
 ('eclipses', 1),
 ('and', 305),
 ('predominates', 1),
 ('whole', 6),
 ('of', 302),
 ('sex.', 1),
 ('it', 137),
 ('was', 140),
 ('not', 73),
 ('that', 137),
 ('he', 129),
 ('felt', 2),
 ('emotion', 2),
 ('akin', 2),
 ('love', 6),
 ('for', 69),
 ('irene', 10),
 ('adler.', 2),
 ('all', 32),
 ('emotions,', 1),
 ('one', 29),
 ('particularly,', 1),
 ('were', 31),
 ('abhorrent', 1),
 ('cold,', 1),
 ('precise', 1),
 ('but', 56),
 ('admirably', 2),
 ('balanced', 1),
 ('mind.', 1),
 ('was,', 1),
 ('take', 7),
 ('it,', 11),
 ('most', 16),
 ('perfect', 1),
 ('reasoning', 2),
 ('observing', 2),
 ('machine', 1),


In [None]:
### inner joins
x = sc.parallelize([("a", 1), ("b", 4), ("c",5)])
y = sc.parallelize([("a", 2), ("a", 3)])
sorted(x.join(y).collect())


[('a', (1, 2)), ('a', (1, 3))]

In [None]:
## zips
y = sc.parallelize(zip(range(0,5), range(10,25)))
y.take(10)

[(0, 10), (1, 11), (2, 12), (3, 13), (4, 14)]

In [None]:
left_range = [0,1,2,3,4]
right_range = [10,11,12,13,14,15,16, ..., 24]   # range(10,25) stops at 24; zip stops at the shorter range, so only 5 pairs come out


0,10
1,11
2,12
3,13
4,14

In [None]:
words.getNumPartitions()

2

## Broadcast variables

In [None]:
# Split the book title into words and distribute them as an RDD with 2 partitions.
my_collection = "Spark The Definitive Guide : Big Data Processing Made Simple".split(" ")
words = spark.sparkContext.parallelize(my_collection, 2)

# Lookup table: word -> associated number. Broadcasting ships it to the workers once, instead of
# sending a copy along with every task that needs it.
supplementalData = {
    "Spark": 1000,
    "Definitive": 200,
    "Big": -300,
    "Simple": 100,
    "Algebra": -1000,
}

suppBroadcast = spark.sparkContext.broadcast(supplementalData)

print(suppBroadcast.value)

# Attach the looked-up number to every word (0 when the word is not in the table), then sort by it.
(
    words
    .map(lambda word: (word, suppBroadcast.value.get(word, 0)))
    .sortBy(lambda scored: scored[1])
    .collect()
)

{'Spark': 1000, 'Definitive': 200, 'Big': -300, 'Simple': 100, 'Algebra': -1000}


[('Big', -300),
 ('The', 0),
 ('Guide', 0),
 (':', 0),
 ('Data', 0),
 ('Processing', 0),
 ('Made', 0),
 ('Simple', 100),
 ('Definitive', 200),
 ('Spark', 1000)]

## How is a surrogate key lookup created in real applications

In [None]:
Start with an empty lookup (a map/dictionary, i.e. key-value pairs).
Let's say you want a lookup on mobile numbers.
Start processing your input one row at a time (this can \
   be parallelised as well).

For every incoming mobile number, check whether it exists in your lookup.
If it exists, fetch its key.
If it doesn't:
    1. generate a new unique value as the key for that incoming \
        mobile number
    2. store the (mobile number, newly generated unique key) pair in the \
        lookup
    3. return the newly generated unique key to the calling function

Now your lookup has one row
Go on repeating all above steps

## Accumulators

In [None]:
#from pyspark import SparkContext
#sc = SparkContext("local", "Accumulator app")
# Accumulator with a starting value of 10; f() below adds to it and the driver reads it back via .value.
num = sc.accumulator(10)
def f(x):
    """Add ``x`` to the module-level accumulator ``num``."""
    # `+=` rebinds the name, so without `global` Python would treat `num` as an unassigned local.
    global num
    num += x

# NOTE: this rebinds `rdd`, replacing the retail-data RDD created in an earlier cell.
rdd = sc.parallelize([20,30,40,50])
# foreach is an action: it calls f once for every element of the RDD.
rdd.foreach(f)
# 10 (initial value) + 20 + 30 + 40 + 50 = 150
final = num.value
print ("Accumulated value is -> %i" % (final))

Accumulated value is -> 150


In [None]:
num = 10

input: 20
old value of num: 10
new value of num: 30

input: 30
old value of num: 30
new value of num: 60

input: 40
old value of num: 60
new value of num: 100

input: 50
old value of num: 100
new value of num: 150