Skip to content

Latest commit

 

History

History
111 lines (64 loc) · 1.45 KB

spark_maps_and_lazy_evaluation.md

File metadata and controls

111 lines (64 loc) · 1.45 KB

Maps

import pyspark
# Create the Spark driver context; appName labels this job in the Spark UI.
sc = pyspark.SparkContext(appName="maps_and_lazy_evaluation_example")

# Sample "log" of played songs. Note the deliberately inconsistent casing:
# most titles are capitalized, but one entry is lowercase "despacito" —
# this inconsistency is what the lowercase map demonstration below targets.
log_of_songs = [
        "Despacito",
        "Nice for what",
        "No tears left to cry",
        "Despacito",
        "Havana",
        "In my feelings",
        "Nice for what",
        "despacito",
        "All the stars"
]

# parallelize the log_of_songs to use with Spark
# (distributes the Python list across the cluster as an RDD; the original
# list and the RDD are independent — RDD transformations never mutate it)
distributed_song_log = sc.parallelize(log_of_songs)
def convert_song_to_lowercase(song):
    """Return a lowercase copy of *song*; the input string is unchanged."""
    lowered = song.lower()
    return lowered

convert_song_to_lowercase("Havana")
'havana'
distributed_song_log.map(convert_song_to_lowercase)
PythonRDD[1] at RDD at PythonRDD.scala:53
distributed_song_log.map(convert_song_to_lowercase).collect()
['despacito',
 'nice for what',
 'no tears left to cry',
 'despacito',
 'havana',
 'in my feelings',
 'nice for what',
 'despacito',
 'all the stars']
distributed_song_log.collect()
['Despacito',
 'Nice for what',
 'No tears left to cry',
 'Despacito',
 'Havana',
 'In my feelings',
 'Nice for what',
 'despacito',
 'All the stars']
distributed_song_log.map(lambda song: song.lower()).collect()
['despacito',
 'nice for what',
 'no tears left to cry',
 'despacito',
 'havana',
 'in my feelings',
 'nice for what',
 'despacito',
 'all the stars']