In [1]:
my_list = [11, 12, 17, 14, 10, 13]
odd_numbers = list(filter(lambda x: x%2 != 0, my_list))

In [3]:
print(f"odd_numbers {odd_numbers}")

odd_numbers [11, 17, 13]


In [6]:
def square(x):
  return x**2

squared_numbers = list(map(square, my_list))

print(f"squared_numbers = {squared_numbers}")

squared_numbers = [121, 144, 289, 196, 100, 169]


In [7]:
from functools import reduce
max_number = reduce(lambda x, y: x if x>y else y, my_list)
print("max_number = {}".format(max_number))

max_number = 17


#### Operations with RDD

In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession



In [2]:
spark = SparkSession.builder.master("local[1]") \
                    .appName('SparkByExamples.com') \
                    .getOrCreate()
my_list = [11, 12, 17, 14, 10, 13]
rdd = spark.sparkContext.parallelize(my_list)
rdd

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:274

In [5]:
dept = [["Finance",10], 
        ["Marketing",20], 
        ["Sales",30], 
        ["IT",40] 
      ]
deptColumns = ["dept_name","dept_id"]
deptDF1 = spark.createDataFrame(data=dept, schema = deptColumns)
deptDF1.printSchema()

root
 |-- dept_name: string (nullable = true)
 |-- dept_id: long (nullable = true)



In [18]:
rdd.collect()

[11, 12, 17, 14, 10, 13]

In [20]:
rdd.getNumPartitions()

1

In [21]:
odd_numbers_rdd = rdd.filter(lambda x: x%2 != 0)

In [22]:
odd_numbers_rdd

PythonRDD[1] at RDD at PythonRDD.scala:53

In [23]:
lsit_of_odd_numbers = odd_numbers_rdd.collect()
print("RDD of odd numbers = {}".format(lsit_of_odd_numbers))

RDD of odd numbers = [11, 17, 13]


In [24]:
sorted_rdd = rdd.sortBy(lambda x: x).collect()
print("Sorted RDD = {}".format(sorted_rdd))

Sorted RDD = [10, 11, 12, 13, 14, 17]


In [25]:
max_val = rdd.reduce(lambda x, y: x if x>y else y)
print("The max value from the RDD is {}".format(max_val))

The max value from the RDD is 17


#### Aggregations with RDD

In [42]:
input_rdd = spark.sparkContext.textFile("./data/weather.csv",2)

In [43]:
input_rdd.getNumPartitions()

2

In [71]:
rdd1 = input_rdd.map(lambda x: (x[0],x[1],x[2]))

In [72]:
rdd1

PythonRDD[31] at RDD at PythonRDD.scala:53

In [73]:
rdd1.take(5)

[('2', '0', '1'),
 ('2', '0', '1'),
 ('2', '0', '1'),
 ('2', '0', '1'),
 ('2', '0', '1')]

In [61]:
input_rdd.take(5)

['2016-05-09,234893,34',
 '2019-09-08,234896,3',
 '2019-11-19,234895,24',
 '2017-04-04,234900,43',
 '2013-12-04,234900,47']

In [58]:
rdd1.take(5)

['2016-05-09,234893,34',
 '2019-09-08,234896,3',
 '2019-11-19,234895,24',
 '2017-04-04,234900,43',
 '2013-12-04,234900,47']

In [52]:
data = [("James","Smith","USA","CA"),
    ("Michael","Rose","USA","NY"),
    ("Robert","Williams","USA","CA"),
    ("Maria","Jones","USA","FL")
  ]

rdd = spark.sparkContext.parallelize(data)

In [53]:
rdd.collect()

[('James', 'Smith', 'USA', 'CA'),
 ('Michael', 'Rose', 'USA', 'NY'),
 ('Robert', 'Williams', 'USA', 'CA'),
 ('Maria', 'Jones', 'USA', 'FL')]

In [62]:
result = rdd.map(lambda x: (x[0],x[1],x[2])).collect()

In [63]:
result

[('James', 'Smith', 'USA'),
 ('Michael', 'Rose', 'USA'),
 ('Robert', 'Williams', 'USA'),
 ('Maria', 'Jones', 'USA')]

In [74]:
selected_fields_rdd = input_rdd.map(lambda line: (int(line.split(",")[0].split("-")[0]), int(line.split(",")[2])))

In [75]:
selected_fields_rdd.take(5)

[(2016, 34), (2019, 3), (2019, 24), (2017, 43), (2013, 47)]

In [76]:
# Get the maximum temperature corresponding to each year
max_temperature_rdd = selected_fields_rdd.reduceByKey(lambda x, y: x if x>y else y)

In [77]:
max_temperature_rdd.take(5)

[(2016, 36), (2018, 45), (2010, 39), (2014, 35), (2012, 40)]

In [78]:
result = max_temperature_rdd.collect()

In [79]:
print(result)

[(2016, 36), (2018, 45), (2010, 39), (2014, 35), (2012, 40), (2019, 47), (2017, 47), (2013, 47), (2015, 41), (2011, 38)]
