In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) a local SparkSession; building it also initializes the SparkContext.
spark = (
    SparkSession.builder
    .master("local")
    .appName("RDD Example")
    .getOrCreate()
)

# The SparkContext is reached through the session and is used to create RDDs.
sc = spark.sparkContext

# RDD holding the first 15 natural numbers (1..15).
numbers_rdd = sc.parallelize(range(1, 16))

# Bring the elements back to the driver and print them.
print(numbers_rdd.collect())

# NOTE: the session is deliberately left running here. Calling spark.stop()
# in this cell shuts down the SparkContext that numbers_rdd belongs to, so the
# following cell's numbers_rdd.collect() would fail on a top-to-bottom run.
# Stop the session once, after the last cell that needs it.




[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]


In [None]:
# Print the RDD contents as a Python list collected on the driver.
collected_numbers = numbers_rdd.collect()
print(collected_numbers)

# Report how many partitions the RDD is split into.
partition_count = numbers_rdd.getNumPartitions()
print("Number of partitions: ", partition_count)


[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
Number of partitions:  1


In [None]:
from pyspark.sql import SparkSession

# Obtain a session again (same master and app name as before) and its context.
spark = (
    SparkSession.builder
    .master("local")
    .appName("RDD Example")
    .getOrCreate()
)
sc = spark.sparkContext

# Rebuild the RDD of the numbers 1..15 on this context.
numbers_rdd = sc.parallelize(range(1, 16))

# first() is an action: it returns the first element of the RDD to the driver.
first_element = numbers_rdd.first()
print("First element in the RDD:", first_element)

First element in the RDD: 1


In [None]:
# Keep only the elements that leave no remainder when divided by 2.
even_numbers_rdd = numbers_rdd.filter(lambda number: number % 2 == 0)
even_numbers = even_numbers_rdd.collect()
print("Even numbers: ", even_numbers)


Even numbers:  [2, 4, 6, 8, 10, 12, 14]


In [None]:
# Map every element to its square.
squared_rdd = numbers_rdd.map(lambda number: number ** 2)
squared_numbers = squared_rdd.collect()
print("Squared numbers: ", squared_numbers)


Squared numbers:  [1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225]


In [None]:
# Fold the RDD down to a single value by adding elements pairwise.
sum_of_elements = numbers_rdd.reduce(
    lambda running_total, number: running_total + number
)
print("Sum of elements: ", sum_of_elements)


Sum of elements:  120


In [None]:
# Persist the RDD as text. Despite the ".txt" suffix, Spark creates a
# directory of part files at this path and refuses to overwrite an existing
# one, so re-running this cell would fail unless the old output is removed.
import shutil

OUTPUT_PATH = "numbers_rdd_output.txt"

# Assumes the path resolves on the local filesystem (the session uses
# master("local")) — TODO confirm if a different default filesystem is configured.
shutil.rmtree(OUTPUT_PATH, ignore_errors=True)

numbers_rdd.saveAsTextFile(OUTPUT_PATH)


In [None]:
# Second RDD with the values 16..20, appended to the original numbers via union.
extra_values = [16, 17, 18, 19, 20]
another_rdd = sc.parallelize(extra_values)
combined_rdd = numbers_rdd.union(another_rdd)
print("Combined RDD: ", combined_rdd.collect())


Combined RDD:  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]


In [None]:
# Pair every element of numbers_rdd with every element of another_rdd.
cartesian_rdd = numbers_rdd.cartesian(another_rdd)
all_pairs = cartesian_rdd.collect()
print("Cartesian Product: ", all_pairs)


Cartesian Product:  [(1, 16), (1, 17), (1, 18), (1, 19), (1, 20), (2, 16), (2, 17), (2, 18), (2, 19), (2, 20), (3, 16), (3, 17), (3, 18), (3, 19), (3, 20), (4, 16), (4, 17), (4, 18), (4, 19), (4, 20), (5, 16), (5, 17), (5, 18), (5, 19), (5, 20), (6, 16), (6, 17), (6, 18), (6, 19), (6, 20), (7, 16), (7, 17), (7, 18), (7, 19), (7, 20), (8, 16), (8, 17), (8, 18), (8, 19), (8, 20), (9, 16), (9, 17), (9, 18), (9, 19), (9, 20), (10, 16), (10, 17), (10, 18), (10, 19), (10, 20), (11, 16), (11, 17), (11, 18), (11, 19), (11, 20), (12, 16), (12, 17), (12, 18), (12, 19), (12, 20), (13, 16), (13, 17), (13, 18), (13, 19), (13, 20), (14, 16), (14, 17), (14, 18), (14, 19), (14, 20), (15, 16), (15, 17), (15, 18), (15, 19), (15, 20)]


In [None]:
# RDD elements can be arbitrary Python objects; here each one is a single-entry dict.
dict_records = [{'a': 1}, {'b': 2}, {'c': 3}]
dict_rdd = sc.parallelize(dict_records)
print("Dictionary RDD: ", dict_rdd.collect())


Dictionary RDD:  [{'a': 1}, {'b': 2}, {'c': 3}]


In [None]:
# Count occurrences of each distinct value: pair every element with 1,
# then add up the 1s that share the same key.
data = sc.parallelize([1, 2, 2, 3, 3, 3, 4])
count_rdd = (
    data
    .map(lambda value: (value, 1))
    .reduceByKey(lambda left, right: left + right)
)
print("Unique values with counts: ", count_rdd.collect())


Unique values with counts:  [(1, 1), (2, 2), (3, 3), (4, 1)]


In [None]:
# A comma-separated path list loads file1.txt and file2.txt into a single RDD of lines.
rdd_from_files = sc.textFile("file1.txt,file2.txt")

# Collect every line to the driver and print them.
all_lines = rdd_from_files.collect()
print("RDD from text files:", all_lines)


RDD from text files: ['This is the first file.', 'It contains some text data.', 'Spark is awesome.', 'This is the second file.', 'It also contains text data.', "Let's see how Spark reads it."]


In [None]:
# `take(5)` is an action that returns the first 5 lines of the RDD as a list.
LINES_TO_SHOW = 5
first_five_lines = rdd_from_files.take(LINES_TO_SHOW)
print("First 5 lines of the RDD:", first_five_lines)


First 5 lines of the RDD: ['This is the first file.', 'It contains some text data.', 'Spark is awesome.', 'This is the second file.', 'It also contains text data.']


In [None]:
# Rows of (name, jersey number) used to build the DataFrame.
data = [
    ("sachin", 10),
    ("rohit", 45),
    ("virat", 18),
]

# Build the DataFrame, naming the two columns explicitly.
df = spark.createDataFrame(data, schema=["Name", "Jersey"])

# Render the DataFrame as a text table.
df.show()


+------+------+
|  Name|Jersey|
+------+------+
|sachin|    10|
| rohit|    45|
| virat|    18|
+------+------+



In [None]:
# RDD example: build the low-level RDD of (name, jersey number) tuples once,
# then derive the DataFrame from that RDD instead of repeating the literal.
# Previously rdd_example was created but never used.
rdd_example = sc.parallelize([("sachin", 10), ("rohit", 45), ("virat", 18)])

# createDataFrame accepts an RDD of tuples plus a list of column names.
df_example = spark.createDataFrame(rdd_example, ["Name", "jersey"])
df_example.show()


+------+------+
|  Name|jersey|
+------+------+
|sachin|    10|
| rohit|    45|
| virat|    18|
+------+------+

