In [1]:
from pyspark.sql import SparkSession
# Create the session for this notebook (getOrCreate reuses one that is already
# running), then take its SparkContext, which the RDD cells below work through.
spark = (
    SparkSession.builder
    .appName("RDD Demo")
    .getOrCreate()
)
sc = spark.sparkContext


In [2]:
# Problem 1: Word Count with Filtering (Key-Value RDD)
# Objective: Count the number of times each word appears, excluding stopwords.

# Steps:

# Load a list of sentences into an RDD.
# Split each sentence into words.
# Remove common stopwords like "is", "the", "a", "an", etc.
# Create key-value pairs (word, 1).
# Use reduceByKey to get the final word counts.

# Transformations: flatMap, filter, map, reduceByKey
# Actions: collect, take

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
# Hold the stopwords in a set: the `not in` test below runs once for every
# word of every sentence, and a set answers it with a hash lookup instead of
# scanning a sequence from the start each time.
stop_words = set(stopwords.words('english'))

rdd = sc.parallelize([
    "It is a very good idea to be on time.",
    "She was in the house with them.",
    "You should have been there before him.",
    "They are doing it for all of us.",
    "I will not be the one to do it.",
])

# Per sentence: drop '.', ',' and '!', lower-case, split on whitespace, and
# keep only the words that are not stopwords. flatMap flattens the
# per-sentence lists into a single RDD of words.
filtered_rdd = rdd.flatMap(
    lambda line: [
        word
        for word in line.replace('.', '').replace(',', '').replace('!', '').lower().split()
        if word not in stop_words
    ]
)

# Pair every surviving word with 1, then sum the ones per word.
individual_word_count = filtered_rdd.map(lambda word: (word, 1))
final_count = individual_word_count.reduceByKey(lambda a, b: a + b)
print(final_count.collect())

# Alternative: the same pipeline written with a named function instead of a lambda

# def filter_stop_words(sentence_line):
#     filtered = []
#     cleaned_line = sentence_line.replace('.', '').replace(',', '').replace('!', '').lower()
#     words = cleaned_line.split()
#     for word in words:
#         if word not in stop_words:
#             filtered.append(word)
#     return filtered

# rdd = sc.parallelize([
#                      "It is a very good idea to be on time.",
#                      "She was in the house with them.",
#                      "You should have been there before him.",
#                      "They are doing it for all of us.",
#                      "I will not be the one to do it."
#                     ])

# filtered_rdd = rdd.flatMap(filter_stop_words)
# individual_word_count = filtered_rdd.map(lambda word : (word,1))
# final_count = individual_word_count.reduceByKey(lambda a,b : a + b)
# print(final_count.collect())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[('house', 1), ('good', 1), ('time', 1), ('one', 1), ('idea', 1), ('us', 1)]


In [3]:
# Problem 2: Average Score per Student (Key-Value RDD)
# Objective: Calculate average score of each student from a list of (name, score).

# Input:
# data = [("Alice", 80), ("Bob", 90), ("Alice", 70), ("Bob", 85), ("Charlie", 60)]

# Steps:

# Map to key-value: (name, (score, 1))
# Use reduceByKey to sum scores and counts: (name, (total_score, count))
# Map to calculate average.

# Transformations: map, reduceByKey
# Actions: collect, takeOrdered

def _pair_with_count(record):
    """Turn (name, score) into (name, (score, 1)) so scores and counts reduce together."""
    name, score = record
    return (name, (score, 1))


def _add_pairs(left, right):
    """Add two (score_total, count) pairs element-wise."""
    return (left[0] + right[0], left[1] + right[1])


def _to_average(entry):
    """Turn (name, (score_total, count)) into (name, score_total / count)."""
    name, (score_total, count) = entry
    return (name, score_total / count)


data = [("Alice", 80), ("Bob", 90), ("Alice", 70), ("Bob", 85), ("Charlie", 60)]
rdd = sc.parallelize(data)

key_value = rdd.map(_pair_with_count)
key_value_reduce = key_value.reduceByKey(_add_pairs)
averages = key_value_reduce.map(_to_average)

print(averages.collect())
# Default ordering compares the (name, average) tuples themselves.
print(averages.takeOrdered(3))
# Negating the average puts the highest average first.
print(averages.takeOrdered(3, key=lambda entry: -entry[1]))


[('Alice', 75.0), ('Bob', 87.5), ('Charlie', 60.0)]
[('Alice', 75.0), ('Bob', 87.5), ('Charlie', 60.0)]
[('Bob', 87.5), ('Alice', 75.0), ('Charlie', 60.0)]


In [4]:
# Problem 3: Frequency of Each Number in List (List RDD)
# Objective: From a list of numbers, count frequency of each number and sort in descending order of frequency.

# Input:
# numbers = [5, 3, 4, 5, 2, 3, 5, 3, 4]

# Steps:

# Convert to (number, 1)
# Use reduceByKey to count occurrences
# Swap to (count, number) and sort descending
# Return top 3 frequent numbers

# Transformations: map, reduceByKey, map, sortByKey
# Actions: take, collect

numbers_rdd = sc.parallelize([5, 3, 4, 5, 2, 3, 5, 3, 4])

# Pair every number with 1, then add the ones up per number.
freq_of_num = numbers_rdd.map(lambda number: (number, 1))
final_freq = freq_of_num.reduceByKey(lambda left, right: left + right)

print("Before swapping, result is in format (number,count)")
print(final_freq.collect())

print("After swapping, result is in format (count,number)")
# Flip each pair so the count becomes the key that sortByKey orders on.
swap = final_freq.map(lambda pair: (pair[1], pair[0]))
print(swap.collect())

print("Sorting in descending")
# Highest counts first; take(3) then returns the first three pairs of that ordering.
sorted_rdd = swap.sortByKey(ascending=False)
print(sorted_rdd.take(3))

# print(sorted_rdd.collect())


Before swapping, result is in format (number,count)
[(2, 1), (3, 3), (4, 2), (5, 3)]
After swapping, result is in format (count,number)
[(1, 2), (3, 3), (2, 4), (3, 5)]
Sorting in descending
[(3, 3), (3, 5), (2, 4)]


In [5]:
# Final cell: stop the SparkContext obtained in the first cell now that all
# three problems have run.
sc.stop()