In [1]:
import os

os.environ["JAVA_HOME"] = "/home/linuxbrew/.linuxbrew/opt/openjdk@11"

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("BasicSparkLab").getOrCreate()

# Check the SparkContext
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/14 17:23:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
sc

In [4]:
data = [21, 27, 43, 40, 75, 61, 70, 8, 9, 100, 11]
firstRDD = sc.parallelize(data)

In [5]:
print("First element of data:", data[0])
print("First element of firstRDD:", firstRDD.first())

First element of data: 21


[Stage 0:>                                                          (0 + 1) / 1]

First element of firstRDD: 21


                                                                                

In [6]:
print("Take 5 elements from firstRDD:", firstRDD.take(5))
print("First element using first():", firstRDD.first())

Take 5 elements from firstRDD: [21, 27, 43, 40, 75]
First element using first(): 21


In [7]:
secondRDD = firstRDD.map(lambda x: x + 123)
print("Elements of secondRDD:", secondRDD.collect())

Elements of secondRDD: [144, 150, 166, 163, 198, 184, 193, 131, 132, 223, 134]


In [8]:
max_firstRDD = firstRDD.max()
min_secondRDD = secondRDD.min()
print("Max element of firstRDD:", max_firstRDD)
print("Min element of secondRDD:", min_secondRDD)

Max element of firstRDD: 100
Min element of secondRDD: 131


In [9]:
thirdRDD = secondRDD.filter(lambda x: x > 150)
print("Elements of thirdRDD:", thirdRDD.collect())

Elements of thirdRDD: [166, 163, 198, 184, 193, 223]


In [10]:
data_path = os.path.join(os.getcwd(), "src", "homework_1", "data.txt")
text_rdd = sc.textFile(data_path)
print("Contents of data.txt:", text_rdd.collect())

Contents of data.txt: ['a sample of text', 'maybe number 5 or 15']


## Romeo & Juliet

In [11]:
data_path = os.path.join(os.getcwd(), "src", "homework_1", "romeo-juliet.txt")
romeo_juliet_rdd = sc.textFile(data_path)

In [12]:
line_count = romeo_juliet_rdd.count()
word_count = romeo_juliet_rdd.flatMap(lambda line: line.split()).count()
print("Number of lines:", line_count)
print("Number of words:", word_count)

Number of lines: 4853
Number of words: 28983


In [13]:
longest_phrase = romeo_juliet_rdd.reduce(lambda a, b: a if len(a) > len(b) else b)
print("Longest phrase:", longest_phrase)

Longest phrase: End of the Project Gutenberg EBook of Romeo and Juliet, by William Shakespeare


In [14]:
word_counts = (
    romeo_juliet_rdd.flatMap(lambda line: line.split())
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
    .sortBy(lambda x: x[1], ascending=False)
)
top_10_words = word_counts.take(10)
print("Top 10 most frequent words:", top_10_words)

Top 10 most frequent words: [('the', 762), ('I', 549), ('and', 539), ('to', 522), ('of', 485), ('a', 453), ('in', 330), ('is', 322), ('my', 310), ('with', 274)]


In [15]:
romeo_count = romeo_juliet_rdd.filter(lambda line: "Rom." in line).count()
juliet_count = romeo_juliet_rdd.filter(lambda line: "Jul." in line).count()
print("Romeo speaks:", romeo_count, "times")
print("Juliet speaks:", juliet_count, "times")

Romeo speaks: 163 times
Juliet speaks: 117 times


## Integral Approximation

In [17]:
import numpy as np

# Parameters
a, b = 0, 10
n = 1000  # Adjust n for precision
delta_x = (b - a) / n


# Define function
def f(x):
    return x**2 - 3


# Create RDD with equally spaced x values
x_values = sc.parallelize(np.linspace(a, b, n))

# Calculate integral approximation
integral_approximation = x_values.map(lambda x: f(x) * delta_x).sum()
print("Approximation of the integral:", integral_approximation)

Approximation of the integral: 303.50016683350015
