In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488493 sha256=ceacb68ac3ab19e6613b1831626661f2dc67d86d97b990db707476ed616907a0
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder \
    .appName("RDD and DataFrame Manipulation") \
    .getOrCreate()

In [4]:
rdd = spark.sparkContext.textFile("text8.unknown")

In [5]:
type(rdd)

In [6]:
import numpy as np

In [7]:
from pyspark.sql import SparkSession


# Create a SparkSession
spark = SparkSession.builder \
    .appName("RDD Task 1") \
    .getOrCreate()

# Generate 100 random numbers in the range 0 to 10 using numpy
np.random.seed(10)  # Set seed for reproducibility
random_numbers = np.random.randint(0, 11, size=100)
print(random_numbers)

# Create an RDD using the parallelize function
rdd = spark.sparkContext.parallelize(random_numbers)

# Calculate the frequency of each number (0 - 10) using appropriate function of RDD
frequency_counts = rdd.map(lambda x: (x, 1)).reduceByKey(lambda a, b: a + b).sortByKey()

# Collect the results
results = frequency_counts.collect()

# Print the results
print("Number\tFrequency")
for number, frequency in results:
    print(f"{number}\t{frequency}")

# Stop the SparkSession
spark.stop()

[ 9  4  0  1  9  0  1 10  8  9  0 10  8  6  4  3  0  4  6  8 10  1  8  4
  1  3  6  5  3  9  6  9  1  9  4  2  6  7  8 10  8  9  2  0  6  7  8  1
  7  1  4 10  0  8  5  4  7  8  8  2  6  2  8  8  6  6  5 10  6  0  0  6
  9  1  8 10  9  1  2  8  9  9  5  0  2  7  3  0  4  2  0  3  3  1  2  5
  9  0 10  1]
Number	Frequency
0	12
1	11
2	8
3	6
4	8
5	5
6	11
7	5
8	14
9	12
10	8


In [9]:
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder \
    .appName("RDD Task 2") \
    .getOrCreate()

# Create an RDD using the text8 dataset
text_rdd = spark.sparkContext.textFile("text8.unknown")

# Split each line into words and flatten the list of words
words_rdd = text_rdd.flatMap(lambda line: line.split())

# Calculate the frequency of each word
word_frequencies = words_rdd.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)

# Filter the RDD to get the frequencies of words containing the letter 'a'
words_with_a_frequencies = word_frequencies.filter(lambda pair: 'a' in pair[0])

# Collect and print the results
results = words_with_a_frequencies.collect()
for word, frequency in results:
    print(f"Word: {word}\tFrequency: {frequency}")

# Stop the SparkSession
spark.stop()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Word: mackinac	Frequency: 1
Word: tatara	Frequency: 1
Word: vasco	Frequency: 2
Word: zakim	Frequency: 7
Word: moveable	Frequency: 3
Word: viaducts	Frequency: 1
Word: metalsmithing	Frequency: 1
Word: premarked	Frequency: 1
Word: svita	Frequency: 1
Word: falkner	Frequency: 1
Word: kurna	Frequency: 1
Word: nard	Frequency: 6
Word: cooporation	Frequency: 1
Word: catan	Frequency: 2
Word: gameboard	Frequency: 2
Word: falkener	Frequency: 1
Word: mineola	Frequency: 1
Word: parlett	Frequency: 1
Word: podcast	Frequency: 2
Word: hread	Frequency: 1
Word: lampwork	Frequency: 3
Word: spacers	Frequency: 1
Word: delica	Frequency: 1
Word: beader	Frequency: 1
Word: aiko	Frequency: 1
Word: glassmakers	Frequency: 1
Word: swarovski	Frequency: 3
Word: kiffa	Frequency: 1
Word: rudraksha	Frequency: 1
Word: malas	Frequency: 1
Word: glassworking	Frequency: 1
Word: beadmaking	Frequency: 1
Word: branchiopoda	Frequency: 1
Word: daphnia	Frequency: 1
Wo

In [10]:
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder \
    .appName("Iris DataFrame Task") \
    .getOrCreate()

# Read the iris JSON data into a DataFrame
iris_df = spark.read.json("iris.json")

# Calculate Pearson Correlation between petalLength and petalWidth
correlation = iris_df.stat.corr("petalLength", "petalWidth")

print("Pearson Correlation between petalLength and petalWidth:", correlation)

# Show columns sepalLength, sepalWidth, and species for rows where petalLength >= 1.4
filtered_df = iris_df.filter(iris_df["petalLength"] >= 1.4).select("sepalLength", "sepalWidth", "species")
filtered_df.show()

# Stop the SparkSession
spark.stop()

Pearson Correlation between petalLength and petalWidth: 0.9626417223780231
+-----------+----------+-------+
|sepalLength|sepalWidth|species|
+-----------+----------+-------+
|        5.1|       3.5| setosa|
|        4.9|       3.0| setosa|
|        4.6|       3.1| setosa|
|        5.0|       3.6| setosa|
|        5.4|       3.9| setosa|
|        4.6|       3.4| setosa|
|        5.0|       3.4| setosa|
|        4.4|       2.9| setosa|
|        4.9|       3.1| setosa|
|        5.4|       3.7| setosa|
|        4.8|       3.4| setosa|
|        4.8|       3.0| setosa|
|        5.7|       4.4| setosa|
|        5.1|       3.5| setosa|
|        5.7|       3.8| setosa|
|        5.1|       3.8| setosa|
|        5.4|       3.4| setosa|
|        5.1|       3.7| setosa|
|        5.1|       3.3| setosa|
|        4.8|       3.4| setosa|
+-----------+----------+-------+
only showing top 20 rows

