In [1]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession

In [2]:
# Create an instance of SparkSession

spark = SparkSession  \
        .builder  \
        .appName('Sample Demo')  \
        .getOrCreate()

In [3]:
# Create an RDD[String], which represents all input 
# records; each record becomes an RDD element

records = spark.sparkContext.textFile("data\\sample1.txt")
records.collect()

['Red,Fox,is,fast']

In [4]:
# Convert each element of the RDD to lowercase
# x denotes a single element of the RDD
# records: source RDD[String]
# records_lowercase :  target RDD[String]

records_lowercase = records.map(lambda x : x.lower())
records_lowercase.collect()

['red,fox,is,fast']

In [5]:
# Split each record into a list of words
# records_lowercase : source RDD[String]
# words : target RDD[String]

words = records_lowercase.flatMap(lambda x: x.split(','))
words.collect()
# Here all the elements of the source RDD is first splitted by commas,
# then flattened using the flatMap transformation.

['red', 'fox', 'is', 'fast']

In [6]:
# Keep words with a length greater than 2
# x denotes a word
# words : source RDD[String]
# filtered : target RDD[String]

filtered = words.filter(lambda x: len(x) > 2)
filtered.collect()

['red', 'fox', 'fast']

In [7]:
# As you can observe, Spark transformations are high-level, powerful, and simple.
# Spark is by nature distributed and parallel: your input data is partitioned and can be
# processed by transformations (such as mappers, filters, and reducers) in parallel in a
# cluster environment. In a nutshell, to solve a data analytics problem in PySpark, you
# read data and represent it as an RDD or DataFrame (depending on the nature of the
# data format), then write a set of transformations to convert your data into the desired
# output. Spark automatically partitions your DataFrames and RDDs and distributes
# the partitions across different cluster nodes. Partitions are the basic units of parallelism
# in Spark. Parallelism is what allows developers to perform tasks on hundreds of
# computer servers in a cluster in parallel and independently. A partition in Spark is a
# chunk (a logical division) of data stored on a node in the cluster. DataFrames and
# RDDs are collections of partitions. Spark has a default data partitioner for RDDs and
# DataFrames, but you may override that partitioning with your own custom
# programming.

# Creating RDDS

In [9]:
# Method 1:

spark_session = SparkSession.builder.appName('Type1').getOrCreate()
spark_context = spark_session.sparkContext
print(spark_context)

<SparkContext master=local[*] appName=Sample Demo>


In [10]:
# Method 2:
from pyspark import SparkContext

sc = SparkContext('local', 'myapp')
print(sc)

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=Sample Demo, master=local[*]) created by getOrCreate at C:\Users\Boom\AppData\Local\Temp\ipykernel_7072\1696457726.py:6 