# Creating RDDs

In [None]:
#os.getenv('PYSPARK_SUBMIT_ARGS')

In [None]:
import os
#This line is meant to set the JAVA_HOME environment variable to the path of Java version 1.8. This is necessary because Spark requires Java, and you need to ensure the correct version of Java is used.
#The ! before the command indicates that this line is a shell command being run from within the Python script.
!export JAVA_HOME=$(/usr/libexec/java_home -v 1.8)
#SparkConf is used to configure the Spark application.
#SparkContext is the entry point to any Spark functionality. It's needed to create RDDs (Resilient Distributed Datasets) and to execute operations.
from pyspark import SparkConf, SparkContext

#os.environ['PYSPARK_PYTHON'] = '/Library/Frameworks/Python.framework/Versions/3.6/bin/python3'
#os.environ['PYSPARK_PYTHON'] = '/usr/local/bin/python3'
os.environ['PYSPARK_PYTHON'] = '/usr/local/Cellar/python@3.10/3.10.9/bin/python3'
#os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/local/bin/python3'

#SparkSession is the entry point to programming with the Spark SQL module, and it allows you to work with DataFrames and SQL queries.
#master("local") indicates that the Spark application will run locally on your machine, using one thread.
#appName("rdd_demo") sets the name of your Spark application to "rdd_demo".
#getOrCreate() either retrieves an existing Spark session or creates a new one.

from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local").appName("rdd_demo").getOrCreate()
#spark = SparkSession.builder.master("local[1]").appName("rdd_demo").getOrCreate()
#spark = SparkSession.builder.master("local").config("spark.driver.bindAddress", "127.0.0.1").appName("rdd_demo").getOrCreate()
#spark = SparkSession.builder.master("local").config("spark.ui.port", "4041").appName("rdd_demo").getOrCreate()
#spark = SparkSession.builder.master("local").config("spark.ui.reverseProxy", "true").appName("rdd_demo").getOrCreate()


#sc.stop()
#SparkConf() is used to configure the SparkContext. In this case, it's set to run locally with the application name "MinTemperatures".
#SparkContext.getOrCreate() either retrieves an existing SparkContext or creates a new one.
conf = SparkConf().setMaster("local").setAppName("MinTemperatures")
sc = SparkContext.getOrCreate()

24/07/04 14:45:21 WARN Utils: Your hostname, Akashs-MacBook-Air-2.local resolves to a loopback address: 127.0.0.1; using 192.168.55.206 instead (on interface en0)
24/07/04 14:45:21 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/07/04 14:45:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
spark

In [None]:
# 1. flightData2015 = spark\
# This starts the process of reading data into a DataFrame. The spark object refers to the SparkSession created earlier.
# flightData2015 is the variable that will hold the resulting DataFrame.
# 2. .read\
# This accesses the DataFrameReader object, which is used to read data into a DataFrame. It allows you to specify various options related to the format, schema, and source of the data.
# 3. .option("inferSchema", "false")\
# The .option() method allows you to set specific options for how Spark reads the data.
# "inferSchema", "false" specifies that Spark should not automatically infer the schema of the data. Instead, it will treat all columns as strings (text) unless a schema is provided.
# If "inferSchema" were set to "true", Spark would try to infer the data types of each column based on the data.
# 4. .option("header", "true")\
# This option indicates that the first row of the CSV file contains the column names (headers).
# If "header" were set to "false", Spark would treat the first row as regular data instead of column headers.
# 5. .csv("/Users/aakash/Downloads/spark-code/data/flight-data/csv/2015-summary.csv")
# This line specifies the path to the CSV file that you want to read into the DataFrame.
# The csv() method is used to read a CSV file into a DataFrame.
flightData2015 = spark\
  .read\
  .option("inferSchema", "false")\
  .option("header", "true")\
  .csv("/Users/aakash/Downloads/spark-code/data/flight-data/csv/2015-summary.csv")

In [None]:
print(flightData2015)

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: string]


In [None]:
flightData2015

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: string]

In [None]:
type(flightData2015)

pyspark.sql.dataframe.DataFrame

In [None]:
##### read using datasource API


<class 'pyspark.rdd.RDD'>


                                                                                

[Row(dest='United States', source='Romania', count='15')]
256


In [None]:
##### read using sparkcontext
# 1. spth="/Users/aakash/Downloads/spark-code/data/flight-data/csv/2015-summary.csv"
# This line defines a string variable spth that holds the path to the CSV file you want to read.
# 2. sc_flightData2015 = spark.sparkContext.textFile(spth)
# This line reads the CSV file into an RDD (Resilient Distributed Dataset) using the SparkContext's textFile method.
# spark.sparkContext gives you access to the SparkContext from the SparkSession.
# The textFile(spth) method reads the file specified by spth into an RDD. Each line of the file becomes a single element in the RDD.
# 3. print(type(sc_flightData2015))
# This prints the type of sc_flightData2015.
# Since sc_flightData2015 is an RDD, the output will be <class 'pyspark.rdd.RDD'>, indicating that the data is stored as an RDD.
# 4. print(sc_flightData2015.take(2))
# The take(2) method retrieves the first 2 elements (lines) of the RDD sc_flightData2015 and returns them as a list.
# print(sc_flightData2015.take(2)) will print the first 2 lines of the CSV file as strings in a list.
spth="/Users/aakash/Downloads/spark-code/data/flight-data/csv/2015-summary.csv"
sc_flightData2015=spark.sparkContext.textFile(spth)
print(type(sc_flightData2015))
print(sc_flightData2015.take(2))

<class 'pyspark.rdd.RDD'>
['DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count', 'United States,Romania,15']


In [None]:
##### convert pandas file to RDD
#The pandas library is imported with the alias pd. Pandas is a powerful library for data manipulation and analysis in Python.
import pandas as pd
#The variable spth holds the path to the CSV file that you want to read.
spth="/Users/aakash/Downloads/spark-code/data/flight-data/csv/2015-summary.csv"
#pd.read_csv(spth, header=0) reads the CSV file at the specified path into a Pandas DataFrame named pd_flightData2015.
header=0 tells Pandas to use the first row of the CSV file as the column names.
#This prints the type of pd_flightData2015, which will be <class 'pandas.core.frame.DataFrame'> because it is a Pandas DataFrame.
pd_flightData2015=pd.read_csv(spth, header=0)
#pd_flightData2015.head() returns the first five rows of the DataFrame, providing a quick look at the data.
print(type(pd_flightData2015))
#spark.createDataFrame(pd_flightData2015) converts the Pandas DataFrame pd_flightData2015 into a Spark DataFrame.
.rdd converts the resulting Spark DataFrame into an RDD, which is assigned back to pd_flightData2015.
#This prints the type of pd_flightData2015, which will now be <class 'pyspark.rdd.RDD'>, indicating that it has been converted into an RDD.
print(pd_flightData2015.head())
#pd_flightData2015.take(1) retrieves the first element from the RDD and prints it. The output will be a list containing the first row of the data, represented as a tuple.
pd_flightData2015=spark.createDataFrame(pd_flightData2015).rdd
print(type(pd_flightData2015))
print(pd_flightData2015.take(1))

<class 'pandas.core.frame.DataFrame'>
  DEST_COUNTRY_NAME ORIGIN_COUNTRY_NAME  count
0     United States             Romania     15
1     United States             Croatia      1
2     United States             Ireland    344
3             Egypt       United States     15
4     United States               India     62
<class 'pyspark.rdd.RDD'>
[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15)]


In [None]:
#The string "Spark The Definitive Guide : Big Data Processing Made Simple" is a sentence.
# .split(" ") splits this string into a list of words based on spaces as the delimiter.
# myCollection becomes a list of words: ['Spark', 'The', 'Definitive', 'Guide', ':', 'Big', 'Data', 'Processing', 'Made', 'Simple'].
myCollection = "Spark The Definitive Guide : Big Data Processing Made Simple"\
  .split(" ")
#   spark.sparkContext.parallelize(myCollection, 2) creates an RDD from the list myCollection.
# parallelize() is a method used to distribute the elements of the list across a specified number of partitions.
# The second argument 2 specifies the number of partitions to divide the data into. In this case, the RDD will be split into 2 partitions.
words = spark.sparkContext.parallelize(myCollection, 2)
# words.take(10) retrieves the first 10 elements of the RDD and returns them as a list.
# Since the original list myCollection has 10 elements, this will return the entire list: ['Spark', 'The', 'Definitive', 'Guide', ':', 'Big', 'Data', 'Processing', 'Made', 'Simple'].
words.take(10)

['Spark',
 'The',
 'Definitive',
 'Guide',
 ':',
 'Big',
 'Data',
 'Processing',
 'Made',
 'Simple']

In [None]:
##### from a collection of text
# The string "Spark The Definitive Guide : Big Data Processing Made Simple" is split into a list of words based on spaces.
# myCollection becomes a list: ['Spark', 'The', 'Definitive', 'Guide', ':', 'Big', 'Data', 'Processing', 'Made', 'Simple'].
myCollection = "Spark The Definitive Guide : Big Data Processing Made Simple"\
  .split(" ")
# spark.sparkContext.parallelize(myCollection, 2) creates an RDD from the myCollection list.
# The second argument 2 specifies that the RDD should be split into 2 partitions, enabling parallel processing.
words = spark.sparkContext.parallelize(myCollection, 2)
# setName("myWords") assigns the name "myWords" to the words RDD.
# Naming an RDD can be helpful for debugging and understanding the lineage of RDDs when viewing the Spark UI or logs.
words.setName("myWords")
print(words.name()) # myWords
print(type(words))
print(words.take(5))
# words.getNumPartitions() returns the number of partitions in the RDD.
# Since the RDD was created with 2 partitions, this method will return 2
words.getNumPartitions()

myWords
<class 'pyspark.rdd.RDD'>
['Spark', 'The', 'Definitive', 'Guide', ':']


2

In [None]:
### from a range of numbers
myRange = spark.range(1000000000000).toDF("number").rdd.map(lambda row: row[0])
#myRange = spark.range(1000).toDF("number")
print(myRange.take(5))
print(type(myRange))
myRange.getNumPartitions()

[0, 1, 2, 3, 4]
<class 'pyspark.rdd.PipelinedRDD'>


1

# filtering

In [1]:
# date = fields[0] assigns the first element of the list (fields[0]) to the variable date. In the example, this would be '2023-08-11'.
# p_open = fields[1] assigns the second element of the list (fields[1]) to the variable p_open, representing the opening price. In the example, this would be '100.5'.
# p_close = fields[5] assigns the sixth element of the list (fields[5]) to the variable p_close, representing the closing price. In the example, this would be '101.0'.
def parseLine(line):
    fields = line.split(',')
    date = fields[0]
    p_open = fields[1]
    p_close = fields[5]
    return (date, p_open, p_close)

In [None]:
spth="/Users/aakash/Downloads/spark-code/data/RELIANCE.csv"
sdt=spark.sparkContext.textFile(spth)
sdt=sdt.map(parseLine)
sdt.take(2)

[('Date', 'Open', 'Close'), ('2022-01-01', '100', '10000')]

In [None]:
spth="/Users/aakash/Downloads/spark-code/data/RELIANCE.csv"
# spark.read.format("CSV") specifies that the data format is CSV.
# .option("header", "true") tells Spark that the first row of the CSV file contains headers (column names).
# .option("inferSchema", "true") enables automatic inference of the data types for each column.
# .load(spth) loads the data from the specified file path into a DataFrame o_sdt.
o_sdt = spark.read.format("CSV").option("header","true").option("inferSchema", "true") \
    .load(spth)

#     .toDF("Date", "Open", "High", "Low", "Last", "Close", "Volume", "Turnover") renames the columns of the DataFrame with the specified names.
# .rdd converts the DataFrame into an RDD.
# .map(lambda row: (row[0], row[1], row[5])) transforms each row of the RDD. It creates a new RDD where each row is a tuple containing only the Date, Open, and Close columns (corresponding to row[0], row[1], and row[5] respectively).
o_sdt=o_sdt.toDF("Date","Open","High","Low","Last","Close","Volume","Turnover").rdd \
    .map(lambda row: (row[0], row[1], row[5]))

  #o_sdt.count() counts the number of rows (tuples) in the RDD and prints the result
print(o_sdt.count())
print(type(o_sdt))
#print(type(o_sdt)) outputs the type of o_sdt, which is <class 'pyspark.rdd.RDD'> since it was converted to an RDD.
print(o_sdt.take(2))

#.filter(lambda row: row[2] > row[1]) filters the RDD to include only rows where the Close price (row[2]) is greater than the Open price (row[1]).
#The filtered RDD is then stored back in o_sdt.
o_sdt=o_sdt.filter(lambda row: row[2] > row[1])
print(o_sdt.take(5))
print(type(o_sdt))
print(o_sdt.count())

[Stage 10:>                                                         (0 + 1) / 1]                                                                                

5
<class 'pyspark.rdd.PipelinedRDD'>
[(datetime.date(2022, 1, 1), 100, 10000), (datetime.date(2022, 1, 2), 200, 20000)]
[(datetime.date(2022, 1, 1), 100, 10000), (datetime.date(2022, 1, 2), 200, 20000), (datetime.date(2022, 1, 3), 300, 30000), (datetime.date(2022, 1, 4), 400, 40000), (datetime.date(2022, 1, 5), 500, 50000)]
<class 'pyspark.rdd.PipelinedRDD'>
5


## filter function

In [None]:
def HighClose(row):
    ## Discuss
    if row[2] > row[1]:
        return(row)

Notice anything in output below?

In [None]:
# spark.read.format("CSV"): Specifies that the file format is CSV.
# .option("header", "true"): Indicates that the first row of the CSV file contains the header (column names).
# .load(spth): Loads the data from the specified file path spth into a DataFrame o_sdt.
o_sdt = spark.read.format("CSV").option("header","true").load(spth)
# .toDF("Date", "Open", "High", "Low", "Last", "Close", "Volume", "Turnover"): Renames the columns of the DataFrame.
# .rdd: Converts the DataFrame into an RDD.
# .map(lambda row: (row[0], row[1], row[5])): Transforms each row of the RDD into a tuple containing only the Date, Open, and Close columns (corresponding to row[0], row[1], and row[5]).
o_sdt=o_sdt.toDF("Date","Open","High","Low","Last","Close","Volume","Turnover").rdd.map(lambda row: (row[0], row[1], row[5]))
print(o_sdt.count())
# .filter(lambda row: HighClose(row)): Filters the RDD using the HighClose function, which is applied to each row.
# The HighClose function (not defined in the provided code) is expected to take a row (tuple) as input and return True or False based on some condition.
# The RDD o_sdt is updated to only include rows where HighClose(row) returns True.
o_sdt=o_sdt.filter(lambda row: HighClose(row))
print(o_sdt.take(5))
print(type(o_sdt))
print(o_sdt.count())

5
[('2022-01-01', '100', '10000'), ('2022-01-02', '200', '20000'), ('2022-01-03', '300', '30000'), ('2022-01-04', '400', '40000'), ('2022-01-05', '500', '50000')]
<class 'pyspark.rdd.PipelinedRDD'>
5


How about now?

In [None]:
# spark.read.format("CSV"): Specifies that the file format is CSV.
# .option("header", "true"): Indicates that the first row of the CSV file contains the headers (column names).
# .option("inferSchema", "true"): Automatically infers the schema of each column, determining the data types based on the content.
# .load(spth): Loads the data from the specified file path spth into a DataFrame o_sdt.
o_sdt = spark.read.format("CSV").option("header","true").option("inferSchema", "true").load(spth)
# .toDF("Date", "Open", "High", "Low", "Last", "Close", "Volume", "Turnover"): Renames the columns of the DataFrame to the specified names.
# .rdd: Converts the DataFrame into an RDD (Resilient Distributed Dataset).
# .map(lambda row: (row[0], row[1], row[5])): Applies a transformation to each row of the RDD. It creates a new RDD where each row is a tuple containing only the Date, Open, and Close columns:
# row[0] corresponds to the Date.
# row[1] corresponds to the Open price.
# row[5] corresponds to the Close price.
o_sdt=o_sdt.toDF("Date","Open","High","Low","Last","Close","Volume","Turnover").rdd.map(lambda row: (row[0], row[1], row[5]))
print(o_sdt.count())
# .filter(lambda row: HighClose(row)): Filters the RDD by applying the HighClose function to each row.
# The HighClose function (not provided in the code) is expected to take a row (tuple) as input and return True or False based on some condition.
# Only rows where HighClose(row) returns True are retained in the RDD.
o_sdt=o_sdt.filter(lambda row: HighClose(row))
print(o_sdt.take(5))
print(type(o_sdt))
print(o_sdt.count())


# Maps

In [None]:
def to_to_mill(row):
    return (row[0], row[1], row[2], round(row[3],0))

In [None]:
spth="/Users/aakash/Downloads/spark-code/data/RELIANCE.csv"
o_sdt = spark.read.format("CSV").option("header","true").option("inferSchema", "true").load(spth)
o_sdt=o_sdt.toDF("Date","Open","High","Low","Last","Close","Volume","Turnover").rdd.map(lambda row: (row[0], row[1], row[5], row[7]))
print(o_sdt.take(2))
o_sdt=o_sdt.map(to_to_mill)
print(type(o_sdt))

[(datetime.date(2022, 1, 1), 100, 10000, None), (datetime.date(2022, 1, 2), 200, 20000, None)]
<class 'pyspark.rdd.PipelinedRDD'>


# flatMap

In [None]:
def Func(lines):
    lines = lines.lower()
    lines = lines.split(" ")
    return lines

#sc.stop()
#conf = SparkConf().setMaster("local").setAppName("wordcount")
#sc = SparkContext.getOrCreate()

spth="/Users/aakash/Downloads/spark-code/data/sherlock_holmes.txt"
input_file = sc.textFile(spth)

In [None]:
print(input_file.take(1))

['A SCANDAL IN BOHEMIA']


In [None]:
rdd1 = input_file.flatMap(Func)

In [None]:
rdd1.take(5)

['a', 'scandal', 'in', 'bohemia', 'i.']

In [None]:
rdd1.map(lambda x: (x,1)).take(5)

[('a', 1), ('scandal', 1), ('in', 1), ('bohemia', 1), ('i.', 1)]

In [None]:
# rdd1: This is the original RDD containing elements to be processed.
# .map(lambda x: (x, 1)): This transformation maps each element x of rdd1 to a tuple (x, 1).
# This essentially creates a new RDD where each element is a tuple, with the original element as the key and 1 as the value.
# Example: If rdd1 contains [a, b, a, c], the resulting RDD will be [(a, 1), (b, 1), (a, 1), (c, 1)].
rdd2=rdd1 \
.map(lambda x: (x,1)) \

# .groupByKey(): Groups the tuples by their key (i.e., the original element) and aggregates the values for each key.
# After this step, each key (original element) will have an iterable of values associated with it. For example, [(a, [1, 1]), (b, [1]), (c, [1])].
.groupByKey() \
.mapValues(sum) \
.map(lambda x: (x[1],x[0])) \
.sortByKey(False)

rdd2.take(5)

                                                                                

[(556, 'the'), (321, ''), (305, 'and'), (302, 'of'), (299, 'to')]

## Reduce

In [None]:
spark.sparkContext.parallelize(range(1,5)).reduce(lambda x, y: x+y)

In [None]:
Reduce works on 2 consecutive inputs
and performs incremental operations on them

Input: [1,2,3,4]
Reduce step 1 -
    Input: 1,2
    Output: 1+2

Reduce step 2 -
    Input: 3
    Output: 3+3= 6

Reduce step 3 -
    Input: 4
    Output: 6+4 = 10