In [None]:
!pip install pyspark



In [None]:
#Setting spark configuration and spark context
from pyspark import SparkContext, SparkConf

# Run Spark locally under the application name "First App".
conf = SparkConf().setAppName("First App").setMaster("local")

# getOrCreate() returns the context already running in this kernel if there is one,
# so re-running this cell does not raise "Cannot run multiple SparkContexts at once".
# When an existing context is reused, `conf` is not applied to it.
sc = SparkContext.getOrCreate(conf=conf)

In [None]:
# Display the configuration object and the context created from it.
conf, sc

(<pyspark.conf.SparkConf at 0x787373efb050>,
 <SparkContext master=local appName=First App>)

In [None]:
# Sample integers for the map/filter examples, handed to Spark as an RDD.
data = [
    1, 2, 4, 5, 6, 8, 10,
]
rdd = sc.parallelize(data)

In [None]:
# Square every element of the RDD and show the collected result.
def fn(x):
    """Return x raised to the power of two."""
    return x ** 2


sq = rdd.map(fn)
sq.collect()

[1, 4, 16, 25, 36, 64, 100]

In [None]:
#Performing filter operation on the data
# Keep only the even values, then fetch both the values and their count while the
# context is still alive.
fil = rdd.filter(lambda x: x % 2 == 0)
even_values, even_count = fil.collect(), fil.count()

# Stop the context for this section. sc.stop() returns None, so it must not be the
# last line of the cell: a notebook only displays the value of the final expression.
sc.stop()

even_values, even_count

### Word Count

In [None]:
from pyspark import SparkConf, SparkContext

# Run Spark locally under the application name "Word Count".
conf = SparkConf().setAppName("Word Count").setMaster("local")

# getOrCreate() returns the context already running in this kernel if there is one,
# so re-running this cell does not raise "Cannot run multiple SparkContexts at once".
# When an existing context is reused, `conf` is not applied to it.
sc = SparkContext.getOrCreate(conf=conf)

In [None]:
# Display the active SparkContext for the Word Count section.
sc

In [None]:
data = ["Great Wilbraham is a Neolithic causewayed enclosure, an archaeological site near the village of Great Wilbraham in Cambridgeshire, England.",
        "The enclosure is about 170 metres (560 ft) across, and covers about 2 hectares (4.9 acres).",
        "Causewayed enclosures were built in England from before 3700 BC until c. 3500 BC;",
        "they are characterized by the enclosure of an area with ditches that are interrupted by gaps, or causeways.","Their purpose is not known;","they may have been settlements, meeting places, or ritual sites.","The Great Wilbraham enclosure was first identified from aerial photographs in 1972.",
        "An excavation was begun in 1975 by David Clarke, with a planned five-year research programme, but he died in 1976 and his results remained unpublished.",
        "His archive of finds and records was reanalysed in the 2000s.",
        "The site was rich in finds, including Neolithic flint, pottery from periods from the Neolithic to the present day, and animal bone.",
        "The site has been protected as a scheduled monument since 1976."]

In [None]:
# Create an RDD from the list of sentences defined above, one element per string.
rdd = sc.parallelize(data)

In [None]:
#Performing word count by the following steps:
#1. Separating all the words and creating a single list containing them all.
#2. Creating a key-value pair for each word, where the word is the key and the value is 1.
#3. Grouping the key-value pairs by the key (word), and adding all the values (count).
#
#Tokens are split on whitespace only, so punctuation and letter case are kept:
#"enclosure," and "enclosure", or "The" and "the", are counted as different words.

# split() with no argument splits on any run of whitespace and never produces empty
# strings, whereas split(" ") would count "" as a word wherever two spaces are adjacent.
words_rdd = rdd.flatMap(lambda sentence: sentence.split())
word_pairs_rdd = words_rdd.map(lambda word: (word, 1))
word_count = word_pairs_rdd.reduceByKey(lambda a, b: a + b)

# Bring the (word, count) pairs back to the driver as a Python list.
wc = word_count.collect()

In [None]:
# Print one "word: count" line for every pair collected from the word count.
for pair in wc:
    word, count = pair
    print(f"{word}: {count}")

Great: 3
Wilbraham: 3
is: 3
a: 3
Neolithic: 3
causewayed: 1
enclosure,: 1
an: 2
archaeological: 1
site: 3
near: 1
the: 5
village: 1
of: 3
in: 7
Cambridgeshire,: 1
England.: 1
The: 4
enclosure: 3
about: 2
170: 1
metres: 1
(560: 1
ft): 1
across,: 1
and: 4
covers: 1
2: 1
hectares: 1
(4.9: 1
acres).: 1
Causewayed: 1
enclosures: 1
were: 1
built: 1
England: 1
from: 4
before: 1
3700: 1
BC: 1
until: 1
c.: 1
3500: 1
BC;: 1
they: 2
are: 2
characterized: 1
by: 3
area: 1
with: 2
ditches: 1
that: 1
interrupted: 1
gaps,: 1
or: 2
causeways.: 1
Their: 1
purpose: 1
not: 1
known;: 1
may: 1
have: 1
been: 2
settlements,: 1
meeting: 1
places,: 1
ritual: 1
sites.: 1
was: 4
first: 1
identified: 1
aerial: 1
photographs: 1
1972.: 1
An: 1
excavation: 1
begun: 1
1975: 1
David: 1
Clarke,: 1
planned: 1
five-year: 1
research: 1
programme,: 1
but: 1
he: 1
died: 1
1976: 1
his: 1
results: 1
remained: 1
unpublished.: 1
His: 1
archive: 1
finds: 1
records: 1
reanalysed: 1
2000s.: 1
rich: 1
finds,: 1
including: 1
flint,: 

In [None]:
# Shut down the Word Count context; the next section creates its own SparkContext.
sc.stop()

### Temperature Conversion

In [None]:
from pyspark import SparkConf, SparkContext

# Run Spark locally under the application name "Temperature Conversion".
conf = SparkConf().setAppName("Temperature Conversion").setMaster("local")

# getOrCreate() returns the context already running in this kernel if there is one,
# so re-running this cell does not raise "Cannot run multiple SparkContexts at once".
# When an existing context is reused, `conf` is not applied to it.
sc = SparkContext.getOrCreate(conf=conf)

In [None]:
# Display the active SparkContext for the Temperature Conversion section.
sc

In [None]:
# Temperatures in degrees Celsius to be converted, including one negative value.
data = [
    0, 20, 30, 40, 50, 100, -25,
]

In [None]:
# Create an RDD from the list of temperatures defined above.
rdd = sc.parallelize(data)

In [None]:
# Convert each Celsius value to Fahrenheit with a map over the RDD, keeping the
# original reading next to the converted one, then collect the pairs for printing.
def to_celsius_fahrenheit_pair(x):
    """Return (x, x * 9/5 + 32): the Celsius input paired with its Fahrenheit value."""
    return (x, (x * 9 / 5) + 32)


rdd2 = rdd.map(to_celsius_fahrenheit_pair)
res = rdd2.collect()

In [None]:
# Print every collected (celsius, fahrenheit) pair as "<celsius>C: <fahrenheit>F".
for pair in res:
    t, f = pair
    print(f"{t}C: {f}F")

0C: 32.0F
20C: 68.0F
30C: 86.0F
40C: 104.0F
50C: 122.0F
100C: 212.0F
-25C: -13.0F


In [None]:
# Shut down the Temperature Conversion context now that the results have been printed.
sc.stop()