### Import the required libraries, then create a SparkContext

In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.1.2.tar.gz (212.4 MB)
[K     |████████████████████████████████| 212.4 MB 64 kB/s 
[?25hCollecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 66.0 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.2-py2.py3-none-any.whl size=212880768 sha256=02d74c7acd672b9e09c2313b200f040564b5d12d6f80041dd292e2680b315637
  Stored in directory: /root/.cache/pip/wheels/a5/0a/c1/9561f6fecb759579a7d863dcd846daaa95f598744e71b02c77
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.2


In [None]:
import pyspark
from pyspark import SparkContext
# Reuse the active SparkContext if one already exists. Calling SparkContext()
# a second time (e.g. when this cell is re-run) raises
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()

### Create and display an RDD from the following list

In [None]:
list = [('JK', 22), ('V', 24), ('Jimin',24), ('RM', 25), ('J-Hope', 25), ('Suga', 26), ('Jin', 27)]

In [None]:
rdd = sc.parallelize(list)
rdd.collect()

[('JK', 22),
 ('V', 24),
 ('Jimin', 24),
 ('RM', 25),
 ('J-Hope', 25),
 ('Suga', 26),
 ('Jin', 27)]

### Read the sample1.txt file into an RDD and display the first 4 elements

In [None]:
# Build an RDD from the text file; every element of the RDD is one line of the file.
txtfile = sc.textFile(name='sample1.txt')
# Echo the RDD object itself (its description, not its contents).
txtfile

sample1.txt MapPartitionsRDD[19] at textFile at NativeMethodAccessorImpl.java:0

In [None]:
# Return the first 4 elements (lines) of the RDD as a Python list.
txtfile.take(4)

['Utilitatis causa amicitia est quaesita.',
 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. ',
 'Collatio igitur ista te nihil iuvat. Honesta oratio, Socratica, Platonis etiam. Primum in nostrane potestate est, quid meminerimus? ',
 'Duo Reges: constructio interrete. ']

### Count the total number of rows in the RDD

In [None]:
# Number of elements in the RDD; each element is one line of the file,
# and blank lines are counted too.
txtfile.count()

7

### Create a function to convert the data to lower case and split it

In [None]:
def to_lower(line):
    """Return `line` with all characters converted to lower case."""
    return line.lower()


# Apply the conversion to every line and bring the result back to the driver.
txtfile.map(to_lower).collect()

['utilitatis causa amicitia est quaesita.',
 'lorem ipsum dolor sit amet, consectetur adipiscing elit. ',
 'collatio igitur ista te nihil iuvat. honesta oratio, socratica, platonis etiam. primum in nostrane potestate est, quid meminerimus? ',
 'duo reges: constructio interrete. ',
 'quid, si etiam iucunda memoria est praeteritorum malorum? si quidem, inquit, tollerem, sed relinquo. an nisi populari fama?',
 '',
 'quamquam id quidem licebit iis existimare, qui legerint. summum a vobis bonum voluptas dicitur. at hoc in eo m. refert tamen, quo modo. quid sequatur, quid repugnet, vident. iam id ipsum absurdum, maximum malum neglegi.']

In [None]:
def split_words(line):
    """Split `line` on runs of whitespace and return the list of words."""
    return line.split()


# map keeps one list per input line, so a blank line yields an empty list.
txtfile.map(split_words).collect()

[['Utilitatis', 'causa', 'amicitia', 'est', 'quaesita.'],
 ['Lorem',
  'ipsum',
  'dolor',
  'sit',
  'amet,',
  'consectetur',
  'adipiscing',
  'elit.'],
 ['Collatio',
  'igitur',
  'ista',
  'te',
  'nihil',
  'iuvat.',
  'Honesta',
  'oratio,',
  'Socratica,',
  'Platonis',
  'etiam.',
  'Primum',
  'in',
  'nostrane',
  'potestate',
  'est,',
  'quid',
  'meminerimus?'],
 ['Duo', 'Reges:', 'constructio', 'interrete.'],
 ['Quid,',
  'si',
  'etiam',
  'iucunda',
  'memoria',
  'est',
  'praeteritorum',
  'malorum?',
  'Si',
  'quidem,',
  'inquit,',
  'tollerem,',
  'sed',
  'relinquo.',
  'An',
  'nisi',
  'populari',
  'fama?'],
 [],
 ['Quamquam',
  'id',
  'quidem',
  'licebit',
  'iis',
  'existimare,',
  'qui',
  'legerint.',
  'Summum',
  'a',
  'vobis',
  'bonum',
  'voluptas',
  'dicitur.',
  'At',
  'hoc',
  'in',
  'eo',
  'M.',
  'Refert',
  'tamen,',
  'quo',
  'modo.',
  'Quid',
  'sequatur,',
  'quid',
  'repugnet,',
  'vident.',
  'Iam',
  'id',
  'ipsum',
  'absurdu

### Filter the stopwords from the previous text

In [None]:
# Words to remove from the text in the next cell.
# Hint: you may need to use flatMap
stopwords = [
    'a', 'all', 'the', 'as', 'is', 'am', 'an', 'and',
    'be', 'been', 'from', 'had', 'I', 'I’d', 'why', 'with',
]

In [None]:
# Split every line into words FIRST, then drop the stopwords. Filtering the raw
# lines would compare each whole line against the stopword list, so no
# individual word would ever be removed.
# Matching is exact and case-sensitive: 'a' is removed, while 'An' is kept.
newfile = (
    txtfile
    .flatMap(lambda line: line.split())
    .filter(lambda word: word not in stopwords)
)
# `newfile` is now an RDD of words, so it can be collected directly.
newfile.collect()

['Utilitatis',
 'causa',
 'amicitia',
 'est',
 'quaesita.',
 'Lorem',
 'ipsum',
 'dolor',
 'sit',
 'amet,',
 'consectetur',
 'adipiscing',
 'elit.',
 'Collatio',
 'igitur',
 'ista',
 'te',
 'nihil',
 'iuvat.',
 'Honesta',
 'oratio,',
 'Socratica,',
 'Platonis',
 'etiam.',
 'Primum',
 'in',
 'nostrane',
 'potestate',
 'est,',
 'quid',
 'meminerimus?',
 'Duo',
 'Reges:',
 'constructio',
 'interrete.',
 'Quid,',
 'si',
 'etiam',
 'iucunda',
 'memoria',
 'est',
 'praeteritorum',
 'malorum?',
 'Si',
 'quidem,',
 'inquit,',
 'tollerem,',
 'sed',
 'relinquo.',
 'An',
 'nisi',
 'populari',
 'fama?',
 'Quamquam',
 'id',
 'quidem',
 'licebit',
 'iis',
 'existimare,',
 'qui',
 'legerint.',
 'Summum',
 'a',
 'vobis',
 'bonum',
 'voluptas',
 'dicitur.',
 'At',
 'hoc',
 'in',
 'eo',
 'M.',
 'Refert',
 'tamen,',
 'quo',
 'modo.',
 'Quid',
 'sequatur,',
 'quid',
 'repugnet,',
 'vident.',
 'Iam',
 'id',
 'ipsum',
 'absurdum,',
 'maximum',
 'malum',
 'neglegi.']

### Filter the words starting with ‘c’

In [None]:
# Tokenise on whitespace, keep only tokens that begin with a lower-case 'c',
# and collect the survivors into a Python list on the driver.
splitted = (
    newfile
    .flatMap(lambda line: line.split())
    .filter(lambda word: word.startswith('c'))
    .collect()
)

In [None]:
# Display the collected result: a plain Python list of the words starting with 'c'.
splitted

['causa', 'consectetur', 'constructio']

### Reduce the data by key and sum it (use the data from the following list)

In [None]:
list = [('JK', 22), ('V', 24), ('Jimin',24), ('RM', 25)
        , ('J-Hope', 25), ('Suga', 26), ('Jin', 27)
       , ('J-Hope', 12), ('Suga', 25), ('Jin', 34)
       , ('JK', 32), ('V', 44), ('Jimin',14), ('RM', 35)]
# Hint: use reduceByKey

In [None]:
rdd = sc.parallelize(list)
rdd.reduceByKey(lambda a,b : (a+b)).collect()

[('Suga', 51),
 ('Jin', 61),
 ('JK', 54),
 ('V', 68),
 ('Jimin', 38),
 ('RM', 60),
 ('J-Hope', 37)]

### Create some key-value pair RDDs

In [None]:
# Two small pair RDDs to join; key 'c' exists only in rdd2.
rdd1 = sc.parallelize([
    ('a', 2),
    ('b', 3),
])
rdd2 = sc.parallelize([
    ('a', 9),
    ('b', 7),
    ('c', 10),
])


### Perform a join operation on the RDDs (rdd1, rdd2)

In [None]:
# Join the two RDDs on their keys. Each result element has the form
# (key, (value_from_rdd1, value_from_rdd2)); only keys present in both RDDs appear.
rdd1.join(rdd2).collect()

[('b', (3, 7)), ('a', (2, 9))]