# Get Frequency of Words using RDD

In [1]:
from operator import add 

In [3]:
# Create (or reuse) the Spark session for this notebook

from pyspark.sql import SparkSession

# Builder options: master "local[*]", application name "ShortNSimple"
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("ShortNSimple")
    .getOrCreate()
)

# SparkContext of the session; the RDD is loaded through it further down
sc = spark.sparkContext

spark

Data Source: https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences

# Load the dataset

In [4]:
# Raw string literal: the backslashes are part of the path text, so they must not be
# parsed as Python escapes ('\ ', '\[' and '\]' are invalid escape sequences that a
# normal string literal only tolerates with a warning). The resulting value is unchanged.
# NOTE(review): the backslashes presumably escape the spaces and brackets for the
# path handling inside sc.textFile - confirm.
# NOTE(review): absolute, machine-specific location; point this at your own copy of the dataset.
data_path = r'file:////mnt/e/Work\ \[2019-05-26\]/Tutorials/Short\ N\ Simple/datasets/amazon_cells_labelled.txt'

In [6]:
# Load the text file at data_path as an RDD of lines, then look at the first two.
reviews_data = sc.textFile(name=data_path)
reviews_data.take(num=2)

['So there is no way for me to plug it in here in the US unless I go by a converter.\t0',
 'Good case, Excellent value.\t1']

In [7]:
# Try the split on one raw record: the tab separates the sentence from its label.
'So there is no way for me to plug it in here in the US unless I go by a converter.\t0'.split(sep='\t')

['So there is no way for me to plug it in here in the US unless I go by a converter.',
 '0']

# Split each record using map

In [8]:
def preprocessor(line):
    """Split one raw dataset line into its sentence and its label.

    Args:
        line: A record of the form '<sentence><TAB><label>'.

    Returns:
        A (sentence, label) tuple of strings.

    Raises:
        ValueError: If the line contains no tab separator.
    """
    # Split on the LAST tab only, so a tab inside the sentence text cannot
    # break the two-way unpacking below.
    sent, label = line.rsplit('\t', 1)
    return (sent, label)

In [9]:
# Run preprocessor over every line, giving one (sentence, label) tuple per record.
sent_label_data = reviews_data.map(f=preprocessor)
sent_label_data.take(num=5)

[('So there is no way for me to plug it in here in the US unless I go by a converter.',
  '0'),
 ('Good case, Excellent value.', '1'),
 ('Great for the jawbone.', '1'),
 ('Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!',
  '0'),
 ('The mic is great.', '1')]

In [10]:
# Inline variant of the same tab split: each record becomes a [sentence, label] list.
sent_label_data2 = reviews_data.map(lambda record: record.split('\t'))
sent_label_data2.take(num=5)

[['So there is no way for me to plug it in here in the US unless I go by a converter.',
  '0'],
 ['Good case, Excellent value.', '1'],
 ['Great for the jawbone.', '1'],
 ['Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!',
  '0'],
 ['The mic is great.', '1']]

# Flatten the records using flatMap

In [11]:
# With map, each record yields exactly one element: the list of space-separated
# tokens of its sentence (index 0 of the record).
bag_of_words = sent_label_data2.map(lambda record: record[0].split(' '))
print(bag_of_words.take(num=10))

[['So', 'there', 'is', 'no', 'way', 'for', 'me', 'to', 'plug', 'it', 'in', 'here', 'in', 'the', 'US', 'unless', 'I', 'go', 'by', 'a', 'converter.'], ['Good', 'case,', 'Excellent', 'value.'], ['Great', 'for', 'the', 'jawbone.'], ['Tied', 'to', 'charger', 'for', 'conversations', 'lasting', 'more', 'than', '45', 'minutes.MAJOR', 'PROBLEMS!!'], ['The', 'mic', 'is', 'great.'], ['I', 'have', 'to', 'jiggle', 'the', 'plug', 'to', 'get', 'it', 'to', 'line', 'up', 'right', 'to', 'get', 'decent', 'volume.'], ['If', 'you', 'have', 'several', 'dozen', 'or', 'several', 'hundred', 'contacts,', 'then', 'imagine', 'the', 'fun', 'of', 'sending', 'each', 'of', 'them', 'one', 'by', 'one.'], ['If', 'you', 'are', 'Razr', 'owner...you', 'must', 'have', 'this!'], ['Needless', 'to', 'say,', 'I', 'wasted', 'my', 'money.'], ['What', 'a', 'waste', 'of', 'money', 'and', 'time!.']]


In [12]:
# Derive the flattened word list from sent_label_data2 rather than re-binding
# bag_of_words to a transformation of itself. Re-running this cell therefore always
# yields words; the self-referential form would flatten the already-flat strings
# into single characters on a second run.
bag_of_words = (
    sent_label_data2
    .map(lambda row: row[0].split(' '))   # one token list per sentence
    .flatMap(lambda tokens: tokens)       # merge the token lists into one flat RDD
)
print(bag_of_words.take(50))

['So', 'there', 'is', 'no', 'way', 'for', 'me', 'to', 'plug', 'it', 'in', 'here', 'in', 'the', 'US', 'unless', 'I', 'go', 'by', 'a', 'converter.', 'Good', 'case,', 'Excellent', 'value.', 'Great', 'for', 'the', 'jawbone.', 'Tied', 'to', 'charger', 'for', 'conversations', 'lasting', 'more', 'than', '45', 'minutes.MAJOR', 'PROBLEMS!!', 'The', 'mic', 'is', 'great.', 'I', 'have', 'to', 'jiggle', 'the', 'plug']


In [13]:
# flatMap does the split and the flattening in one step: every token of every
# sentence becomes its own element.
bag_of_words2 = sent_label_data2.flatMap(lambda record: record[0].split(' '))
print(bag_of_words2.take(num=50))

['So', 'there', 'is', 'no', 'way', 'for', 'me', 'to', 'plug', 'it', 'in', 'here', 'in', 'the', 'US', 'unless', 'I', 'go', 'by', 'a', 'converter.', 'Good', 'case,', 'Excellent', 'value.', 'Great', 'for', 'the', 'jawbone.', 'Tied', 'to', 'charger', 'for', 'conversations', 'lasting', 'more', 'than', '45', 'minutes.MAJOR', 'PROBLEMS!!', 'The', 'mic', 'is', 'great.', 'I', 'have', 'to', 'jiggle', 'the', 'plug']


In [17]:
# Plain-dict aside: keys and values are exposed as two separate views.
d = dict(a=1, b=3)
print(d.keys())
print(d.values())

dict_keys(['a', 'b'])
dict_values([1, 3])


# Group by words and get counts using reduceByKey

In [15]:
# Pair every token with a count of 1 so the counts can be summed per word.
bag_of_words_w_freq = bag_of_words2.map(lambda token: (token, 1))
print(bag_of_words_w_freq.take(num=50))

[('So', 1), ('there', 1), ('is', 1), ('no', 1), ('way', 1), ('for', 1), ('me', 1), ('to', 1), ('plug', 1), ('it', 1), ('in', 1), ('here', 1), ('in', 1), ('the', 1), ('US', 1), ('unless', 1), ('I', 1), ('go', 1), ('by', 1), ('a', 1), ('converter.', 1), ('Good', 1), ('case,', 1), ('Excellent', 1), ('value.', 1), ('Great', 1), ('for', 1), ('the', 1), ('jawbone.', 1), ('Tied', 1), ('to', 1), ('charger', 1), ('for', 1), ('conversations', 1), ('lasting', 1), ('more', 1), ('than', 1), ('45', 1), ('minutes.MAJOR', 1), ('PROBLEMS!!', 1), ('The', 1), ('mic', 1), ('is', 1), ('great.', 1), ('I', 1), ('have', 1), ('to', 1), ('jiggle', 1), ('the', 1), ('plug', 1)]


In [20]:
# Merge the pairs that share a word by adding their counts together.
word_freq = bag_of_words_w_freq.reduceByKey(func=add)
print(word_freq.take(num=50))

[('there', 11), ('is', 238), ('no', 19), ('way', 7), ('in', 84), ('go', 6), ('converter.', 1), ('Good', 13), ('case,', 4), ('value.', 3), ('Great', 30), ('jawbone.', 1), ('Tied', 1), ('lasting', 2), ('more', 13), ('than', 27), ('45', 1), ('minutes.MAJOR', 1), ('PROBLEMS!!', 1), ('The', 97), ('have', 72), ('line', 2), ('right', 8), ('decent', 2), ('several', 7), ('dozen', 1), ('contacts,', 1), ('of', 117), ('them', 9), ('are', 40), ('must', 4), ('this!', 1), ('wasted', 2), ('money.', 8), ('What', 6), ('money', 7), ('time!.', 1), ('And', 3), ('sound', 32), ('quality', 32), ('was', 87), ('very', 80), ('impressed', 8), ('when', 20), ('battery', 27), ('battery.', 3), ('two', 12), ('seperated', 1), ('5+', 1), ('started', 5)]


# Sort by word, then by frequency

In [27]:
# sortByKey orders the pairs on their key, i.e. the word itself, so descending
# order here is reverse-alphabetical rather than by count.
word_freq_ordered = word_freq.sortByKey(ascending=False)
print(word_freq_ordered.take(num=100))

[('zero', 1), ('z500a', 1), ('your', 32), ('you.', 1), ('you,', 1), ("you'd", 1), ('you!', 2), ('you', 52), ('yet.', 1), ('yet', 1), ('yell', 1), ('years.Great', 1), ('years,', 1), ('years', 7), ('year.', 1), ('year,', 1), ('year', 5), ('wrong.First', 1), ('wrong', 2), ("wouldn't", 2), ('would.', 2), ('would', 30), ('worthwhile', 1), ('worthless.', 1), ('worthless,', 1), ('worthless', 1), ('worth', 3), ('worst', 8), ('worn-out', 1), ('works..', 1), ('works.', 1), ('works,', 1), ('works!)', 1), ('works', 28), ('working.', 1), ('working!!!!!!!!!', 1), ('working', 9), ('worked.', 2), ('worked', 15), ('work.', 7), ('work,', 2), ('work', 23), ('word', 1), ('wooden', 1), ('wood!).', 1), ('wont', 1), ('wonderfully', 1), ('wonder', 1), ("won't.", 1), ("won't", 1), ('wobbly', 1), ('without', 8), ('within', 6), ('with.', 3), ('with,', 1), ('with', 106), ('wit', 1), ('wish', 1), ('wise', 1), ('wireless', 2), ('wired', 2), ("wire's", 1), ('wiping', 1), ('winner', 1), ('window', 1), ('wind-resistan

In [30]:
# Same ordering expressed with sortBy: the sort key is element 0 of each pair (the word).
word_freq_ordered = word_freq.sortBy(lambda pair: pair[0], ascending=False)
print(word_freq_ordered.take(num=100))

[('zero', 1), ('z500a', 1), ('your', 32), ('you.', 1), ('you,', 1), ("you'd", 1), ('you!', 2), ('you', 52), ('yet.', 1), ('yet', 1), ('yell', 1), ('years.Great', 1), ('years,', 1), ('years', 7), ('year.', 1), ('year,', 1), ('year', 5), ('wrong.First', 1), ('wrong', 2), ("wouldn't", 2), ('would.', 2), ('would', 30), ('worthwhile', 1), ('worthless.', 1), ('worthless,', 1), ('worthless', 1), ('worth', 3), ('worst', 8), ('worn-out', 1), ('works..', 1), ('works.', 1), ('works,', 1), ('works!)', 1), ('works', 28), ('working.', 1), ('working!!!!!!!!!', 1), ('working', 9), ('worked.', 2), ('worked', 15), ('work.', 7), ('work,', 2), ('work', 23), ('word', 1), ('wooden', 1), ('wood!).', 1), ('wont', 1), ('wonderfully', 1), ('wonder', 1), ("won't.", 1), ("won't", 1), ('wobbly', 1), ('without', 8), ('within', 6), ('with.', 3), ('with,', 1), ('with', 106), ('wit', 1), ('wish', 1), ('wise', 1), ('wireless', 2), ('wired', 2), ("wire's", 1), ('wiping', 1), ('winner', 1), ('window', 1), ('wind-resistan

In [28]:
# Sort on element 1 of each pair (the count), largest first: most frequent words.
word_freq_ordered = word_freq.sortBy(lambda pair: pair[1], ascending=False)
print(word_freq_ordered.take(num=100))

[('the', 414), ('and', 307), ('I', 303), ('is', 238), ('a', 207), ('to', 195), ('it', 176), ('this', 143), ('my', 130), ('of', 117), ('for', 114), ('phone', 110), ('with', 106), ('The', 97), ('not', 91), ('was', 87), ('on', 86), ('in', 84), ('very', 80), ('have', 72), ('that', 71), ('It', 61), ('good', 53), ('This', 53), ('you', 52), ('had', 43), ('as', 42), ('but', 41), ('are', 40), ('has', 33), ('great', 33), ('sound', 32), ('quality', 32), ('one', 32), ('from', 32), ('your', 32), ('so', 32), ('it.', 31), ('Great', 30), ('would', 30), ('like', 30), ('headset', 29), ('phone.', 28), ("I've", 28), ('works', 28), ('than', 27), ('battery', 27), ('use', 27), ('all', 27), ('be', 25), ('recommend', 24), ('product', 24), ('-', 24), ('at', 23), ('work', 23), ('or', 23), ('really', 22), ('get', 22), ('up', 21), ('when', 20), ('out', 20), ('only', 20), ('am', 20), ('product.', 20), ("I'm", 20), ('no', 19), ('ear', 19), ('me', 19), ('any', 19), ('2', 19), ('an', 18), ('can', 18), ('well', 18), ('

In [29]:
# Sort on the count again, this time smallest first: least frequent words.
word_freq_ordered = word_freq.sortBy(lambda pair: pair[1], ascending=True)
print(word_freq_ordered.take(num=100))

[('converter.', 1), ('jawbone.', 1), ('Tied', 1), ('45', 1), ('minutes.MAJOR', 1), ('PROBLEMS!!', 1), ('dozen', 1), ('contacts,', 1), ('this!', 1), ('time!.', 1), ('seperated', 1), ('5+', 1), ('odd,', 1), ('EVERYONE', 1), ('BE', 1), ('Good!.', 1), ('clicks', 1), ('wonder', 1), ('directions,', 1), ('loved', 1), ('misleading.', 1), ('yet', 1), ('Pocket', 1), ('mobile', 1), ('couldnt', 1), ('ideal', 1), ('whose', 1), ('contract', 1), ('AC', 1), ('book', 1), ('phone.Battery', 1), ('680.', 1), ('worthless', 1), ('garbage', 1), ('mind', 1), ('ARGUING', 1), ('REGARDING', 1), ('DROPPED', 1), ('WE', 1), ('RETURNED', 1), ('TWO', 1), ('DAYS.', 1), ('Disappointed', 1), ('bulky.', 1), ('real-world', 1), ('useful', 1), ('machine', 1), ('i.e.', 1), ('DISAPPOINTED.', 1), ('regarding', 1), ('Essentially', 1), ('faceplates', 1), ('elegant', 1), ('seriously.', 1), ('clearly.', 1), ('drawback', 1), ('activated', 1), ('suddenly', 1), ('iPODs', 1), ('fairly', 1), ('hearing', 1), ('person', 1), ('advertised'