## Analyzing Bigrams in PySpark

In [1]:
# Display the active SparkSession. `spark` is not created anywhere in this notebook,
# so it is presumably injected by the PySpark kernel/shell — TODO confirm.
spark

### Create spark context

In [2]:
# Take the SparkContext from the session; the RDD calls used later in this
# notebook (sc.textFile, sc.parallelize) go through it.
sc = spark.sparkContext

### Read the input file

In [3]:
# Location of the input text, kept in one named constant so it is easy to change.
# NOTE(review): "Filepath" looks like a placeholder directory — point this at the
# real file before running.
INPUT_PATH = "Filepath\\Text2.txt"

# Load the file as an RDD of text lines and preview the first five.
lines = sc.textFile(INPUT_PATH)
lines.take(5)

['The Project Gutenberg EBook of Democracy In America, Volume 1 (of 2), by ',
 'Alexis de Toqueville',
 '',
 'This eBook is for the use of anyone anywhere at no cost and with',
 'almost no restrictions whatsoever.  You may copy it, give it away or']

### Define function `cleandata` to do the following:

1. Convert all words to lowercase

2. Remove all double quotes 

3. Remove the special characters {",", ".", ";", "?", ":"} 
   from the END of each word 
   
4. If a word ends with "'s", then remove "'s" from that word.
   
5. If a word begins with "*", then delete that word 

In [4]:
def cleandata(y):
    """Normalise one line of text into space-separated, cleaned words.

    The line is split on whitespace and each word is processed in this order:

    1. lowercased, with every double quote removed;
    2. one trailing character from {",", ".", ";", "?", ":"} is removed;
    3. a trailing "'s" is removed;
    4. the word is dropped if it starts with "*" or has become empty.

    Args:
        y: a single line of text (str).

    Returns:
        The surviving words joined by single spaces; '' when no word survives.
    """
    trailing_punct = (",", ".", ";", "?", ":")
    cleaned = []

    for word in y.split():
        word = word.lower().replace('"', '')
        # Only the last character is inspected, so "wait..." becomes "wait..".
        if word and word[-1] in trailing_punct:
            word = word[:-1]
        if word.endswith("'s"):
            word = word[:-2]
        # Drop deleted/emptied words instead of appending '' — appending would
        # leave doubled or leading/trailing spaces in the joined result.
        if not word or word[0] == "*":
            continue
        cleaned.append(word)

    return ' '.join(cleaned)

### Apply function cleandata and return clean_lines rdd 

In [5]:
# Clean every line of the input; cleandata is passed directly as the mapper.
clean_lines = lines.map(cleandata)
clean_lines.take(5)

['the project gutenberg ebook of democracy in america volume 1 (of 2) by',
 'alexis de toqueville',
 '',
 'this ebook is for the use of anyone anywhere at no cost and with',
 'almost no restrictions whatsoever you may copy it give it away or']

### Create final_lines rdd by removing empty lines from clean_lines rdd

In [6]:
# Keep only the lines that still contain something other than whitespace.
final_lines = clean_lines.filter(lambda line: len(line.strip()) > 0)
final_lines.take(5)

['the project gutenberg ebook of democracy in america volume 1 (of 2) by',
 'alexis de toqueville',
 'this ebook is for the use of anyone anywhere at no cost and with',
 'almost no restrictions whatsoever you may copy it give it away or',
 're-use it under the terms of the project gutenberg license included']

### Define function bigram

In [7]:
def bigram(x):
    """Yield every pair of adjacent items of the sequence x, in order.

    A sequence with fewer than two items yields nothing.
    """
    # Pairing x with itself shifted by one gives (x[0], x[1]), (x[1], x[2]), ...
    for first, second in zip(x, x[1:]):
        yield (first, second)

### create bigram_words rdd by applying bigram function

In [8]:
# Split each line on whitespace and emit its adjacent word pairs. Lines are
# processed one at a time, so no bigram spans two lines; flatMap merges the
# per-line pairs into a single RDD.
bigram_words = final_lines.flatMap(lambda line: bigram(line.split()))
bigram_words.take(5)

[('the', 'project'),
 ('project', 'gutenberg'),
 ('gutenberg', 'ebook'),
 ('ebook', 'of'),
 ('of', 'democracy')]

### Generate key value pairs with each bigram as key and 1 as value

In [9]:
# Tag each bigram with a count of 1 so the counts can be summed per key.
pairs = bigram_words.map(lambda pair: (pair, 1))
pairs.take(5)

[(('the', 'project'), 1),
 (('project', 'gutenberg'), 1),
 (('gutenberg', 'ebook'), 1),
 (('ebook', 'of'), 1),
 (('of', 'democracy'), 1)]

### Create freq rdd from pairs rdd with the reduceByKey method to get (bigram, frequency) key-value pairs

In [10]:
# Add up the 1s of identical bigrams to get (bigram, frequency) pairs.
freq = pairs.reduceByKey(lambda left, right: left + right)
freq.take(5)

[(('project', 'gutenberg'), 25),
 (('gutenberg', 'ebook'), 4),
 (('ebook', 'of'), 2),
 (('in', 'america'), 192),
 (('(of', '2)'), 3)]

### 1. Number of unique bigrams

In [11]:
# freq holds one element per distinct bigram (it is the output of reduceByKey),
# so its element count is the number of unique bigrams.
no_unique_bigrams = freq.count()
print(no_unique_bigrams)

71033


### 2. List the top ten most frequent bigrams and their counts
Create sorted_bigrams with the sortBy method to get the (bigram, frequency) pairs in descending order of frequency

In [12]:
# Order the (bigram, frequency) pairs by frequency, highest first.
sorted_bigrams = freq.sortBy(lambda kv: kv[1], ascending=False)

In [13]:
# take(10) returns a plain Python list of the ten most frequent pairs; it is
# turned back into an RDD so that RDD methods can be called on the top ten.
top10_bigrams =sc.parallelize(sorted_bigrams.take(10))
top10_bigrams.collect()

[(('of', 'the'), 3916),
 (('in', 'the'), 1540),
 (('to', 'the'), 1035),
 (('and', 'the'), 783),
 (('it', 'is'), 608),
 (('by', 'the'), 520),
 (('the', 'united'), 476),
 (('of', 'a'), 459),
 (('united', 'states'), 451),
 (('to', 'be'), 440)]

### 3. Cumulative frequency of the top ten bigrams

In [14]:
# Share of all bigram occurrences accounted for by the ten most frequent bigrams.
sum_top10_bigrams = top10_bigrams.values().sum()
# Total occurrences: summed over freq rather than sorted_bigrams. The sum does
# not depend on order, so there is no reason to route it through the sort.
total_bigrams = freq.values().sum()
# float() keeps this a true division even where "/" on two ints would truncate.
freq_top10_bigrams = float(sum_top10_bigrams) / total_bigrams
print(freq_top10_bigrams)

0.058128498763888495


### 4.  Number of bigrams that appear only once

In [15]:
# Count the bigrams whose frequency is exactly 1.
freq_one = freq.values().filter(lambda count: count == 1).count()
print(freq_one)

51947
