<a href="https://colab.research.google.com/github/Sujata018/NLP/blob/main/FindPhrases.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk.corpus
from nltk.collocations import *

In [None]:
# Fetch the 'abc' corpus into the local nltk_data directory; the cells below analyse it.
nltk.download("abc")
# Final expression of the cell: the notebook echoes a preview of the corpus tokens.
nltk.corpus.abc.words()

[nltk_data] Downloading package abc to /root/nltk_data...
[nltk_data]   Unzipping corpora/abc.zip.


['PM', 'denies', 'knowledge', 'of', 'AWB', 'kickbacks', ...]

In [None]:
# Find two-word phrases (bigrams), keeping only frequent ones and ranking them by PMI.
#
# PMI (Pointwise Mutual Information) compares how often x and y actually occur
# together with how often they would co-occur if they were independent:
#
#     PMI(x, y) = log( P(x, y) / (P(x) * P(y)) )
#
# A high score means the pair appears together far more often than chance predicts.
# (It is not a conditional probability.)
bigram_measures = nltk.collocations.BigramAssocMeasures()  # supplies the pmi scoring function

bgFinder = BigramCollocationFinder.from_words(nltk.corpus.abc.words())  # count bigrams in the corpus
bgFinder.apply_freq_filter(500)  # discard bigrams that appear < 500 times

# Keep this call as the last expression of the cell so the notebook displays its result.
# (A trailing bare string literal would be displayed instead of the bigram list.)
bgFinder.nbest(bigram_measures.pmi, 10)  # the 10 bigrams with the best pmi scores

# Observation: the result is dominated by punctuation and stopword fragments
# such as '"', "'" and ',"'. The only valid bigrams in the output are
# 'per cent', 'he said', 'has been' and 'have been'.

[('per', 'cent'),
 ('he', 'said'),
 (',"', 'he'),
 ('"', 'We'),
 ('has', 'been'),
 ("'", 's'),
 ("'", 've'),
 ("'", 't'),
 ("'", 're'),
 ('have', 'been')]

In [None]:
# Fetch NLTK's stopword lists so that stopwords can be excluded from the candidate phrases.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Same bigram search as above, but bigrams containing a stopword or a very short
# token are excluded as well.
# The frequency threshold is reduced from 500 to 50, because the number of bigrams
# drops drastically once stopwords are discarded.
bigram_measures = nltk.collocations.BigramAssocMeasures()

bgFinder = BigramCollocationFinder.from_words(nltk.corpus.abc.words())
bgFinder.apply_freq_filter(50)  # discard bigrams that appeared < 50 times in the corpus

ignored_words = nltk.corpus.stopwords.words('english')  # list of stopwords in the English language
# The filter runs once per token, so test membership against a set rather than a list.
ignored_word_set = frozenset(ignored_words)

# Discard bigrams containing a word shorter than 3 characters or a stopword
# (this excludes 'I', 'we' etc.).
bgFinder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_word_set)

# Keep this call as the last expression of the cell so the notebook displays its result.
# (A trailing bare string literal would be displayed instead of the bigram list.)
bgFinder.nbest(bigram_measures.pmi, 10)

# The output bigrams are valid phrases.
# So, next, extend the length of the phrases to 3, 4 etc.


[('Primary', 'Industries'),
 ('United', 'States'),
 ('Northern', 'Territory'),
 ('single', 'desk'),
 ('chief', 'executive'),
 ('Prime', 'Minister'),
 ('Farmers', 'Federation'),
 ('Cole', 'inquiry'),
 ('Peter', 'McGauran'),
 ('journal', 'Nature')]

In [None]:
from itertools import chain
from operator import itemgetter
from nltk.metrics.association import QuadgramAssocMeasures

# Collect frequent phrases of two, three and four words, drop every phrase that is
# merely a fragment of a longer identified phrase, and print the 20 most frequent.

# Association-measure helpers. The ranking below is by raw frequency and does not
# use them; they are kept defined under their original names.
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
fourgram_measures = QuadgramAssocMeasures()

bgFinder = BigramCollocationFinder.from_words(nltk.corpus.abc.words())
bgFinder.apply_freq_filter(50)   # discard bigrams that appeared < 50 times in corpus

tgFinder = TrigramCollocationFinder.from_words(nltk.corpus.abc.words())
tgFinder.apply_freq_filter(30)   # discard trigrams that appeared < 30 times in corpus

fgFinder = QuadgramCollocationFinder.from_words(nltk.corpus.abc.words())
fgFinder.apply_freq_filter(10)   # discard quadgrams that appeared < 10 times in corpus

ignored_words = nltk.corpus.stopwords.words('english')  # list of stopwords in the English language
# The filter runs once per token, so test membership against a set rather than a list.
ignored_word_set = frozenset(ignored_words)


def _is_ignored(w):
    """Return True for words shorter than 3 characters and for stopwords ('I', 'we' etc.)."""
    return len(w) < 3 or w.lower() in ignored_word_set


# Apply the same word filter to the bigram, trigram and quadgram finders.
for finder in (bgFinder, tgFinder, fgFinder):
    finder.apply_word_filter(_is_ignored)

# Concatenate all (phrase, count) pairs in a single list.
# NOTE: `x.items() or y.items() or z.items()` does NOT concatenate: `or` evaluates to
# its first truthy operand, so only the bigrams would be kept. chain() joins all three.
d = list(chain(bgFinder.ngram_fd.items(),
               tgFinder.ngram_fd.items(),
               fgFinder.ngram_fd.items()))


def _occurs_within(shorter, longer):
    """Return True if `shorter` is a contiguous run of words inside the strictly longer `longer`."""
    span = len(shorter)
    if span >= len(longer):
        return False  # equal or longer phrases cannot be fragments (also skips self-comparison)
    return any(longer[start:start + span] == shorter
               for start in range(len(longer) - span + 1))


all_phrases = list(map(itemgetter(0), d))  # just the word tuples, without their counts

# This list holds only unique phrases: e.g. if 'South Wales' and 'New South Wales' are
# both identified, 'South Wales' is discarded because it is part of the longer phrase.
unique_phrases = [
    entry for entry in d
    if not any(_occurs_within(entry[0], other) for other in all_phrases)
]

# Most frequent first; ties are broken by the phrase itself so the order is deterministic.
print(sorted(unique_phrases, key=lambda t: (-t[1], t[0]))[:20])

# The printed list is the 20 most frequent valid phrases identified,
# with two, three or four words.


[(('per', 'cent'), 555), (('New', 'South'), 421), (('South', 'Wales'), 421), (('Federal', 'Government'), 322), (('years', 'ago'), 283), (('Western', 'Australia'), 231), (('South', 'Australia'), 211), (('researchers', 'say'), 211), (('New', 'Zealand'), 177), (('last', 'year'), 164), (('climate', 'change'), 150), (('million', 'years'), 150), (('single', 'desk'), 136), (('Northern', 'Territory'), 132), (('scientists', 'say'), 131), (('first', 'time'), 123), (('Farmers', 'Federation'), 98), (('journal', 'Nature'), 95), (('next', 'year'), 93), (('Association', 'says'), 91)]


'\n20 most frequent valid phrases identified, with two, three or four words\n'