# Creating frequency distributions from the BNC corpus

This notebook shows how NLTK's `FreqDist` (frequency distribution) class was used with the BNC corpus to create dictionaries mapping both single-word vocabulary items and multiword expressions (MWEs) to their frequencies in the BNC. These dictionaries were saved as objects with the pickle module, for use by the FrequencyFeature and POSFrequencyFeature classes.

In [None]:
import ast
import csv
import json
import os
import pickle

import nltk
import pandas as pd
from nltk.corpus.reader.bnc import BNCCorpusReader

In [None]:
# Root directory of the BNC XML texts, and the pattern selecting which files are
# read (only paths whose first component is a single letter from A to K).
# NOTE(review): this is a POSIX-style path while the pickle/lexicon cells use
# C:/ paths — confirm which machine layout is the intended one.
BNC_ROOT = "/Users/rowena/Documents/MSC/Project/BNC/2554/download/Texts/"
BNC_FILEIDS = r'[A-K]/\w*/\w*\.xml'
bnc_reader = BNCCorpusReader(root=BNC_ROOT, fileids=BNC_FILEIDS)

In [None]:
#https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file
#as the frequency distribution dictionaries take so long to process, we use pickle to save these objects after creation
def save_obj(obj, name, obj_dir=r'C:/Users/rowena/Documents/MSC/Project/obj/'):
    """Pickle ``obj`` to ``<obj_dir>/<name>.pkl``.

    Parameters
    ----------
    obj : any picklable object
        The object to store (here: the frequency dictionaries).
    name : str
        File name without the ``.pkl`` extension.
    obj_dir : str, optional
        Directory to write into. Defaults to the project's original
        hard-coded object directory, so existing two-argument calls behave
        exactly as before.

    Raises
    ------
    OSError
        If the target file cannot be opened for writing (e.g. the directory
        does not exist).
    """
    path = os.path.join(obj_dir, name + '.pkl')
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name, obj_dir=r'C:/Users/rowena/Documents/MSC/Project/obj/'):
    """Load and return the object pickled at ``<obj_dir>/<name>.pkl``.

    Parameters
    ----------
    name : str
        File name without the ``.pkl`` extension.
    obj_dir : str, optional
        Directory to read from. Defaults to the project's original hard-coded
        object directory, so existing one-argument calls behave as before.

    Returns
    -------
    The unpickled object.

    Raises
    ------
    OSError
        If the file cannot be opened for reading.
    """
    path = os.path.join(obj_dir, name + '.pkl')
    with open(path, 'rb') as f:
        # SECURITY: unpickling can execute arbitrary code; only load files
        # that were produced by save_obj from a trusted run.
        return pickle.load(f)

### Creation of Single Word Frequency Distributions From the BNC

In [None]:
#simple frequency distribution: count every BNC token after lower-casing it
bnc_words = bnc_reader.words()
freqdist_bnc = nltk.FreqDist()
for token in bnc_words:
    freqdist_bnc[token.lower()] += 1

In [None]:
#POS-tagged single word frequency distribution (using BNC's own POS tags)
#each (word, tag) pair keeps its tag but has the word lower-cased before counting
bnc_words_tagged = bnc_reader.tagged_words()
bnc_words_tagged_lower = [(word.lower(), tag) for word, tag in bnc_words_tagged]
freqdist_bnc_tagged_lower = nltk.FreqDist(bnc_words_tagged_lower)

In [None]:
#broader POS-tagged single word frequency distribution (using my mapping BNC's POS tags to my own broader POS tags)
#spreadsheet column A is used as the index; the 'MY CAT' column holds the broader tag
tag_mapping = pd.read_excel('tool/files/tag_mapping.xlsx', sheet_name='Sheet2', usecols='A,B', index_col=0, header=0)
tag_dict = tag_mapping.to_dict()['MY CAT']
text_maptagged_entities = []  # not used by any cell shown in this notebook
#look each tag up by its first three characters and swap in the broader category
bnc_words_tagged_lower_mapped = [
    (word, tag_dict[tag[0:3]])
    for word, tag in bnc_words_tagged_lower
]
freqdist_bnc_maptagged_lower = nltk.FreqDist(bnc_words_tagged_lower_mapped)

In [None]:
#NLTK POS-tagged single word frequency distribution (using NLTK's POS tagger)
#every sentence is tagged by nltk.pos_tag and its (word, tag) pairs collected with the word lower-cased
bnc_sents = bnc_reader.sents()
bnc_words_nltktagged_lower = []
skipped_sents = 0  #sentences dropped because the tagger raised IndexError
for sent in bnc_sents:
    try:
        tagged_sent = nltk.pos_tag(sent)
    except IndexError:
        #best-effort: keep going, but count the loss instead of hiding it
        #(presumably triggered by empty sentences/tokens - TODO confirm)
        skipped_sents += 1
        continue
    bnc_words_nltktagged_lower.extend((word.lower(), tag) for word, tag in tagged_sent)
print('sentences skipped by the NLTK tagger: %d' % skipped_sents)

In [None]:
#count the (lower-cased word, NLTK tag) pairs collected in the previous cell.
#The original read `bnc_words_nltktagged_lower_all`, a name no cell defines, so a
#fresh kernel failed with NameError; the list built above is used instead.
freqdist_bnc_nltktagged_lower = nltk.FreqDist(tuple(item) for item in bnc_words_nltktagged_lower)

### Creation of MultiWord Frequency Distribution From the BNC

In [None]:
#create a list which holds MWEs from the English wiktionary json file. This is our MWE lexicon.
#the file is read line by line, each non-blank line being decoded as one JSON document
json_data = []
#`with` closes the file handle; JSON text is UTF-8, so do not rely on the platform default encoding
with open(r'C:\Users\rowena\Documents\MSC\Project\PVs_Exprs\enwikt.json', "r", encoding="utf-8") as lexicon_file:
    for line in lexicon_file:
        if line.strip():  #tolerate blank lines (e.g. a trailing newline)
            json_data.append(json.loads(line))
#keep only the 'words' field of every entry
mwes = [dict(entry)['words'] for entry in json_data]

In [None]:
#initialise lemmatizer and detokenizer to use in the MWE extracter
#the lemmatizer is used below to reduce verbs to their base form; the detokenizer
#joins a token list back into one string for the MWE substring pre-check
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
#NOTE(review): nltk.tokenize.moses appears to be absent from recent NLTK releases
#(the Moses tools are distributed separately as `sacremoses`) - confirm and pin
#the NLTK version this notebook was run with.
from nltk.tokenize.moses import MosesDetokenizer
detokenizer = MosesDetokenizer()
#helper method to find position of ngrams in sentence
#https://stackoverflow.com/questions/33393402/how-to-find-position-of-an-ngram-in-a-sentence
def ngram_index(words, ngram):
    """Return the position of the first occurrence of ``ngram`` in ``words``.

    ``words`` is split into consecutive n-grams of ``len(ngram)`` tokens and
    the index of the first one equal to ``tuple(ngram)`` is returned.
    Raises ValueError (from ``list.index``) when there is no such n-gram.
    """
    size = len(ngram)
    windows = list(nltk.ngrams(words, size))
    target = tuple(ngram)
    return windows.index(target)

In [None]:
#making the multiword frequency distribution
#
#For every BNC sentence, each MWE in the lexicon is looked for first in the
#original tokens and, failing that, in a copy whose verbs are lemmatized.
#At most one occurrence per MWE per sentence is recorded (as in the original
#logic), and a match only counts if at least one of its tokens has not already
#been claimed by an earlier match in the same sentence.
#
#Fixes relative to the previous version of this cell:
# * POS tags are read from bnc_sents_tagged (they were read from the untagged
#   sentence, so no token ever compared equal to 'VERB')
# * the lemmatized copy is a copy of the current sentence, not of the corpus
# * claimed positions are the ones that actually matched (ngram_index returned
#   the first occurrence, which can differ from the matched one)
# * the inner loop no longer reuses the outer loop variable
# * the duplicated match logic lives in one helper, and the MWE strings are
#   joined once instead of once per sentence
#NOTE: this still compares every sentence against every MWE and is very slow.

bnc_sents = bnc_reader.sents()
bnc_sents_tagged = bnc_reader.tagged_sents()


def _claim_first_match(tokens, mwe, unclaimed):
    """Return the first n-gram of ``tokens`` equal to ``mwe`` that can be claimed.

    ``unclaimed`` is a list of booleans parallel to the sentence; an n-gram may
    be claimed while at least one of its positions is still True. On success
    those positions are set to False in place and the matched tuple is
    returned; otherwise None is returned and ``unclaimed`` is left untouched.
    """
    size = len(mwe)
    target = tuple(mwe)
    for start in range(len(tokens) - size + 1):
        stop = start + size
        if tuple(tokens[start:stop]) == target and any(unclaimed[start:stop]):
            unclaimed[start:stop] = [False] * size
            return target
    return None


#join each MWE into a string once; it is reused for the substring pre-check
mwe_strings = [(' '.join(element), element) for element in mwes]

#extract mwes from text, put in found_ngrams list, and set the index to show where these occur
found_ngrams = []
for sent_tokens, sent_tagged in zip(bnc_sents, bnc_sents_tagged):
    tokenized = list(sent_tokens)
    sentence_index = [True] * len(tokenized)

    #copy of the sentence in which every token tagged 'VERB' is lemmatized as a verb
    tokenized_for_inf = [
        lemmatizer.lemmatize(word, 'v') if tagged[1] == 'VERB' else word
        for word, tagged in zip(tokenized, sent_tagged)
    ]
    joined_string_orig = detokenizer.detokenize(tokenized, return_str=True)
    joined_string_inf = detokenizer.detokenize(tokenized_for_inf, return_str=True)

    #run through lexicon of mwes and check each one against the current sentence
    for joined, element in mwe_strings:
        if joined in joined_string_orig:
            match = _claim_first_match(tokenized, element, sentence_index)
        elif joined in joined_string_inf:
            match = _claim_first_match(tokenized_for_inf, element, sentence_index)
        else:
            continue
        if match is not None:
            found_ngrams.append(match)

In [None]:
#transform all found mwes in the BNC to lower case
#(the loop header previously read `for sublist in found_ngrams=[]:`, which is a
#SyntaxError; it now iterates over the MWEs collected above)
#each MWE stays a tuple so it can be used as a FreqDist key
mylistlower = [tuple(word.lower() for word in sublist) for sublist in found_ngrams]

In [13]:
#frequency distribution over the lower-cased MWE tuples
mwesfreqdist = nltk.FreqDist()
mwesfreqdist.update(mylistlower)

### Creation of Combined Single Word and MWE Frequency Distribution
This is converted to a dictionary object and saved with pickle for use by the FrequencyFeature class

In [None]:
#merge single-word and MWE counts into one plain dict.
#FreqDist.update works in place and returns None, so the previous
#dict(freqdist_bnc.update(...)) raised TypeError and also altered freqdist_bnc;
#the merge is now done on a copy, which makes this cell safe to re-run.
combined_bnc_freqdist = freqdist_bnc.copy()
combined_bnc_freqdist.update(mwesfreqdist)
combined_bnc_freqdist_dict_lower = dict(combined_bnc_freqdist)

In [None]:
#persist the combined single-word + MWE dictionary with pickle
save_obj(obj=combined_bnc_freqdist_dict_lower, name='combined_bnc_freqdist_dict_lower')

### Creation of Combined POS-tagged Single Word and MWE Frequency Distribution
This is converted to a dictionary object and saved with pickle for use by the POSFrequencyFeature class

In [None]:
#pair every MWE with the pseudo POS tag 'MWE' so its key has the same
#(item, tag) shape as the tagged single-word entries; despite its name this is
#a list of ((mwe, 'MWE'), count) pairs
tagged_mwes_dict = [((mwe, 'MWE'), count) for mwe, count in mwesfreqdist.items()]

In [None]:
#merge the NLTK-tagged single-word counts with the 'MWE'-tagged MWE counts.
#dict.update works in place and returns None, so the previous one-liner bound
#None to this name (and None is what was then pickled); build the dict first
#and update it in a second step.
combined_tagged_bnc_freqdict_lower = dict(freqdist_bnc_nltktagged_lower)
combined_tagged_bnc_freqdict_lower.update(dict(tagged_mwes_dict))

In [None]:
#persist the combined POS-tagged single-word + MWE dictionary with pickle
save_obj(obj=combined_tagged_bnc_freqdict_lower, name='combined_tagged_bnc_freqdict_lower')