## Scrape BNC

In [None]:
from nltk.corpus.reader.bnc import BNCCorpusReader
import os
import pandas as pd
import pickle
import nltk
import statistics
import re

from nltk.tokenize import RegexpTokenizer
import string

In [None]:
# fetch the Punkt tokenizer data; nltk.word_tokenize is called further down in this notebook
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# these are the 4 sub-directories of BNC
subcorpora_dir = ["aca", "dem", "fic", "news"]

# one list of plain-text sentences per sub-corpus, in the order of subcorpora_dir
bnc_sentences = []

for subcorpora in subcorpora_dir:
    # NOTE(review): this path starts with ".data/" whereas the pickle at the end
    # of the notebook is written under "./data/" -- confirm which one is intended.
    DIR = ".data/misc/BNC_texts/Texts/{}".format(subcorpora)

    # this is a nltk built-in class to handle BNC
    bnc_reader = BNCCorpusReader(root=DIR, fileids=r'[A-K]/\w*/\w*\.xml')

    # get all the file names in the subdirectory; sorted so that the order of the
    # extracted sentences does not depend on the order os.walk lists the files in.
    # Only bare file names are collected (no sub-directory prefix).
    list_of_file_ids = sorted(
        filename
        for _root, _dirs, files in os.walk(DIR)
        for filename in files
    )

    # in this, the normal tokens along with the extra-informed tags are given
    pos_tagged_sents = bnc_reader.tagged_sents(fileids=list_of_file_ids)

    # keep only the token of every (token, tag) pair and rebuild each sentence
    # as a single space-separated string
    sentences = [
        ' '.join(pair[0] for pair in tagged_sentence)
        for tagged_sentence in pos_tagged_sents
    ]

    bnc_sentences.append(sentences)

In [None]:
# merge the per-sub-corpus sentence lists into one flat list, keeping their order
flat_bnc_sentences = []
for subcorpus_sentences in bnc_sentences:
    flat_bnc_sentences.extend(subcorpus_sentences)

In [None]:
# mean number of tokens per sentence, as a quick insight into the scraped corpus
lengths = [len(nltk.word_tokenize(sentence)) for sentence in flat_bnc_sentences]

sentence_average_len = statistics.mean(lengths)
print(sentence_average_len)

13.897688331736559


In [None]:
# eyeball the first 500 raw sentences before any preprocessing is applied
flat_bnc_sentences[:500]

['CHAPTER 8',
 'Polymers in Solution',
 '8.1 Thermodynamics of polymer solutions',
 'The interaction of long chain molecules with liquids is of considerable interest from both a practical and theoretical viewpoint .',
 'For linear and branched polymers , liquids can usually be found which will dissolve the polymer completely to form a homogeneous solution , whereas cross-linked networks will only swell when in contact with compatible liquids .',
 'In this chapter we shall deal with linear or branched polymers and treat the swelling of networks in chapter 14 .',
 'When an amorphous polymer is mixed with a suitable solvent , it disperses in the solvent and behaves as though it too is a liquid .',
 'In a good solvent , classed as one which is highly compatible with the polymer , the liquid-polymer interactions expand the polymer coil , from its unperturbed dimensions , in proportion to the extent of these interactions .',
 'In a ‘ poor ’ solvent , the interactions are fewer and coil expan

## Preprocess BNC

In [None]:
# since the BNC corpus contained many mathematics-specific sentences, we chose to control that by eliminating sentences containing numbers
def delete_num_sentences(text):
    """Return ``text`` if it contains no numeric token, otherwise ``False``.

    The sentence is split on whitespace. A token counts as a number when it
    consists only of digits once every '.' and ',' has been removed from it,
    so forms such as ``8.1`` or ``1,000`` are recognised as numbers too.

    Parameters
    ----------
    text : str
        A single sentence.

    Returns
    -------
    str or bool
        The unchanged ``text`` when no token is numeric, ``False`` otherwise.
        An empty ``text`` is returned as-is and is therefore also falsy.
    """
    for word in text.split():
        # some numbers were in the form x.x or x,x
        if word.replace('.', '').replace(',', '').isdigit():
            # one numeric token is enough to reject the whole sentence
            return False

    # there is no number in the sentence
    return text

In [None]:
# examine if words with non-English characters exist
def check_all_english(text):
    """Return ``text`` if every word uses only ASCII letters, otherwise ``False``.

    Words are the maximal ``\\w+`` runs of ``text``. Words made up purely of
    digits are ignored; every other word must consist solely of ASCII letters
    once UTF-8 encoded, so accented letters, underscores and mixed
    letter/digit tokens all make the sentence fail.

    Parameters
    ----------
    text : str
        A single sentence.

    Returns
    -------
    str or bool
        The unchanged ``text`` when all words pass, ``False`` otherwise.
        An empty ``text`` is returned as-is and is therefore also falsy.
    """
    # re.findall on a token pattern yields the same tokens as
    # RegexpTokenizer(r'\w+').tokenize, without building a tokenizer per call
    for word in re.findall(r'\w+', text):
        if word.isdigit():
            continue
        # bytes.isalpha() is True only for ASCII letters, so any multi-byte
        # (non-ASCII) character makes the check fail
        if not word.encode().isalpha():
            return False

    return text

In [None]:
# Corpus preprocessing: drop unwanted sentences, then normalise the remaining ones
text_corpus = []
for sen in flat_bnc_sentences:

    # skip sentences containing digits (the helper returns False for those)
    sen = delete_num_sentences(sen)
    if not sen:
        continue

    # skip sentences containing non-english words
    sen = check_all_english(sen)
    if not sen:
        continue

    # skip sentences that are a substring of string.punctuation,
    # e.g. a sentence consisting of a single punctuation mark
    if sen in string.punctuation:
        continue

    # clean surrounding whitespace
    sen = sen.strip()

    # remove urls; the result has to be assigned back to `sen`, otherwise the
    # substitution is silently discarded and the urls stay in the corpus
    sen = re.sub(r'https?://\S+', '', sen)

    # remove the padding inside parentheses
    sen = sen.replace("( ", "(").replace(" )", ")")

    # remove whitespace before punctuation
    sen = sen.replace(" .", ".").replace(" ,", ",").replace(" !", "!").replace(" ?", "?")

    # collapse doubled punctuation left over from tokenisation
    sen = sen.replace(", ,", ",").replace(",',", ",'").replace(",,", ",").replace("..", ".").replace("!!", "!").replace("??", "?")

    # remove content inside parentheses (usually unnecessary information for our cause)
    sen = re.sub(r'\([^)]*\)', '', sen)

    # remove big spaces (raw string: '\s' is not a valid escape in a plain
    # string literal) and the blanks a removed url/parenthesis may leave at the ends
    sen = re.sub(r'\s{2,}', " ", sen).strip()

    text_corpus.append(sen)

# remove empty elements of the list
text_corpus = list(filter(None, text_corpus))

In [None]:
# eyeball the first 500 sentences after filtering and punctuation clean-up
text_corpus[:500]

['Polymers in Solution',
 'The interaction of long chain molecules with liquids is of considerable interest from both a practical and theoretical viewpoint.',
 'For linear and branched polymers, liquids can usually be found which will dissolve the polymer completely to form a homogeneous solution, whereas cross-linked networks will only swell when in contact with compatible liquids.',
 'When an amorphous polymer is mixed with a suitable solvent, it disperses in the solvent and behaves as though it too is a liquid.',
 'In a good solvent, classed as one which is highly compatible with the polymer, the liquid-polymer interactions expand the polymer coil, from its unperturbed dimensions, in proportion to the extent of these interactions.',
 'In a ‘ poor ’ solvent, the interactions are fewer and coil expansion or perturbation is restricted.',
 'The fundamental thermodynamic equation used to describe these systems relates the Gibbs free energy function G to the enthalpy H and entropy S, i.e.

In [None]:
# expand abbreviations based on a predefined dictionary
# base table: contraction -> expanded form
_contractions = {
    "what's": "what is", "what're": "what are",
    "who's": "who is", "who're": "who are",
    "where's": "where is", "where're": "where are",
    "when's": "when is", "when're": "when are",
    "how's": "how is", "how're": "how are",
    "i'm": "i am", "we're": "we are", "you're": "you are", "they're": "they are",
    "it's": "it is", "he's": "he is", "she's": "she is", "that's": "that is",
    "there's": "there is", "there're": "there are",
    "i've": "i have", "we've": "we have", "you've": "you have", "they've": "they have",
    "who've": "who have", "would've": "would have", "not've": "not have",
    "i'll": "i will", "we'll": "we will", "you'll": "you will", "he'll": "he will",
    "she'll": "she will", "it'll": "it will", "they'll": "they will",
    "isn't": "is not", "wasn't": "was not", "aren't": "are not", "weren't": "were not",
    "can't": "can not", "couldn't": "could not", "don't": "do not", "didn't": "did not",
    "shouldn't": "should not", "wouldn't": "would not", "doesn't": "does not",
    "haven't": "have not", "hasn't": "has not", "hadn't": "had not", "won't": "will not",
}

# the full lexicon also holds every contraction with a blank after the
# apostrophe ("it' s", "don' t", ...); those entries follow the base ones
abbr_dict = dict(_contractions)
for _short_form, _long_form in _contractions.items():
    abbr_dict[_short_form.replace("'", "' ")] = _long_form

# this is just to be sure that the quotes will be those we have used in our abbreviation lexicon
quote_list = "‘’‛’❜'’`‘’"

# map every quote variant onto the plain apostrophe used by the lexicon keys
_plain_quote_table = str.maketrans(dict.fromkeys(quote_list, "'"))
abbr_corpus = [sentence.translate(_plain_quote_table) for sentence in text_corpus]

# expand abbreviations: each key is applied as a regular expression to every sentence
# NOTE(review): the keys are lowercase and the matching is case-sensitive, so
# capitalised forms such as "I'm" or "Don't" stay as they are -- confirm intended.
final_corpus_df = pd.DataFrame(abbr_corpus, columns=['Sentences'])
final_corpus_df = final_corpus_df.replace(abbr_dict, regex=True)
final_corpus = final_corpus_df['Sentences'].tolist()

In [None]:
# eyeball the first 500 sentences after quote normalisation and abbreviation expansion
final_corpus[:500]

['Polymers in Solution',
 'The interaction of long chain molecules with liquids is of considerable interest from both a practical and theoretical viewpoint.',
 'For linear and branched polymers, liquids can usually be found which will dissolve the polymer completely to form a homogeneous solution, whereas cross-linked networks will only swell when in contact with compatible liquids.',
 'When an amorphous polymer is mixed with a suitable solvent, it disperses in the solvent and behaves as though it too is a liquid.',
 'In a good solvent, classed as one which is highly compatible with the polymer, the liquid-polymer interactions expand the polymer coil, from its unperturbed dimensions, in proportion to the extent of these interactions.',
 "In a ' poor ' solvent, the interactions are fewer and coil expansion or perturbation is restricted.",
 'The fundamental thermodynamic equation used to describe these systems relates the Gibbs free energy function G to the enthalpy H and entropy S, i.e.

In [None]:
# drop repeated sentences; dict keys are unique and keep insertion order,
# so the first occurrence of every sentence survives in its original position
final_corpus_clean = [*dict.fromkeys(final_corpus)]

# corpus size before and after de-duplication
print(len(final_corpus), len(final_corpus_clean), sep="\n")

312297
265582


In [None]:
# persist the de-duplicated sentences for the later stages of the pipeline
with open('./data/misc/bnc_sentences_unparsed.pkl', 'wb') as pickle_file:
    pickle.dump(final_corpus_clean, pickle_file)