In [1]:
import numpy as np
import multiprocessing as mp

import string
import spacy 
import nltk
import en_core_web_sm
from nltk.tokenize import word_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.base import TransformerMixin, BaseEstimator
from normalise import normalise
from pathlib import Path
import pandas as pd
import numpy as np
import re
import unicodedata
from autocorrect import Speller

tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')

nlp = en_core_web_sm.load(disable = ['ner', 'parser'])
nlp.max_length = 10000000

In [None]:
#import nltk
#for dependency in ("brown", "names", "wordnet", "averaged_perceptron_tagger", "universal_tagset"):
    #nltk.download(dependency)

In [None]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self,
                 variety="BrE",
                 user_abbrevs={},
                 n_jobs=1):
        """
        Text preprocessing transformer includes steps:
            1. Text normalization
            2. Punctuation removal
            3. Stop words removal
            4. Lemmatization
        
        variety - format of date (AmE - american type, BrE - british format) 
        user_abbrevs - dict of user abbreviations mappings (from normalise package)
        n_jobs - parallel jobs to run
        """
        self.variety = variety
        self.user_abbrevs = user_abbrevs
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        return self

    def transform(self, X, *_):
        X_copy = X.copy()

        partitions = 1
        cores = mp.cpu_count()
        if self.n_jobs <= -1:
            partitions = cores
        elif self.n_jobs <= 0:
            return X_copy.apply(self._preprocess_text)
        else:
            partitions = min(self.n_jobs, cores)

        data_split = np.array_split(X_copy, partitions)
        pool = mp.Pool(cores)
        data = pd.concat(pool.map(self._preprocess_part, data_split))
        pool.close()
        pool.join()

        return data

    def _preprocess_part(self, part):
        return part.apply(self._preprocess_text)

    def _preprocess_text(self, text):
        normalized_text = self._normalize(text)
        doc = nlp(normalized_text)
        removed_punct = self._remove_punct(doc)
        removed_stop_words = self._remove_stop_words(removed_punct)
        return self._lemmatize(removed_stop_words)

    def _normalize(self, text):
        # some issues in normalise package
        try:
            return ' '.join(normalise(text, variety=self.variety, user_abbrevs=self.user_abbrevs, verbose=True))
        except:
            return text

    def _remove_punct(self, doc):
        return [t for t in doc if t.text not in string.punctuation]

    def _remove_stop_words(self, doc):
        return [t for t in doc if not t.is_stop]

    def _lemmatize(self, doc):
        return ' '.join([t.lemma_ for t in doc])

In [None]:
# eg implementation
text = TextPreprocessor(n_jobs=-1).transform(df_bbc['text'])

In [2]:
with open(Path(r'D:\Python ML\Envelope-key-words\Data\Raw\Envelopes\ENV10624.pdf-textract-text.txt'), 'r') as f:
    ENV10624 = f.read()

In [None]:
print(ENV10624)

In [None]:
#ENV10624_processed = TextPreprocessor(n_jobs=-1).transform(ENV10624_tokens)

In [3]:
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

In [4]:
def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    text = ''.join([c for c in text if c not in string.punctuation])
    return text

In [5]:
# text normaliser very slow for large docs
def normalise_text(text, variety='BrE', user_abbrevs={}, verbose=False):
        #tokens = word_tokenize(text)
        #tokens = [token.strip() for token in tokens]
        # some issues in normalise package
        nt = normalise(text, variety=variety, user_abbrevs=user_abbrevs, verbose=verbose)
        
        normalised_text = ' '.join(nt)
        return normalised_text

In [6]:
def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

In [7]:
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

In [8]:
def remove_extra_whitespace_tabs(text):
    #pattern = r'^\s+$|\s+$'
    pattern = r'^\s*|\s\s*'
    return re.sub(pattern, ' ', text).strip()

In [9]:
def spell_check(text):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    spell = Speller()
    corrected_text = [spell(token) for token in tokens]
    return ' '.join(corrected_text)

In [10]:
def remove_single_chatacters(text):
    pattern = r'\b[a-zA-Z]\b'
    return re.sub(pattern, '', text)

In [14]:
def normalize_corpus(corpus, html_stripping=True, normalisation=True,
                     accented_char_removal=True, text_lower_case=True, 
                     text_lemmatization=True, special_char_removal=True, 
                     stopword_removal=True, remove_digits=True):
    
    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # expand contractions    
        if normalisation:
            doc = normalise(doc)
        # lowercase the text    
        if text_lower_case:
            doc = doc.lower()
        # remove extra newlines
        doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and\or digits    
        if special_char_removal:
            # insert spaces between special characters to isolate them    
            special_char_pattern = re.compile(r'([{.(-)!}])')
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)  
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=True)
            
        normalized_corpus.append(doc)
        
    return normalized_corpus

In [None]:
ENV10624_processed = normalize_corpus(ENV10624, html_stripping=False, normalisation=False, remove_digits=False)

In [None]:
def preprocess_doc(doc: str, accented_char_removal=True, text_lower_case=True, check_spelling=True,
                     text_lemmatization=True, special_char_removal=True, 
                     stopword_removal=True, remove_single_char=True, remove_digits=True):
    """ 
    Text document preprocessor for cleaning and normalising input text for NLP processes.

    Text preprocessing includes steps for:
            1. convert to utf-8 and remove accented characters
            2. lower case all characters
            3. check word spelling (using autocorrect) - WARNING slow for large documents 
            4. remove extra new lines, whitespace and tabs
            5. lemmatization (using SpaCy)
            6. remove special characters and punctuation
            7. remove all digits
            8. remove single characters
            9. remove stopwords (uses NLTK stopword list)

    Parameters
    -------------------
        doc (str): Input text as a string
        accented_char_removal (bool): default = True, active if true
        text_lower_case (bool): default = True, active if true
        check_spelling (bool): default = True, active if true
        text_lemmatization (bool): default = True, active if true
        special_char_removal (bool): default = True, active if true
        stopword_removal (bool): default = True, active if true
        remove single characters (bool): default = True, active if true
        remove_digits (bool): default = True, active if true
    
    Returns
    ------------------
        doc (str): processed document

    """
    if accented_char_removal:
        doc = remove_accented_chars(doc)
    if text_lower_case:
        doc = doc.lower()
    # remove extra new lines
    doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
    if check_spelling:
        doc = spell_check(doc)
    if text_lemmatization:
        doc = lemmatize_text(doc)
    if special_char_removal:
        special_char_pattern = re.compile(r'([{.(-)!}])')
        doc = special_char_pattern.sub(" \\1 ", doc)
        doc = remove_special_characters(doc, remove_digits = remove_digits)
    if remove_single_char:
        doc = remove_single_chatacters(doc)
    if stopword_removal:
        doc = remove_stopwords
    doc = remove_extra_whitespace_tabs(doc)

    return doc




In [11]:
user_abbrevs={'SA':'South Australia', 'S.A.':'South Australia','DMITRE':'Department of Munufacturing Investment Trade Resources and Energy', 'PIRSA': 'Primary Industries and Resources South Australia', 'JV': 'Joint Venture'}

doc1 = remove_accented_chars(ENV10624)
#doc2 = normalise_text(doc1, user_abbrevs=user_abbrevs)
doc3 = doc1.lower()
doc4 = re.sub(r'[\r|\n|\r\n]+', ' ',doc3)
doc5 = lemmatize_text(doc4)
special_char_pattern = re.compile(r'([{.(-)!}])')
doc6 = special_char_pattern.sub(" \\1 ", doc5)
doc6 = remove_special_characters(doc6, remove_digits=True)
doc7 = re.sub(' +', ' ', doc6)
doc8 = remove_stopwords(doc7, is_lower_case=True)

In [12]:
print(doc1[:2000])
print('\n')
print(doc3[:2000])
print('\n')
print(doc4[:2000])
print('\n')
print(doc5[:2000])
print('\n')
print(doc6[:2000])
print('\n')
print(doc7[:2000])
print('\n')
print(doc8[:3000])

Oren File Envelope
No. 10,624
THE EYRE PENINSULA JV PROJECT
JOINT ANNUAL REPORTS FOR THE PERIOD
19/9/200? TO 28/1?/2013
Submitted by
Adelaide Exploration Ltd and Quasar Resources Pty Ltd
2009
C 30/11/2014
Enquiries:
Customer Services
This report was supplied as part of the requirement to hold a mineral or
Resources and Energy Group
petroleum exploration tenement in the State of South Australia.
7th Floor
DMITRE accepts no responsibility for statements made,
or conclusions drawn, in the report or for the quality of text or drawings.
101 Grenfell Street, Adelaide 5000
This report is subject to copyright. Apart from fair dealing for the purposes of
study, research, criticism or review as permitted under the Copyright Act,
no part may be reproduced without written permission of
Telephone (08) 8463 3000
the Executive Director of the DMITRE Resources and Energy Group,
Facsimile: (08) 8204 1880
GPO Box 1264, Adelaide, SA 5001.
Government of South Australia
Department for Manufacturing,
Innova

In [23]:
doc9 = spell_check(doc3)

In [25]:
print(doc9[:3000])

open file envelope no. 10,624 the eyre peninsula j project joint annual reports for the period 19/9/200 ? to 28/1?/2013 submitted by adelaide exploration ltd and quasar resources pty ltd 2009 c 30/11/2014 enquiries : customer services this report was supplied as part of the requirement to hold a mineral or resources and energy group petroleum exploration tenement in the state of south australia. 7th floor mitre accepts no responsibility for statements made , or conclusions drawn , in the report or for the quality of text or drawings. 101 refell street , adelaide 5000 this report is subject to copyright. apart from fair dealing for the purposes of study , research , criticism or review as permitted under the copyright act , no part may be reproduced without written permission of telephone ( 08 ) 8463 3000 the executive director of the mitre resources and energy group , facsimile : ( 08 ) 8204 1880 go box 1264 , adelaide , sa 5001. government of south australia department for manufacturi

In [27]:
doc10 = nlp(doc3)

In [29]:
for token in doc10[100:500]:
    print(token.text, token.pos_)

for ADP
the DET
quality NOUN
of ADP
text NOUN
or CCONJ
drawings NOUN
. PUNCT

 SPACE
101 NUM
grenfell PROPN
street PROPN
, PUNCT
adelaide VERB
5000 NUM

 SPACE
this DET
report NOUN
is AUX
subject ADJ
to ADP
copyright NOUN
. PUNCT
apart ADV
from ADP
fair ADJ
dealing NOUN
for ADP
the DET
purposes NOUN
of ADP

 SPACE
study NOUN
, PUNCT
research NOUN
, PUNCT
criticism NOUN
or CCONJ
review NOUN
as SCONJ
permitted VERB
under ADP
the DET
copyright NOUN
act NOUN
, PUNCT

 SPACE
no DET
part NOUN
may VERB
be AUX
reproduced VERB
without ADP
written VERB
permission NOUN
of ADP

 SPACE
telephone NOUN
( PUNCT
08 NUM
) PUNCT
8463 NUM
3000 NUM

 SPACE
the DET
executive ADJ
director NOUN
of ADP
the DET
dmitre PROPN
resources NOUN
and CCONJ
energy NOUN
group NOUN
, PUNCT

 SPACE
facsimile NOUN
: PUNCT
( PUNCT
08 NUM
) PUNCT
8204 NUM
1880 NUM

 SPACE
gpo PROPN
box PROPN
1264 NUM
, PUNCT
adelaide PROPN
, PUNCT
sa PROPN
5001 NUM
. PUNCT

 SPACE
government NOUN
of ADP
south PROPN
australia PROPN

 SPACE
dep

In [31]:
def get_pos(text):
    text = nltk.word_tokenize(text)
    pos_string = nltk.pos_tag(text)
    return pos_string

In [32]:
doc11 = get_pos(doc8)
print(doc11[100:500])

[('approx', 'NN'), ('el', 'NN'), ('type', 'JJ'), ('mineral', 'JJ'), ('locality', 'NN'), ('wudinna', 'NN'), ('hill', 'JJ'), ('area', 'NN'), ('approximately', 'RB'), ('km', 'JJ'), ('ese', 'JJ'), ('streaky', 'NN'), ('bay', 'NN'), ('date', 'NN'), ('grant', 'NN'), ('november', 'NN'), ('date', 'NN'), ('expire', 'NN'), ('november', 'NN'), ('applicant', 'JJ'), ('adelaide', 'JJ'), ('resources', 'NNS'), ('n', 'JJ'), ('ltd', 'VBP'), ('file', 'JJ'), ('ref', 'NN'), ('mapsheet', 'NN'), ('yardea', 'NN'), ('kimba', 'NN'), ('wudinna', 'NN'), ('hili', 'VBZ'), ('scale', 'JJ'), ('broadacre', 'NN'), ('cboc', 'NN'), ('metre', 'NN'), ('schedule', 'NN'), ('p', 'NN'), ('charbah', 'NN'), ('lil', 'NN'), ('neliran', 'NN'), ('nukey', 'JJ'), ('bl', 'NN'), ('mqunt', 'NN'), ('fairvhiew', 'NN'), ('mountd', 'NN'), ('waulonna', 'NN'), ('hill', 'NN'), ('pinkawil', 'VBZ'), ('udinnah', 'JJ'), ('broaoacrh', 'NN'), ('broa', 'NN'), ('res', 'VBZ'), ('ancuita', 'JJ'), ('var', 'NN'), ('wariianboo', 'NN'), ('nanuma', 'JJ'), ('nan

In [33]:
print(doc11[0:100])

[('oren', 'NNS'), ('file', 'VBP'), ('envelope', 'NN'), ('eyre', 'NN'), ('peninsula', 'NN'), ('jv', 'NN'), ('project', 'NN'), ('joint', 'JJ'), ('annual', 'JJ'), ('report', 'NN'), ('period', 'NN'), ('submit', 'VBD'), ('adelaide', 'JJ'), ('exploration', 'NN'), ('ltd', 'NN'), ('quasar', 'JJ'), ('resources', 'NNS'), ('pty', 'VBP'), ('ltd', 'JJ'), ('c', 'NN'), ('enquiry', 'NN'), ('customer', 'NN'), ('service', 'NN'), ('report', 'NN'), ('supply', 'VB'), ('part', 'NN'), ('requirement', 'NN'), ('hold', 'VBP'), ('mineral', 'JJ'), ('resource', 'NN'), ('energy', 'NN'), ('group', 'NN'), ('petroleum', 'NN'), ('exploration', 'NN'), ('tenement', 'NN'), ('state', 'NN'), ('south', 'JJ'), ('australia', 'JJ'), ('th', 'NN'), ('floor', 'NN'), ('dmitre', 'NN'), ('accept', 'IN'), ('responsibility', 'NN'), ('statement', 'NN'), ('make', 'VBP'), ('conclusion', 'NN'), ('draw', 'VB'), ('report', 'NN'), ('quality', 'NN'), ('text', 'IN'), ('drawing', 'VBG'), ('grenfell', 'JJ'), ('street', 'NN'), ('adelaide', 'JJ'), 

In [43]:
len(doc11)

260908

In [13]:
def get_pos_spell_check(text):
    text = nltk.word_tokenize(text)
    pos_string = nltk.pos_tag(text)
    checked = ''
    for x in range(len(pos_string)):
        if pos_string[x][1] != 'NN':
            check = spell_check(pos_string[x][0])
            checked += check + ' '
        else:
            check = pos_string[x][0]
            checked += check + ' '
            
    return checked


In [3]:
import time

In [18]:
start = time.time()
doc12 = get_pos_spell_check(doc8[5000:8000])


In [19]:
end = time.time()
print(end-start)

81.96751523017883


In [20]:
print(doc12)



ay wirrulla ill cungena schedule kilometre da qeooen ooentrodarua odarua australia area km approx el gawler range np applicant adelaide exploration ltd eye enel etd type mineral mapsheet yardea locality pinkawillinie area approximately km northwest kimba date grant aug date expire aug tr pinkawillinie cp licence grant datum agd n file ref ahill na na hill hill kilometre expire corrobinn e hill kdonglawa scale schedule belt hill moon kilometre gda datus australia area km approx bef mounebiday applicant adelaide exploration ltd tipsheets gardener yarder locality lake acreman area approximately km northeast streaky bay date grant september date expire september el f waurea hill ex ld application lodge datum agd mountr sam palthrubie hill mount st mungo scale type mineral lakeeverard yarna hill old yarna kilometre file ref ltaba bamid schedule kilometre da qecce qeccenthcdatum tut australia area km approx el applicant adelaide exploration ltd type mineral tipsheets yarder locality thurlga 

In [21]:
import pkg_resources
from symspellpy import SymSpell, Verbosity

In [22]:
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
bigram_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
# term_index is the column of the term and count_index is the
# column of the term frequency
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

True

In [23]:
start = time.time()
suggestions = sym_spell.lookup_compound(doc8[5000:8000], max_edit_distance=2, ignore_non_words=True, transfer_casing=True, ignore_term_with_digits=True)

In [24]:
end = time.time()
print(end-start)

4.153019905090332


In [25]:
for sug in suggestions:
    print(sug)

a wirra la ill pungent schedule kilometre ada reopen open rotorua darla australia area pm approx al gawker range no applicant adelaide exploration ltd eye eel end type mineral map sheet yard a locality pink illinois area approximately pm northwest simba date grant aug date expire aug to pink illinois up licence grant datum and a file ref a hill a a hill hill kilometre expire corrosion a hill dongle a scale schedule belt hill moon kilometre ada dates australia area pm approx be money day applicant adelaide exploration ltd map sheets gardner yard a locality lake caravan area approximately pm northeast streaky bay date grant september date expire september elf area hill eld application lodge datum and mount sam path be hill mount st mango scale type mineral lake gerard yarn hill old yarn kilometre file ref lab amid schedule kilometre ada lecce recent datum tut australia area pm approx al applicant adelaide exploration ltd type mineral map sheets yard a locality thurgau ramp area approxima

In [4]:
from helper.text_preprocessor import preprocess_doc

In [5]:
start = time.time()
processed_doc = preprocess_doc(ENV10624, check_spelling=False)

In [6]:
end = time.time()
print(end-start)

28.5401930809021


In [7]:
print(processed_doc)

igure monax waddikee graphite prospect el courtesy monax mining limited balumbah pirie el el lincoln wilclo cut snake argent lacroma wilclo south ridgestone jamieson tank el lincoln ar buckleboo hs el adres moongi buckleboo hall kimboo woollinie pinkawillinie el investigator el grant figure exploration tenements buckleboo northern eyre peninsula el trafford wattle grove glencoe el monax el application el mincor kilometre buckleboo ela el adre ogs el trafford el investigator ar lincoln kalindi el trafford view point lincoln gap simmens hill el ml mlo ml ml ml memu eml eml eml eml whyalla eight mile creek beach el grant figure exploration licence strategic energy resource north whyalla eml tassie hill sunset hill el strategic ml ml ml ml el application rocky point chinaman creek yatala harbour station hill backy point fitzgerald bay point low kilometre eml roopena eml eml whyalla tregalana park eml mount young eml el mambray creek ar eml eml mount wild dog hill el eml mpl el strategic em