In [64]:
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from Trie import Trie

In [12]:
# Location of the sample document used by the cells below.
filename = 'data/text_example.txt'

# Read the whole file into a single string.
with open(filename, 'rt') as handler:
    text = handler.read()

# Shared lookup data for the cleaning steps: the punctuation characters
# from the standard library and NLTK's English stop-word list.
punc = string.punctuation
stop_words = stopwords.words('english')

Remove special patterns

In [67]:
def clean_special_patterns(text):
    """Strip URLs, e-mail addresses and date-like digit groups from ``text``.

    Substitutions run in a fixed order: URLs first, then e-mail addresses,
    then dates. Every match is replaced by the empty string, so the
    surrounding whitespace is left as it was.

    The date pattern is intentionally loose: two or three groups of 2-4
    digits, optionally separated by spaces, dots, slashes, colons or hyphens.
    As a consequence any run of four or more digits is removed as well.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        ``text`` with all matches removed.
    """
    url_regex = re.compile(r"(http|www)[^\s]+")
    email_regex = re.compile(r"[\w.-]+@[\w.-]+")
    # The separator set is spelled out with an escaped hyphen. An unescaped
    # "-" between " " and "/" forms a character range that also swallows
    # ! " # $ % & ' ( ) * + , and would delete things like "10#20".
    # "." stays in the set so dotted dates such as 23.12.2020 still match.
    date_regex = re.compile(r"[\d]{2,4}[ ./:\-]*[\d]{2,4}([ ./:\-]*[\d]{2,4})?")

    for pattern in (url_regex, email_regex, date_regex):
        text = pattern.sub("", text)
    return text

# Small sample covering each pattern the cleaner targets: a bare "www" URL,
# an "http" URL, an e-mail address and a date embedded in a reference code.
s = """Applications: 
www.aa.frdfaunefehofer.de/defe/referfefenzenefe/afeda-cenfeter.html 
http://www.ifefis.fe.com
email: fowjfoj@fwjofj.djfow
Kennziffer: IIS-2020-12-23
Bewerbungsfrist:
"""
# strip() must be *called*; printing the bare attribute would show the
# bound-method repr instead of the cleaned text.
print(clean_special_patterns(s).strip())

Applications: 
 

email: 
Kennziffer: IIS-
Bewerbungsfrist:


In [52]:
def make_regex(input_list):
    """Compile a case-insensitive, word-bounded regex from a list of words.

    Each item of ``input_list`` is added to a fresh ``Trie``. The pattern
    string the trie returns is wrapped in word-boundary anchors on both
    sides and compiled with ``re.IGNORECASE``.

    NOTE(review): presumably ``Trie.pattern()`` yields an alternation that
    matches any of the added words - confirm against the Trie module.
    """
    trie = Trie()
    for word in input_list:
        trie.add(word)
    bounded_pattern = r"\b" + trie.pattern() + r"\b"
    return re.compile(bounded_pattern, re.IGNORECASE)

In [70]:
def clean_stopwords(text):
    """Return ``text`` with every stop word deleted.

    The regex is rebuilt on each call by ``make_regex`` from the
    module-level ``stop_words`` list. Matches are replaced by the empty
    string, so the whitespace around a removed word is left in place.
    """
    return make_regex(stop_words).sub("", text)
    
def clean_punct(text):
    """Return ``text`` with every ``string.punctuation`` character removed."""
    # Escape the punctuation so characters like "]" or "^" are taken
    # literally inside the character class.
    punct_class = '[%s]' % re.escape(string.punctuation)
    return re.sub(punct_class, "", text)

Tokenize

In [23]:
# Tokenize the loaded document with NLTK's word_tokenize.
tokens = word_tokenize(text)

Remove punctuation

In [32]:
# Keep only purely alphabetic tokens, lowercased; this drops punctuation
# and any token containing digits or symbols.
words = [token.lower() for token in tokens if token.isalpha()]

Remove stop words

In [41]:
# English stop-word list from the NLTK corpus.
stop_words = stopwords.words('english')
# Keep only the words that are not in the stop-word list.
words = [w for w in words if w not in stop_words]

An alternative way to remove punctuation and stop words is to build a Trie-based regex from them and strip the matches from the whole text, with no need to tokenize first. [Check it here].

Stemming

In [35]:
# Reduce every remaining word to its stem with the Porter stemmer.
porter = PorterStemmer()
stemmed_words = list(map(porter.stem, words))

Text cleaning pipeline

In [40]:
def clean_text(text):
    """Clean raw text and return its stemmed content words.

    Pipeline:
      1. remove special patterns (URL, e-mail, date) with
         ``clean_special_patterns``
      2. tokenize with ``word_tokenize`` and keep only purely alphabetic
         tokens, lowercased (this drops punctuation and numbers)
      3. drop every word found in the module-level ``stop_words`` list
      4. stem the remaining words with the module-level ``porter`` stemmer

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    list
        Stemmed words, in their order of appearance.
    """
    cleaned = clean_special_patterns(text)
    # Tokenize the cleaned string, not the raw input, so step 1 takes effect.
    tokens = word_tokenize(cleaned)
    words = [token.lower() for token in tokens if token.isalpha()]
    # A set gives constant-time membership tests for the stop-word filter.
    stop_set = set(stop_words)
    return [porter.stem(word) for word in words if word not in stop_set]