In [11]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [12]:
# Read the whole example document into memory as one string.
# No encoding is passed, so the platform default is used for decoding.
filename = 'data/text_example.txt'
with open(filename, mode='rt') as text_file:
    text = text_file.read()

Remove special patterns

In [3]:
## some regex samples
# E-mail-like tokens: word characters, dots or hyphens on both sides of "@".
email_regex = re.compile(r"[\w.-]+@[\w.-]+")
# Anything starting with "http" or "www", up to the next whitespace.
url_regex = re.compile(r"(http|www)[^\s]+")
# A way to match dates: two or three groups of 2-4 digits joined by optional
# separators. The separators are listed explicitly (space, comma, dot, slash,
# colon, hyphen) with "-" placed last so it is a literal. The previous class
# "[ -/:]" was parsed as the range space..slash and therefore also accepted
# characters such as "+", "%", "(" and "#" between digit groups.
# NOTE: runs of four or more digits with no separator still match by design.
date_regex = re.compile(r"\d{2,4}[ ,./:-]*\d{2,4}([ ,./:-]*\d{2,4})?")

In [5]:
def clean_special_patterns(s):
    """Strip URLs, e-mail addresses and date-like runs from ``s``.

    The notebook-level patterns are applied in a fixed order (URLs, then
    e-mail addresses, then dates); every match is replaced by the empty
    string.

    Parameters
    ----------
    s : str
        Text to clean.

    Returns
    -------
    str
        ``s`` with all matches of the three patterns removed.
    """
    for pattern in (url_regex, email_regex, date_regex):
        s = pattern.sub("", s)
    return s

# Small sample containing a bare "www" URL, an "http" URL, an e-mail address
# and a reference number that embeds a date, to exercise each pattern.
s = """Applications: 
www.aa.frdfaunefehofer.de/defe/referfefenzenefe/afeda-cenfeter.html
http://www.ifefis.fe.com
email: fowjfoj@fwjofj.djfow
Kennziffer: IIS-2020-12-23
Bewerbungsfrist:
"""
# Last expression of the cell, so the cleaned string is shown as the output.
clean_special_patterns(s)

'Applications: \n\n\nemail: \nKennziffer: IIS-\nBewerbungsfrist:\n'

Tokenize

In [23]:
# Split the loaded text into tokens with NLTK's word tokenizer.
tokens = word_tokenize(text)

Remove punctuation

In [32]:
# Keep only purely alphabetic tokens and lower-case them.
words = [token.lower() for token in tokens if token.isalpha()]

Remove stop words

In [41]:
# English stop-word list from NLTK; kept as a list under its original name so
# other cells that use `stop_words` are unaffected.
stop_words = stopwords.words('english')
# Set copy for constant-time membership tests instead of scanning the list
# once per word.
stop_word_set = set(stop_words)
words = [word for word in words if word not in stop_word_set]

An alternative way to remove punctuation and stop words is to build a Trie-based regex from them and strip them from the whole text directly, with no need to tokenize first. [Check it here].

Stemming

In [35]:
# Reduce each remaining word to its Porter stem.
porter = PorterStemmer()
stemmed_words = list(map(porter.stem, words))

Text cleaning pipeline

In [40]:
def clean_text(text):
    """Clean raw text and return its stemmed content words.

    Steps
    -----
    1. ``clean_special_patterns``: strip URLs, e-mail addresses, dates, etc.
    2. Tokenize; keep alphabetic tokens only (drops punctuation); lower-case.
    3. Remove stop words.
    4. Stem the remaining words.

    Uses the notebook-level ``stop_words`` and ``porter`` objects, which must
    be defined before this function is called.

    output
    --------
    list: stemmed words
    """
    # Tokenize the *cleaned* string. Previously the cleaned result was stored
    # but the raw `text` was tokenized, so the pattern removal had no effect.
    cleaned = clean_special_patterns(text)
    tokens = word_tokenize(cleaned)
    words = [token.lower() for token in tokens if token.isalpha()]
    # Build the lookup set once per call; works whether `stop_words` is a
    # list or already a set.
    stop_set = set(stop_words)
    words = [word for word in words if word not in stop_set]
    return [porter.stem(word) for word in words]