In [None]:
#default_exp doc_cleaner

In [None]:
#hide
from nbdev.showdoc import *

# Doc Cleaner
Removes noise (stopwords, very short tokens, punctuation, digits, etc.) from webpage content.

In [None]:
#hide
#export
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from string import punctuation, digits

# Make sure the NLTK 'stopwords' corpus is available locally; it is read by
# `stopwords.words('english')` when the stop word set is built.
nltk.download('stopwords')

In [None]:
#hide
#export
def remove_puncts(lines):
    """Return `lines` with every `string.punctuation` character replaced by a space."""
    blanks = ' ' * len(punctuation)
    return lines.translate(str.maketrans(punctuation, blanks))


def remove_digits(lines):
    """Return `lines` with every `string.digits` character deleted."""
    return lines.translate(str.maketrans('', '', digits))


# English stop word list, held as a set for fast membership tests.
stop_words = set(stopwords.words('english'))
# Shared stemmer instance reused for every token.
stemmer = PorterStemmer()


def filter_by_length(token):
    """Return True when `token` has at least 3 characters and is not in `stop_words`."""
    if len(token) < 3:
        return False
    return token not in stop_words

In [None]:
#hide
#export
def clean_text(doc_lines):
    """
    Removes punctuations and digits from document and returns tokens

    Parameters
    ----------
    doc_lines : str
        Raw text of the document.

    Returns
    -------
    list of str
        Whitespace-separated tokens left after punctuation has been replaced
        by spaces and digits have been deleted.
    """
    # Punctuation becomes spaces so that words joined by it (e.g. "a,b") split apart.
    doc_lines = remove_puncts(doc_lines)
    # Digits are deleted outright. The result must be assigned back to `doc_lines`;
    # binding it to a local named `remove_digits` would shadow the helper and
    # leave the digits in place.
    doc_lines = remove_digits(doc_lines)
    return doc_lines.split()

In [None]:
show_doc(clean_text)

<h4 id="clean_text" class="doc_header"><code>clean_text</code><a href="__main__.py#L3" class="source_link" style="float:right">[source]</a></h4>

> <code>clean_text</code>(**`doc_lines`**)

Removes punctuations and digits from document and returns tokens

In [None]:
#hide
#export
def clean_tokens(tokens):
    """
    Removes stopwords, tokens with length less than 3 characters and stems them
    PorterStemmer is used for stemming

    Parameters
    ----------
    tokens : iterable of str
        Tokens to clean, e.g. the output of `clean_text`.

    Returns
    -------
    list of str
        Stemmed tokens that are at least 3 characters long and are not stopwords.
    """
    # Membership must be tested against the `stop_words` set, not the imported
    # `stopwords` corpus reader, which is not a collection of words.
    stemmed = [stemmer.stem(t) for t in tokens if t not in stop_words]
    # Second pass runs on the stems: drops stems shorter than 3 characters and
    # any stem that is itself a stopword.
    return [t for t in stemmed if filter_by_length(t)]

In [None]:
show_doc(clean_tokens)

<h4 id="clean_tokens" class="doc_header"><code>clean_tokens</code><a href="__main__.py#L3" class="source_link" style="float:right">[source]</a></h4>

> <code>clean_tokens</code>(**`tokens`**)

Removes stopwords, tokens with length less than 3 characters and stems them
PorterStemmer is used for stemming

In [None]:
#hide
#export
def clean(doc):
    """
    Runs the full cleaning pipeline on a document.

    `doc` is first tokenised by `clean_text`; the resulting tokens are then
    passed through `clean_tokens`, whose return value is returned unchanged.
    """
    return clean_tokens(clean_text(doc))

In [None]:
show_doc(clean)

<h4 id="clean" class="doc_header"><code>clean</code><a href="__main__.py#L3" class="source_link" style="float:right">[source]</a></h4>

> <code>clean</code>(**`doc`**)

Cleans a document by removing punctuations, digits, stopwords, tokens shorter than 3 characters