Stanford

In [1]:
# Three toy sentences that serve as the corpus for the cleaning steps below.
raw_docs = [
    "Here are some very simple basic sentences.",
    "They won't be very interesting, I'm afraid.",
    "The point of these examples is to _learn how basic text cleaning works_ on *very simple* data.",
]

In [2]:
from nltk.tokenize import word_tokenize

In [3]:
# Run NLTK's word tokenizer over every raw document, keeping document order.
tokenized_docs = list(map(word_tokenize, raw_docs))

In [4]:
# NLTK makes it easy to split documents-as-strings into lists of tokens,
# a process called tokenizing. Each inner list printed below holds the
# tokens of one document from raw_docs, in the same order.
print(tokenized_docs)

[['Here', 'are', 'some', 'very', 'simple', 'basic', 'sentences', '.'], ['They', 'wo', "n't", 'be', 'very', 'interesting', ',', 'I', "'m", 'afraid', '.'], ['The', 'point', 'of', 'these', 'examples', 'is', 'to', '_learn', 'how', 'basic', 'text', 'cleaning', 'works_', 'on', '*very', 'simple*', 'data', '.']]


removing punctuation

In [6]:
import re
import string
# Character class matching any single ASCII punctuation mark
# (see string.punctuation: https://docs.python.org/3/library/string.html#string.punctuation)
regex = re.compile('[%s]' % re.escape(string.punctuation))

# For every document: strip punctuation characters out of each token, then
# keep only the tokens that still contain something afterwards.
tokenized_docs_no_punctuation = [
    [stripped for stripped in (regex.sub(u'', token) for token in review) if stripped]
    for review in tokenized_docs
]

print(tokenized_docs_no_punctuation)

[['Here', 'are', 'some', 'very', 'simple', 'basic', 'sentences'], ['They', 'wo', 'nt', 'be', 'very', 'interesting', 'I', 'm', 'afraid'], ['The', 'point', 'of', 'these', 'examples', 'is', 'to', 'learn', 'how', 'basic', 'text', 'cleaning', 'works', 'on', 'very', 'simple', 'data']]


Cleaning text of stopwords

In [9]:
from nltk.corpus import stopwords

# Build the English stopword collection once. The original evaluated
# stopwords.words('english') for every single token and then scanned the
# resulting list; a set built up front gives the same membership answers
# without the repeated work.
english_stopwords = set(stopwords.words('english'))

# NOTE: the membership test is case-sensitive, so capitalized tokens such as
# 'The' or 'They' are kept (this matches the printed result).
tokenized_docs_no_stopwords = [
    [word for word in doc if word not in english_stopwords]
    for doc in tokenized_docs_no_punctuation
]

print(tokenized_docs_no_stopwords)

[['Here', 'simple', 'basic', 'sentences'], ['They', 'wo', 'nt', 'interesting', 'I', 'afraid'], ['The', 'point', 'examples', 'learn', 'basic', 'text', 'cleaning', 'works', 'simple', 'data']]


Stemming and Lemmatizing

In [13]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# Instantiate all three normalizers. Only the WordNet lemmatizer is applied
# below; to compare, swap wordnet.lemmatize for porter.stem or snowball.stem.
porter = PorterStemmer()
snowball = SnowballStemmer('english')
wordnet = WordNetLemmatizer()

# Note that lemmatize() can also take a part of speech as an argument.
preprocessed_docs = [
    [wordnet.lemmatize(word) for word in doc]
    for doc in tokenized_docs_no_stopwords
]

print(preprocessed_docs)

[['Here', 'simple', 'basic', 'sentence'], ['They', 'wo', 'nt', 'interesting', 'I', 'afraid'], ['The', 'point', 'example', 'learn', 'basic', 'text', 'cleaning', 'work', 'simple', 'data']]


Removing HTML entities and tags

In [20]:
import re, html

##
# Removes HTML or XML character references and entities from a text string.
#
# @param text The HTML (or XML) source text.
# @return The plain text, as a Unicode string, if necessary.
# AUTHOR: Fredrik Lundh

def unescape(text):
    """Replace HTML/XML character references and named entities in ``text``.

    Handles decimal references (``&#65;``), hexadecimal references
    (``&#x41;``) and named entities (``&quot;``). Anything that cannot be
    resolved -- an unknown entity name, malformed digits, or a number that is
    not a valid code point -- is left in the text exactly as it was.

    Args:
        text: The HTML (or XML) source text.

    Returns:
        The text with every resolvable reference/entity replaced by the
        character it stands for.
    """
    # Import the table explicitly instead of relying on ``import html``
    # happening to expose the ``html.entities`` submodule.
    from html.entities import name2codepoint

    def fixup(m):
        ref = m.group(0)
        try:
            if ref[:3] == "&#x":
                # hexadecimal character reference
                return chr(int(ref[3:-1], 16))
            if ref[:2] == "&#":
                # decimal character reference
                return chr(int(ref[2:-1]))
            # named entity
            return chr(name2codepoint[ref[1:-1]])
        except (ValueError, OverflowError, KeyError):
            # ValueError: bad digits or code point above 0x10FFFF;
            # OverflowError: number too large for chr() at all;
            # KeyError: unknown entity name.
            return ref  # leave as is

    # Raw string: "\w" in a normal string literal is an invalid escape sequence.
    return re.sub(r"&#?\w+;", fixup, text)

# Sample review snippet containing an HTML tag and &quot; entities.
test_string = "<p>While many of the stories tugged at the heartstrings, I never felt manipulated by the authors. (Note: Part of the reason why I don't like the &quot;Chicken Soup for the Soul&quot; series is that I feel that the authors are just dying to make the reader clutch for the box of tissues.)"

# Print the text as-is, then again with its entities unescaped.
for shown in (test_string, unescape(test_string)):
    print(shown)

<p>While many of the stories tugged at the heartstrings, I never felt manipulated by the authors. (Note: Part of the reason why I don't like the &quot;Chicken Soup for the Soul&quot; series is that I feel that the authors are just dying to make the reader clutch for the box of tissues.)
<p>While many of the stories tugged at the heartstrings, I never felt manipulated by the authors. (Note: Part of the reason why I don't like the "Chicken Soup for the Soul" series is that I feel that the authors are just dying to make the reader clutch for the box of tissues.)


In [33]:
from bs4 import BeautifulSoup as bs

# BeautifulSoup decodes entities itself while parsing, so it is given the raw
# markup. Unescaping first would turn literal text such as "&lt;b&gt;" into a
# real <b> tag, which get_text() would then drop from the result.
soup = bs(test_string, 'lxml')
soup.get_text()  # plain text (a Python 3 str): tags removed, entities decoded

'While many of the stories tugged at the heartstrings, I never felt manipulated by the authors. (Note: Part of the reason why I don\'t like the "Chicken Soup for the Soul" series is that I feel that the authors are just dying to make the reader clutch for the box of tissues.)'

Cleaning a Twitter tweet

In [49]:
# Deliberately messy sample tweet: HTML entities, a curly apostrophe, slang,
# run-together capitalized words, elongated words, an emoji and a URL.
original_tweet = (
    'I luv my &lt;3 iphone &amp; you’re awsm apple. '
    'DisplayIsAwesome, sooo happppppy 🙂 http://www.apple.com'
)

1. Unescape HTML characters

In [39]:
import html

In [40]:
# `import html` alone does not load the `html.parser` submodule, so on a fresh
# kernel `html.parser` raises AttributeError unless something else has already
# imported it. Import the submodule explicitly.
import html.parser

html_parser = html.parser.HTMLParser()

In [51]:
# HTMLParser.unescape() was deprecated and then removed in Python 3.9;
# html.unescape() is the supported way to turn entities such as &lt; and
# &amp; back into their characters.
tweet = html.unescape(original_tweet)

  """Entry point for launching an IPython kernel.


In [52]:
# Display the tweet after its HTML entities have been unescaped.
tweet

'I luv my <3 iphone & you’re awsm apple. DisplayIsAwesome, sooo happppppy 🙂 http://www.apple.com'

2. Decoding data

In [53]:
# Round-trip the text through UTF-8, then encode it as ASCII while dropping
# every character that has no ASCII form (e.g. the curly apostrophe and emoji).
# NOTE: the result is a bytes object, not a str.
utf8_text = tweet.encode().decode("utf8")
tweet = utf8_text.encode('ascii', 'ignore')
tweet

b'I luv my <3 iphone & youre awsm apple. DisplayIsAwesome, sooo happppppy  http://www.apple.com'

Split Attached Words

In [64]:
# Split run-together words such as "DisplayIsAwesome": cut the text into
# chunks that each begin at an uppercase ASCII letter, then re-join the chunks
# with single spaces.
# NOTE: any text before the first uppercase letter is not matched by the
# pattern and therefore would not appear in the result.
capital_chunks = re.findall('[A-Z][^A-Z]*', original_tweet)
cleaned = ' '.join(capital_chunks)
cleaned

'I luv my &lt;3 iphone &amp; you’re awsm apple.  Display Is Awesome, sooo happppppy 🙂 http://www.apple.com'

In [70]:
import itertools
#itertools.groupby() #https://docs.python.org/3/library/itertools.html#itertools.groupby

CLEANING TEXT FOR NATURAL LANGUAGE PROCESSING TASKS IN MACHINE LEARNING IN PYTHON

In [73]:
#http://ieva.rocks/2016/08/07/cleaning-text-for-nlp/

others

In [74]:
# Three short field reports used as a second example corpus.
incoming_reports = [
    "We are attacking on their left flank but are losing many men.",
    "We cannot see the enemy army. Nothing else to report.",
    "We are ready to attack but are waiting for your orders.",
]

In [75]:
# import word tokenizer
from nltk.tokenize import word_tokenize

# Tokenize each report in incoming_reports, keeping the reports in order
tokenized_reports = list(map(word_tokenize, incoming_reports))

# Show the resulting list of token lists
tokenized_reports

[['We',
  'are',
  'attacking',
  'on',
  'their',
  'left',
  'flank',
  'but',
  'are',
  'losing',
  'many',
  'men',
  '.'],
 ['We',
  'can',
  'not',
  'see',
  'the',
  'enemy',
  'army',
  '.',
  'Nothing',
  'else',
  'to',
  'report',
  '.'],
 ['We',
  'are',
  'ready',
  'to',
  'attack',
  'but',
  'are',
  'waiting',
  'for',
  'your',
  'orders',
  '.']]

In [76]:
# Import regex
import re

# Import string
import string


# Character class matching any single ASCII punctuation mark
# (see string.punctuation: https://docs.python.org/3/library/string.html#string.punctuation)
regex = re.compile('[%s]' % re.escape(string.punctuation))

tokenized_reports_no_punctuation = []

for report_tokens in tokenized_reports:
    # Remove punctuation characters from each token of this report...
    stripped_tokens = (regex.sub(u'', token) for token in report_tokens)
    # ...and keep only the tokens that are not empty afterwards.
    tokenized_reports_no_punctuation.append(
        [stripped for stripped in stripped_tokens if stripped]
    )

tokenized_reports_no_punctuation

[['We',
  'are',
  'attacking',
  'on',
  'their',
  'left',
  'flank',
  'but',
  'are',
  'losing',
  'many',
  'men'],
 ['We',
  'can',
  'not',
  'see',
  'the',
  'enemy',
  'army',
  'Nothing',
  'else',
  'to',
  'report'],
 ['We',
  'are',
  'ready',
  'to',
  'attack',
  'but',
  'are',
  'waiting',
  'for',
  'your',
  'orders']]

In [77]:
from nltk.corpus import stopwords

# Build the English stopword collection once. The original evaluated
# stopwords.words('english') for every single token and then scanned the
# resulting list; a set built up front gives the same membership answers
# without the repeated work.
english_stopwords = set(stopwords.words('english'))

# NOTE: the membership test is case-sensitive, so a capitalized token such as
# 'We' is kept (this matches the displayed result).
tokenized_reports_no_stopwords = [
    [word for word in report if word not in english_stopwords]
    for report in tokenized_reports_no_punctuation
]

tokenized_reports_no_stopwords

[['We', 'attacking', 'left', 'flank', 'losing', 'many', 'men'],
 ['We', 'see', 'enemy', 'army', 'Nothing', 'else', 'report'],
 ['We', 'ready', 'attack', 'waiting', 'orders']]