# Chapter 6: Handling Text

In [1]:
import re
from bs4 import BeautifulSoup
import unicodedata
import sys
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import pos_tag
from nltk import word_tokenize
from sklearn.preprocessing import MultiLabelBinarizer
from nltk.corpus import brown
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

## 6.1 Cleaning Text

#### Create text

In [2]:
textData = ["   Interrobang. By Aishwarya Henriette     ",
             "Parking And Going. By Karl Gautier",
             "    Today Is The night. By Jarek Prakash   "]

In [3]:
textData

['   Interrobang. By Aishwarya Henriette     ',
 'Parking And Going. By Karl Gautier',
 '    Today Is The night. By Jarek Prakash   ']

#### Strip whitespace

In [4]:
stripWhitespace = [string.strip() for string in textData]

In [5]:
stripWhitespace

['Interrobang. By Aishwarya Henriette',
 'Parking And Going. By Karl Gautier',
 'Today Is The night. By Jarek Prakash']

#### Remove periods

In [6]:
removePeriods = [string.replace(".", "") for string in stripWhitespace]

In [7]:
removePeriods

['Interrobang By Aishwarya Henriette',
 'Parking And Going By Karl Gautier',
 'Today Is The night By Jarek Prakash']

#### Create a custom transformation function

In [8]:
def capitalizer(string: str) -> str:
    """Return a copy of ``string`` with all cased characters upper-cased."""
    upper_cased = string.upper()
    return upper_cased

#### Apply function

In [9]:
[capitalizer(string) for string in removePeriods]

['INTERROBANG BY AISHWARYA HENRIETTE',
 'PARKING AND GOING BY KARL GAUTIER',
 'TODAY IS THE NIGHT BY JAREK PRAKASH']

#### Use regex for powerful string operations

In [10]:
import re

In [11]:
def replaceLettersWithX(string: str) -> str:
    """Return ``string`` with each ASCII letter (a-z, A-Z) replaced by "X".

    Characters that the pattern does not match (digits, spaces, punctuation,
    non-ASCII letters) are left untouched.
    """
    ascii_letter = re.compile(r"[a-zA-Z]")
    return ascii_letter.sub("X", string)

#### Apply function

In [12]:
[replaceLettersWithX(string) for string in removePeriods]

['XXXXXXXXXXX XX XXXXXXXXX XXXXXXXXX',
 'XXXXXXX XXX XXXXX XX XXXX XXXXXXX',
 'XXXXX XX XXX XXXXX XX XXXXX XXXXXXX']

## 6.2 Parsing and Cleaning HTML


In [13]:
from bs4 import BeautifulSoup

#### Create some HTML code

In [14]:
html = """
       <div class='full_name'><span style='font-weight:bold'>
       Masego</span> Azra</div>"
       """

#### Parse html

In [15]:
soup = BeautifulSoup(html, "lxml")

In [16]:
soup

<html><body><div class="full_name"><span style="font-weight:bold">
       Masego</span> Azra</div>"
       </body></html>

#### Find the div with the class "full_name", show text

In [17]:
soup.find("div", { "class" : "full_name"}).text

'\n       Masego Azra'

## 6.3 Removing Punctuation

In [18]:
import unicodedata
import sys

#### Create text

In [19]:
text_data = ['Hi!!!! I. Love. This. Song....',
             '10000% Agree!!!! #LoveIT',
             'Right?!?!']

In [20]:
text_data

['Hi!!!! I. Love. This. Song....', '10000% Agree!!!! #LoveIT', 'Right?!?!']

#### Create a dictionary of punctuation characters

In [21]:
# Map every Unicode code point whose general category starts with "P"
# (punctuation) to None, which is the form str.translate() uses to delete
# characters.
# range() excludes its stop value and sys.maxunicode is itself a valid code
# point, so the upper bound must be sys.maxunicode + 1 to scan them all.
punctuation = dict.fromkeys(
    code_point
    for code_point in range(sys.maxunicode + 1)
    if unicodedata.category(chr(code_point)).startswith('P')
)

#### For each string, remove any punctuation characters

In [22]:
[string.translate(punctuation) for string in text_data]

['Hi I Love This Song', '10000 Agree LoveIT', 'Right']

## 6.4 Tokenizing Text

### Tokenize words

In [23]:
from nltk.tokenize import word_tokenize

#### Create text

In [24]:
string = "The science of today is the technology of tomorrow"

#### Tokenize words

In [25]:
word_tokenize(string)

['The', 'science', 'of', 'today', 'is', 'the', 'technology', 'of', 'tomorrow']

### Tokenize sentences

In [26]:
from nltk.tokenize import sent_tokenize

#### Create text

In [27]:
string = "The science of today is the technology of tomorrow. Tomorrow is today."

#### Tokenize sentences

In [28]:
sent_tokenize(string)

['The science of today is the technology of tomorrow.', 'Tomorrow is today.']

## 6.5 Removing Stop Words

In [29]:
from nltk.corpus import stopwords

#### The set of stop words has to be downloaded the first time

In [30]:
import nltk

In [31]:
# nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\micha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [33]:
stopwords

<WordListCorpusReader in 'C:\\Users\\micha\\AppData\\Roaming\\nltk_data\\corpora\\stopwords'>

#### Create word tokens

In [32]:
tokenized_words = ['i',
                   'am',
                   'going',
                   'to',
                   'go',
                   'to',
                   'the',
                   'store',
                   'and',
                   'park']

#### Load stop words

In [35]:
stop_words = stopwords.words('english')

In [36]:
len(stop_words)

179

In [37]:
type(stop_words)

list

In [42]:
# Show every 17th stop word. The stop is left open rather than hard-coded to
# 179 so the sample spans the whole list whatever its length.
stop_words[::17]

['i',
 'he',
 'themselves',
 'being',
 'as',
 'below',
 'when',
 'only',
 'd',
 "hadn't",
 "shouldn't"]

#### Remove stop words

In [43]:
[word for word in tokenized_words if word not in stop_words]

['going', 'go', 'store', 'park']

## 6.6 Stemming Words

In [44]:
from nltk.stem.porter import PorterStemmer

#### Create word tokens

In [50]:
tokenized_words = ['i', 'am', 'humbled', 'by', 'this', 'traditional', 'meeting']

#### Create stemmer

In [46]:
porter = PorterStemmer()

#### Apply stemmer

In [51]:
[porter.stem(word) for word in tokenized_words]

['i', 'am', 'humbl', 'by', 'thi', 'tradit', 'meet']

## 6.7 Tagging Parts of Speech

In [52]:
from nltk import pos_tag
from nltk import word_tokenize

#### Create text

In [53]:
textData = "Chris loved outdoor running"

#### Use pre-trained part of speech tagger

The pre-trained tagger has to be downloaded before first use:

In [55]:
# import nltk
# nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\micha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [56]:
textTagged = pos_tag(word_tokenize(textData))

#### Show parts of speech

In [57]:
textTagged

[('Chris', 'NNP'), ('loved', 'VBD'), ('outdoor', 'RP'), ('running', 'VBG')]

#### Find all the nouns

In [58]:
[word for word, tag in textTagged if tag in ['NN', 'NNS', 'NNP', 'NNPS']]

['Chris']

### Find only tweets with at least one noun

#### Create text

In [59]:
tweets = ["I am eating a burrito for breakfast",
          "Political science is an amazing field",
          "San Francisco is an awesome city"]

#### Create list

In [60]:
taggedTweets = []

#### Tag each word in each tweet

In [61]:
# Rebuild the list from scratch so that re-running this cell does not append
# duplicate rows to a list left over from an earlier run.
taggedTweets = []
for tweet in tweets:
    # Use the directly imported `pos_tag` (as the single-sentence example does)
    # instead of going through the `nltk` module name.
    # `tweetTag` keeps the (word, tag) pairs of the most recently tagged tweet.
    tweetTag = pos_tag(word_tokenize(tweet))
    taggedTweets.append([tag for _, tag in tweetTag])

In [63]:
tweetTag

[('San', 'NNP'),
 ('Francisco', 'NNP'),
 ('is', 'VBZ'),
 ('an', 'DT'),
 ('awesome', 'JJ'),
 ('city', 'NN')]

In [62]:
taggedTweets

[['PRP', 'VBP', 'VBG', 'DT', 'NN', 'IN', 'NN'],
 ['JJ', 'NN', 'VBZ', 'DT', 'JJ', 'NN'],
 ['NNP', 'NNP', 'VBZ', 'DT', 'JJ', 'NN']]

#### Use one-hot encoding to convert the tags into features

In [65]:
from sklearn.preprocessing import MultiLabelBinarizer

In [67]:
oneHotMulti = MultiLabelBinarizer()

In [68]:
oneHotMulti.fit_transform(taggedTweets)

array([[1, 1, 0, 1, 0, 1, 1, 1, 0],
       [1, 0, 1, 1, 0, 0, 0, 0, 1],
       [1, 0, 1, 1, 1, 0, 0, 0, 1]])

#### Show feature names

In [69]:
oneHotMulti.classes_

array(['DT', 'IN', 'JJ', 'NN', 'NNP', 'PRP', 'VBG', 'VBP', 'VBZ'],
      dtype=object)

### Train a tagger

In [72]:
from nltk.corpus import brown
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

Download the Brown corpus:

In [74]:
# import nltk
# nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\micha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.


True

#### Get some text from the Brown Corpus, broken into sentences

In [75]:
sentences = brown.tagged_sents(categories='news')

In [76]:
len(sentences)

4623

#### Split into 4000 sentences for training and 623 for testing

In [77]:
train = sentences[:4000]
test = sentences[4000:]

#### Create backoff tagger

In [78]:
# Build a chain of n-gram taggers, all trained on the same training sentences.
# Each higher-order tagger receives the next-lower one as its `backoff`:
# trigram -> bigram -> unigram.
unigram = UnigramTagger(train)
bigram = BigramTagger(train, backoff=unigram)
trigram = TrigramTagger(train, backoff=bigram)

#### Show accuracy

In [79]:
# Score the trigram tagger on the held-out test sentences.
# NOTE(review): newer NLTK releases appear to deprecate `evaluate` in favor of
# `accuracy` — confirm against the installed version before switching.
trigram.evaluate(test)

0.8174734002697437

## 6.8 Encoding Text as a Bag of Words

In [82]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

#### Create text

In [83]:
text_data = np.array(['I love Brazil. Brazil!',
                      'Sweden is best',
                      'Germany beats both'])

#### Create the bag of words feature matrix

In [84]:
count = CountVectorizer()
bagOfWords = count.fit_transform(text_data)

#### Show feature matrix

In [85]:
bagOfWords

<3x8 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in Compressed Sparse Row format>

#### Use `toarray` to view a matrix of word counts for each observation

In [86]:
bagOfWords.toarray()

array([[0, 0, 0, 2, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 1, 0, 1],
       [1, 0, 1, 0, 1, 0, 0, 0]], dtype=int64)

#### Use the vectorizer's feature names to view the word associated with each feature

In [87]:
# `get_feature_names()` was removed in scikit-learn 1.2; its replacement
# `get_feature_names_out()` returns an ndarray, so convert it to a plain list
# for display.
count.get_feature_names_out().tolist()

['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love', 'sweden']

#### As a DataFrame, for clarity

In [89]:
import pandas as pd

In [90]:
# Label each count column with its word. `get_feature_names_out()` replaces
# `get_feature_names()`, which was removed in scikit-learn 1.2.
pd.DataFrame(bagOfWords.toarray(), columns=count.get_feature_names_out())

Unnamed: 0,beats,best,both,brazil,germany,is,love,sweden
0,0,0,0,2,0,0,1,0
1,0,1,0,0,0,1,0,1
2,1,0,1,0,1,0,0,0
