# Supplementary: Text Data Preparation

Objectives:
- To introduce students to different text pre-processing techniques.
- Students will gain hands-on experience through examples.

***

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

## 1) Text Cleaning

### 1.1) Noise Removal

#### Stopwords

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download NLTK resources (if not already downloaded)
# nltk.download('stopwords')
# nltk.download('punkt')

Here are examples of stopwords

In [3]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [4]:
print(stopwords.words('french'))

['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient', 'fusse', 'fusses', 'fût', 'fussions', 'fussiez', 'fussent', 'ayant', 'ayante', 'ayantes', 'ayants', 'eu', 'eue', 'eues', 'eus', 'ai', 'as', 'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons', 'aur

In [5]:
print(stopwords.words('german'))

['aber', 'alle', 'allem', 'allen', 'aller', 'alles', 'als', 'also', 'am', 'an', 'ander', 'andere', 'anderem', 'anderen', 'anderer', 'anderes', 'anderm', 'andern', 'anderr', 'anders', 'auch', 'auf', 'aus', 'bei', 'bin', 'bis', 'bist', 'da', 'damit', 'dann', 'der', 'den', 'des', 'dem', 'die', 'das', 'dass', 'daß', 'derselbe', 'derselben', 'denselben', 'desselben', 'demselben', 'dieselbe', 'dieselben', 'dasselbe', 'dazu', 'dein', 'deine', 'deinem', 'deinen', 'deiner', 'deines', 'denn', 'derer', 'dessen', 'dich', 'dir', 'du', 'dies', 'diese', 'diesem', 'diesen', 'dieser', 'dieses', 'doch', 'dort', 'durch', 'ein', 'eine', 'einem', 'einen', 'einer', 'eines', 'einig', 'einige', 'einigem', 'einigen', 'einiger', 'einiges', 'einmal', 'er', 'ihn', 'ihm', 'es', 'etwas', 'euer', 'eure', 'eurem', 'euren', 'eurer', 'eures', 'für', 'gegen', 'gewesen', 'hab', 'habe', 'haben', 'hat', 'hatte', 'hatten', 'hier', 'hin', 'hinter', 'ich', 'mich', 'mir', 'ihr', 'ihre', 'ihrem', 'ihren', 'ihrer', 'ihres', 'euc

In [6]:
print(stopwords.words('chinese'))

['一', '一下', '一些', '一切', '一则', '一天', '一定', '一方面', '一旦', '一时', '一来', '一样', '一次', '一片', '一直', '一致', '一般', '一起', '一边', '一面', '万一', '上下', '上升', '上去', '上来', '上述', '上面', '下列', '下去', '下来', '下面', '不一', '不久', '不仅', '不会', '不但', '不光', '不单', '不变', '不只', '不可', '不同', '不够', '不如', '不得', '不怕', '不惟', '不成', '不拘', '不敢', '不断', '不是', '不比', '不然', '不特', '不独', '不管', '不能', '不要', '不论', '不足', '不过', '不问', '与', '与其', '与否', '与此同时', '专门', '且', '两者', '严格', '严重', '个', '个人', '个别', '中小', '中间', '丰富', '临', '为', '为主', '为了', '为什么', '为什麽', '为何', '为着', '主张', '主要', '举行', '乃', '乃至', '么', '之', '之一', '之前', '之后', '之後', '之所以', '之类', '乌乎', '乎', '乘', '也', '也好', '也是', '也罢', '了', '了解', '争取', '于', '于是', '于是乎', '云云', '互相', '产生', '人们', '人家', '什么', '什么样', '什麽', '今后', '今天', '今年', '今後', '仍然', '从', '从事', '从而', '他', '他人', '他们', '他的', '代替', '以', '以上', '以下', '以为', '以便', '以免', '以前', '以及', '以后', '以外', '以後', '以来', '以至', '以至于', '以致', '们', '任', '任何', '任凭', '任务', '企图', '伟大', '似乎', '似的', '但', '但是', '何', '何况', '何处', '何时', '作为', '你', '你们', '你的', '使得', '使用'

Let's try with short sentence first.

In [7]:
# Example text
text = "This is an example sentence with some stopwords."

# Tokenize the text
words = word_tokenize(text)

# Get the English stopwords from NLTK
stop_words = set(stopwords.words('english'))

# Remove stopwords
filtered_words = [word for word in words if word.lower() not in stop_words]

# Print the original and filtered words
print("Original Words:", words)
print("\nFiltered Words:", filtered_words)


Original Words: ['This', 'is', 'an', 'example', 'sentence', 'with', 'some', 'stopwords', '.']

Filtered Words: ['example', 'sentence', 'stopwords', '.']


Let's use the tokenized words

In [8]:
# Sample text
text = "The amber droplet hung from the branch, reaching fullness and ready to drop. \
It waited. While many of the other droplets were satisfied to form as big as they could and release, \
this droplet had other plans. It wanted to be part of history. \
It wanted to be remembered long after all the other droplets had dissolved into history. \
So it waited for the perfect specimen to fly by to trap and \
capture that it hoped would eventually be discovered hundreds of years in the future."

In [9]:
import nltk
from nltk.tokenize import word_tokenize

# Tokenize the text
tokens_nltk = word_tokenize(text)

# Remove stopwords
filtered_tokens_nltk = [word for word in tokens_nltk if word.lower() not in stop_words]

# Print the original and filtered words
print("Original Words:", tokens_nltk)
print("\nFiltered Words:", filtered_tokens_nltk)

Original Words: ['The', 'amber', 'droplet', 'hung', 'from', 'the', 'branch', ',', 'reaching', 'fullness', 'and', 'ready', 'to', 'drop', '.', 'It', 'waited', '.', 'While', 'many', 'of', 'the', 'other', 'droplets', 'were', 'satisfied', 'to', 'form', 'as', 'big', 'as', 'they', 'could', 'and', 'release', ',', 'this', 'droplet', 'had', 'other', 'plans', '.', 'It', 'wanted', 'to', 'be', 'part', 'of', 'history', '.', 'It', 'wanted', 'to', 'be', 'remembered', 'long', 'after', 'all', 'the', 'other', 'droplets', 'had', 'dissolved', 'into', 'history', '.', 'So', 'it', 'waited', 'for', 'the', 'perfect', 'specimen', 'to', 'fly', 'by', 'to', 'trap', 'and', 'capture', 'that', 'it', 'hoped', 'would', 'eventually', 'be', 'discovered', 'hundreds', 'of', 'years', 'in', 'the', 'future', '.']

Filtered Words: ['amber', 'droplet', 'hung', 'branch', ',', 'reaching', 'fullness', 'ready', 'drop', '.', 'waited', '.', 'many', 'droplets', 'satisfied', 'form', 'big', 'could', 'release', ',', 'droplet', 'plans', '.

Compare number of words left after stopwords have been removed

In [10]:
print("Number of Original Words:", len(tokens_nltk))
print("\nNumber of Filtered Words:", len(filtered_tokens_nltk))

Number of Original Words: 94

Number of Filtered Words: 48


#### HTML tags

In [11]:
# Example HTML text with tags and formatting
html_text = """
<!DOCTYPE html>
<html>
<head>
<title>Sample HTML</title>
</head>
<body>
<h1>Welcome to my website</h1>
<p>This is a <b>sample</b> paragraph with <i>formatting</i>.</p>
<p>Here's a <a href="https://example.com">link</a> to another page.</p>
</body>
</html>
"""

In [12]:
from bs4 import BeautifulSoup
import re

def clean_html_tags(text):
    # Initialize BeautifulSoup with the HTML text
    soup = BeautifulSoup(text, 'html.parser')

    # Remove all HTML tags
    clean_text = soup.get_text(separator=' ')

    # Remove extra whitespace
    clean_text = re.sub(r'\s+', ' ', clean_text).strip()

    return clean_text

In [13]:
cleaned_text = clean_html_tags(html_text)
print(cleaned_text)

Sample HTML Welcome to my website This is a sample paragraph with formatting . Here's a link to another page.


Remember this?

In [14]:
s = ("Good muffins cost $3.88\nin New York.  Please buy me\n"
      "two of them.\n\nThanks.")
s2 = ("Alas, it has not rained today. When, do you think, "
      "will it rain again?")
s3 = ("<p>Although this is <b>not</b> the case here, we must "
      "not relax our vigilance!</p>")

In [15]:
s2

'Alas, it has not rained today. When, do you think, will it rain again?'

In [16]:
nltk.regexp_tokenize(s2, r'[,\.\?!"]\s*', gaps=True)

['Alas',
 'it has not rained today',
 'When',
 'do you think',
 'will it rain again']

In [17]:
s3

'<p>Although this is <b>not</b> the case here, we must not relax our vigilance!</p>'

In [18]:
nltk.regexp_tokenize(s3, r'</?.>', gaps=False)

['<p>', '<b>', '</b>', '</p>']

In [19]:
nltk.regexp_tokenize(s3, r'</?.>', gaps=True)

['Although this is ',
 'not',
 ' the case here, we must not relax our vigilance!']

#### Special characters

- Punctuation
- '!', '@', '#', '$', '%', '^', '&', '*'
- Non-Alphanumeric Characters
- Whitespace

In [20]:
import re

def clean_punctuation(text):
    # Define a regular expression pattern to match punctuation and special characters
    # Matches any character that is not a word character (\w), space (\s), or underscore (_)
    punctuation_pattern = re.compile(r'[^\w\s]|_')

    # Replace punctuation and special characters with an empty string
    cleaned_text = re.sub(punctuation_pattern, '', text)

    return cleaned_text

# Example text with punctuation and special characters
text_with_punctuation = "Hello! How are you doing? I'm doing fine, thank you!"

# Clean punctuation and special characters
cleaned_text = clean_punctuation(text_with_punctuation)
print(cleaned_text)


Hello How are you doing Im doing fine thank you


### 1.2) Character Normalization

In [21]:
import unicodedata

def normalize_characters(text):
    # Normalize text by decomposing Unicode characters and then removing combining characters
    normalized_text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')

    # Convert text to lowercase
    normalized_text = normalized_text.lower()

    return normalized_text

# Example text with accented characters
text_with_accented_characters = "Café au Lait"

# Normalize characters
normalized_text = normalize_characters(text_with_accented_characters)
print(normalized_text)


cafe au lait


In [22]:
# !pip install textacy

In [23]:
from textacy import preprocessing

def normalize_characters_with_textacy(text):
    # Normalize text using Textacy's normalize_whitespace function
    normalized_text = preprocessing.normalize.whitespace(text)

    # Lowercase the text
    normalized_text = normalized_text.lower()

    return normalized_text

# Example text with accented characters
text_with_accented_characters = "Café au Lait"

# Normalize characters using Textacy
normalized_text = normalize_characters_with_textacy(text_with_accented_characters)
print(normalized_text)


café au lait


### 1.3) Data Masking

In [24]:
import re

def mask_email_addresses(text):
    # Define a regular expression pattern to match email addresses
    email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b')

    # Replace email addresses with a generic placeholder
    masked_text = re.sub(email_pattern, '[EMAIL]', text)

    return masked_text

def mask_phone_numbers(text):
    # Define a regular expression pattern to match phone numbers
    phone_pattern = re.compile(r'(\d{3})-(\d{3})-(\d{4})')

    # Replace phone numbers with a generic placeholder
    masked_text = re.sub(phone_pattern, '[PHONE]', text)

    return masked_text

# Example text with email addresses and phone numbers
text_with_sensitive_info = "Please contact me at john.doe@example.com or 123-456-7890. Thank you!"

# Mask sensitive information
masked_text = mask_email_addresses(text_with_sensitive_info)
masked_text = mask_phone_numbers(masked_text)
print(masked_text)


Please contact me at [EMAIL] or [PHONE]. Thank you!


***

## 2) Text Pre-processing

### 2.1) Tokenization

In [25]:
text = "The amber droplet hung from the branch, reaching fullness and ready to drop. \
It waited. While many of the other droplets were satisfied to form as big as they could and release, \
this droplet had other plans. It wanted to be part of history. \
It wanted to be remembered long after all the other droplets had dissolved into history. \
So it waited for the perfect specimen to fly by to trap and \
capture that it hoped would eventually be discovered hundreds of years in the future."

# Before tokenization
print(text)

The amber droplet hung from the branch, reaching fullness and ready to drop. It waited. While many of the other droplets were satisfied to form as big as they could and release, this droplet had other plans. It wanted to be part of history. It wanted to be remembered long after all the other droplets had dissolved into history. So it waited for the perfect specimen to fly by to trap and capture that it hoped would eventually be discovered hundreds of years in the future.


Implementation from scratch

In [26]:
import string

# Convert text to lowercase
text = text.lower()

# Remove punctuation
text = text.translate(str.maketrans("", "", string.punctuation))

# Split the text into words
tokens = text.split()

In [27]:
len(tokens)

86

In [28]:
# After tokenization
tokens

['the',
 'amber',
 'droplet',
 'hung',
 'from',
 'the',
 'branch',
 'reaching',
 'fullness',
 'and',
 'ready',
 'to',
 'drop',
 'it',
 'waited',
 'while',
 'many',
 'of',
 'the',
 'other',
 'droplets',
 'were',
 'satisfied',
 'to',
 'form',
 'as',
 'big',
 'as',
 'they',
 'could',
 'and',
 'release',
 'this',
 'droplet',
 'had',
 'other',
 'plans',
 'it',
 'wanted',
 'to',
 'be',
 'part',
 'of',
 'history',
 'it',
 'wanted',
 'to',
 'be',
 'remembered',
 'long',
 'after',
 'all',
 'the',
 'other',
 'droplets',
 'had',
 'dissolved',
 'into',
 'history',
 'so',
 'it',
 'waited',
 'for',
 'the',
 'perfect',
 'specimen',
 'to',
 'fly',
 'by',
 'to',
 'trap',
 'and',
 'capture',
 'that',
 'it',
 'hoped',
 'would',
 'eventually',
 'be',
 'discovered',
 'hundreds',
 'of',
 'years',
 'in',
 'the',
 'future']

Using NLTK library

```from nltk.tokenize import word_tokenize```

In [29]:
import nltk
from nltk.tokenize import word_tokenize

# Tokenize the text
tokens_nltk = word_tokenize(text)

In [30]:
len(tokens_nltk)

86

### 2.2) Lemmatization/Stemming

Use NLTK library ```WordNetLemmatizer```

```
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
```

In [31]:
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Let's first try stemming some words

In [32]:
lemmatizer = WordNetLemmatizer()

print("rocks :", lemmatizer.lemmatize("rocks"))

print("corpora :", lemmatizer.lemmatize("corpora"))

# a denotes adjective in "pos"
print("better :", lemmatizer.lemmatize("better", pos="a"))

# a denotes verb in "pos"
print("running :", lemmatizer.lemmatize("running", pos="v"))

# a denotes verb in "pos"
print("swam :", lemmatizer.lemmatize("swam", pos="v"))

print("swum :", lemmatizer.lemmatize("swum", pos="v"))

rocks : rock
corpora : corpus
better : good
running : run
swam : swim
swum : swim


Use Spacy library

*Note: Use the following command if getting a linkage error.*

```python -m spacy download en```

In [33]:
import spacy

# Load the spaCy English model
nlp = spacy.load('en_core_web_sm')

In [34]:
# Define a sample text
text = "Three men are walking."

# Process the text using spaCy
doc = nlp(text)

# Extract lemmatized tokens
lemmatized_tokens = [token.lemma_ for token in doc]

# Join the lemmatized tokens into a sentence
lemmatized_text = ' '.join(lemmatized_tokens)

# Print the original and lemmatized text
print("Original Text:", text)
print("\nLemmatized Text:", lemmatized_text)

Original Text: Three men are walking.

Lemmatized Text: three man be walk .


Let's use longer sentences.

In [35]:
text = "The amber droplet hung from the branch, reaching fullness and ready to drop. \
It waited. While many of the other droplets were satisfied to form as big as they could and release, \
this droplet had other plans. It wanted to be part of history. \
It wanted to be remembered long after all the other droplets had dissolved into history. \
So it waited for the perfect specimen to fly by to trap and \
capture that it hoped would eventually be discovered hundreds of years in the future."

# Process the text using spaCy
doc = nlp(text)

# Extract lemmatized tokens
lemmatized_tokens = [token.lemma_ for token in doc]

# Join the lemmatized tokens into a sentence
lemmatized_text = ' '.join(lemmatized_tokens)

# Print the original and lemmatized text
print("Original Text:", text)
print("\nLemmatized Text:", lemmatized_text)

Original Text: The amber droplet hung from the branch, reaching fullness and ready to drop. It waited. While many of the other droplets were satisfied to form as big as they could and release, this droplet had other plans. It wanted to be part of history. It wanted to be remembered long after all the other droplets had dissolved into history. So it waited for the perfect specimen to fly by to trap and capture that it hoped would eventually be discovered hundreds of years in the future.

Lemmatized Text: the amber droplet hang from the branch , reach fullness and ready to drop . it wait . while many of the other droplet be satisfied to form as big as they could and release , this droplet have other plan . it want to be part of history . it want to be remember long after all the other droplet have dissolve into history . so it wait for the perfect speciman to fly by to trap and capture that it hope would eventually be discover hundred of year in the future .


### 2.3) Part-of-speech Tagging

In [36]:
import spacy

# Load the English language model
nlp = spacy.load("en_core_web_sm")

def pos_tagging(text):
    # Process the text using spaCy
    doc = nlp(text)

    # Extract POS tags for each token in the text
    pos_tags = [(token.text, token.pos_) for token in doc]

    return pos_tags

# Example text
text = "I love eating pizza with my friends"

# Perform POS tagging
pos_tags = pos_tagging(text)
print(pos_tags)


[('I', 'PRON'), ('love', 'VERB'), ('eating', 'VERB'), ('pizza', 'NOUN'), ('with', 'ADP'), ('my', 'PRON'), ('friends', 'NOUN')]


### 2.4) Name-Entity Recognition (NER)

In [37]:
import spacy

# Load the English language model
nlp = spacy.load("en_core_web_sm")

def ner(text):
    # Process the text using spaCy
    doc = nlp(text)

    # Extract named entities from the processed text
    entities = [(entity.text, entity.label_) for entity in doc.ents]

    return entities

# Example text
text = "KMUTT is located in Bangkok, Thailand."

# Perform NER
named_entities = ner(text)
print(named_entities)


[('KMUTT', 'ORG'), ('Bangkok', 'GPE'), ('Thailand', 'GPE')]


ORG: Organization\
GPE: Geopolitical Entity