# Links
- [Kaggle](https://www.kaggle.com/code/balatmak/text-preprocessing-steps-and-universal-pipeline)

# To Do
Create a one functoin code to run everything on the pipeline

In [1]:
import nltk

In [2]:
example_text = """
An explosion targeting a tourist bus has injured at least 16 people near the Grand Egyptian Museum, 
next to the pyramids in Giza, security sources say E.U.

South African tourists are among the injured. Most of those hurt suffered minor injuries, 
while three were treated in hospital, N.A.T.O. say.

http://localhost:8888/notebooks/Text%20preprocessing.ipynb

@nickname of twitter user and his email is email@gmail.com . 

A device went off close to the museum fence as the bus was passing on 16/02/2012.
"""

# Tokenization

Assume splitting text into **Tokens** (Words, sentences etc.)

- Spacy tokenized some weird staff like \n, \n\n, but was able to handle urls, emails and twitter-like mentions. Also we see that nltk tokenized abbreviations without the last .

In [3]:
from nltk.tokenize import sent_tokenize, word_tokenize

nltk_words = word_tokenize(example_text)
display(f"Tokenized words: {nltk_words}")

"Tokenized words: ['An', 'explosion', 'targeting', 'a', 'tourist', 'bus', 'has', 'injured', 'at', 'least', '16', 'people', 'near', 'the', 'Grand', 'Egyptian', 'Museum', ',', 'next', 'to', 'the', 'pyramids', 'in', 'Giza', ',', 'security', 'sources', 'say', 'E.U', '.', 'South', 'African', 'tourists', 'are', 'among', 'the', 'injured', '.', 'Most', 'of', 'those', 'hurt', 'suffered', 'minor', 'injuries', ',', 'while', 'three', 'were', 'treated', 'in', 'hospital', ',', 'N.A.T.O', '.', 'say', '.', 'http', ':', '//localhost:8888/notebooks/Text', '%', '20preprocessing.ipynb', '@', 'nickname', 'of', 'twitter', 'user', 'and', 'his', 'email', 'is', 'email', '@', 'gmail.com', '.', 'A', 'device', 'went', 'off', 'close', 'to', 'the', 'museum', 'fence', 'as', 'the', 'bus', 'was', 'passing', 'on', '16/02/2012', '.']"

In [4]:
!python3 -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [5]:
import spacy
import en_core_web_sm

nlp = en_core_web_sm.load()

doc = nlp(example_text)
spacy_words = [token.text for token in doc]
display(f"Tokenized words: {spacy_words}")

"Tokenized words: ['\\n', 'An', 'explosion', 'targeting', 'a', 'tourist', 'bus', 'has', 'injured', 'at', 'least', '16', 'people', 'near', 'the', 'Grand', 'Egyptian', 'Museum', ',', '\\n', 'next', 'to', 'the', 'pyramids', 'in', 'Giza', ',', 'security', 'sources', 'say', 'E.U.', '\\n\\n', 'South', 'African', 'tourists', 'are', 'among', 'the', 'injured', '.', 'Most', 'of', 'those', 'hurt', 'suffered', 'minor', 'injuries', ',', '\\n', 'while', 'three', 'were', 'treated', 'in', 'hospital', ',', 'N.A.T.O.', 'say', '.', '\\n\\n', 'http://localhost:8888', '/', 'notebooks', '/', 'Text%20preprocessing.ipynb', '\\n\\n', '@nickname', 'of', 'twitter', 'user', 'and', 'his', 'email', 'is', 'email@gmail.com', '.', '\\n\\n', 'A', 'device', 'went', 'off', 'close', 'to', 'the', 'museum', 'fence', 'as', 'the', 'bus', 'was', 'passing', 'on', '16/02/2012', '.', '\\n']"

In [32]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x3127af590>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x3127af290>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x3127c7990>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x312acd350>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x3127dd750>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x3127c7ae0>)]

# Cleaning

## Lower Casing

In [24]:
example_text.lower()

'\nan explosion targeting a tourist bus has injured at least 16 people near the grand egyptian museum, \nnext to the pyramids in giza, security sources say e.u.\n\nsouth african tourists are among the injured. most of those hurt suffered minor injuries, \nwhile three were treated in hospital, n.a.t.o. say.\n\nhttp://localhost:8888/notebooks/text%20preprocessing.ipynb\n\n@nickname of twitter user and his email is email@gmail.com . \n\na device went off close to the museum fence as the bus was passing on 16/02/2012.\n'

## Punctuation Removal
+ Do after tokenization
+ Useful for TF-IDF, CountVectorization, BinaryVectorization etc.

In [6]:
import string

display(f"Punctuation symbols: {string.punctuation}")

'Punctuation symbols: !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [7]:
text_with_punct = "@nickname of twitter user, and his email is email@gmail.com ."

In [8]:
text_without_punct = text_with_punct.translate(str.maketrans('', '', string.punctuation))
display(f"Text without punctuation: {text_without_punct}")

'Text without punctuation: nickname of twitter user and his email is emailgmailcom '

In [9]:
# Here, emails were not detected
# In tokenization, punctuation symbols were parsed as single tokens, so better way would be to tokenize first and then remove punctuation symbols.

doc = nlp(text_with_punct)
tokens = [t.text for t in doc]
# python 
tokens_without_punct_python = [t for t in tokens if t not in string.punctuation]
display(f"Python based removal: {tokens_without_punct_python}")

tokens_without_punct_spacy = [t.text for t in doc if t.pos_ != 'PUNCT']
display(f"Spacy based removal: {tokens_without_punct_spacy}")

"Python based removal: ['@nickname', 'of', 'twitter', 'user', 'and', 'his', 'email', 'is', 'email@gmail.com']"

"Spacy based removal: ['@nickname', 'of', 'twitter', 'user', 'and', 'his', 'email', 'is', 'email@gmail.com']"

## Stopwords Removal
+ Words that usually does not bring additional meaning

In [10]:
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

In [11]:
from nltk.corpus import stopwords

nltk.download('stopwords', download_dir = "/Users/daver/Desktop/NLP_Lab_Exam_Codes/.venv/nltk_data")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/daver/Desktop/NLP_Lab_Exam_Codes/.venv/nltk_dat
[nltk_data]     a...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
text = "This movie is just not good enough"

stopwords = stopwords.words('english')

text_without_stop_words = [t.text for t in nlp(text) if not t.is_stop]
display(f"Spacy text without stop words: {text_without_stop_words}")


text_without_stop_words = [t for t in word_tokenize(text) if t not in stopwords]
display(f"nltk text without stop words: {text_without_stop_words}")

"Spacy text without stop words: ['movie', 'good']"

"nltk text without stop words: ['This', 'movie', 'good', 'enough']"

## Other Cleaning
1. Removal of Emojis
1. Removal of Emoticons
1. Removal of URLs
1. Removal of HTML Tags


In [28]:
import re

def remove_emojis(data):
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoj, '', data)

def remove_emoticons(data):
    emoticons = re.compile(r'(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)')
    return re.sub(emoticons, '', data)

def remove_urls(data):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', data)

def remove_html(data):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', data)

def clean_text(data):
    data = remove_emojis(data)
    data = remove_emoticons(data)
    data = remove_urls(data)
    data = remove_html(data)
    return data

In [30]:
import random
import faker
from faker.providers import internet

fake = faker.Faker()

emojis = ["😀", "😃", "😄", "😁", "😆", "😅", "😂", "🤣", "😊", "😇"]
emoticons = [":)", ":(", ":D", ":O", ":P", ";)", ":*", ":/", ":|", ":$", ":^)", ":-)", ":=)", ":]", ":}", ":>", ":3"]
html_tags = ["<p>", "</p>", "<a>", "</a>", "<div>", "</div>", "<span>", "</span>", "<img>", "</img>", "<body>", "</body>", "<header>", "</header>", "<footer>", "</footer>"]


random_text = " ".join(fake.text().split()[:30])
random_text += " " + random.choice(emojis)
random_text += " " + random.choice(emoticons)
random_text += " " + fake.url()
random_text += " " + random.choice(html_tags)

print(random_text)


Stay year where amount body charge. Choice option represent. Like four pretty wish consumer responsibility. Want buy clearly position issue wrong. Class science age experience report arm short. 😂 :| http://cooper.org/ </a>


In [31]:
clean = clean_text(random_text)
print(clean)

Stay year where amount body charge. Choice option represent. Like four pretty wish consumer responsibility. Want buy clearly position issue wrong. Class science age experience report arm short.    


# Normalization
normalization is a convertion of any non-text information into textual equivalent.


+ Converting dates to text
+ Numbers to text
+ Currency/Percent signs to text
+ Expanding of abbreviations (content dependent)
+ Spelling mistakes correction


In [None]:
# # This does not work, as scikit-learn has removed a function

# from normalise import normalise

# text = """
# On the 13 Feb. 2007, Theresa May announced on MTV news that the rate of childhod obesity had 
# risen from 7.3-9.6% in just 3 years , costing the N.A.T.O £20m
# """

# user_abbr = {
#     "N.A.T.O": "North Atlantic Treaty Organization"
# }

# normalized_tokens = normalise(word_tokenize(text), user_abbrevs=user_abbr, verbose=False)
# display(f"Normalized text: {' '.join(normalized_tokens)}")

# Stemming and Lemmatization
1. **Stemming** - Process of reducing inflection in words to their root forms
    + Not very reliable, as it's just truncating the words based on some rules.
2. **Lemmatization** - Reduces to its proper dictionary / root form

In [18]:
text = """
On the 13 Feb. 2007, Theresa May announced on MTV news that the rate of childhod obesity had 
risen from 7.3-9.6% in just 3 years , costing the N.A.T.O £20m
"""

In [19]:
from nltk.stem import PorterStemmer
import numpy as np

normalized_tokens = word_tokenize(text)

text = ' '.join(normalized_tokens)
tokens = word_tokenize(text)

In [20]:
porter = PorterStemmer()
stem_words = np.vectorize(porter.stem)
stemed_text = ' '.join(stem_words(tokens))
display(f"Stemed text: {stemed_text}")

'Stemed text: on the 13 feb. 2007 , theresa may announc on mtv news that the rate of childhod obes had risen from 7.3-9.6 % in just 3 year , cost the n.a.t.o £20m'

In [21]:
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()
lemmatize_words = np.vectorize(wordnet_lemmatizer.lemmatize)
lemmatized_text = ' '.join(lemmatize_words(tokens))
display(f"nltk lemmatized text: {lemmatized_text}")

'nltk lemmatized text: On the 13 Feb. 2007 , Theresa May announced on MTV news that the rate of childhod obesity had risen from 7.3-9.6 % in just 3 year , costing the N.A.T.O £20m'

In [22]:
# Spacy is better than nltk in lemmatization

lemmas = [t.lemma_ for t in nlp(text)]
display(f"Spacy lemmatized text: {' '.join(lemmas)}")

'Spacy lemmatized text: on the 13 Feb. 2007 , Theresa may announce on MTV news that the rate of childhod obesity have rise from 7.3 - 9.6 % in just 3 year , cost the N.A.T.O £ 20 m'