# Text pre-processing for NLP tasks.

While training or fine tunning an LLM, large amounts of textual data required.
The raw textual data has to undergo a process of modification and cleaning of the text, on a level of a single word, reducing it to a format that can be given to a model as input for training.

In this notebook ill go over 4 pre-processing activities:

- Statistical analysis: understanding the structure of the raw text data, num of words, most\least used etc.

- Tokenization: seperating the text to 'word size' chunks - tokens

- Lemmatization: reducing a single word to its dictionary form

- Stemming: croping a word from its suffix to get the root.

There are 2 main python libraries that can preform this tasks and they are [nltk](https://www.nltk.org/) and [spacy](https://spacy.io/) as well as Pandas for basic CSV format data access, insight and manipuation.

In this notebook i'll compare the performance of the two libraries on the preprocessing tasks.


Note: spacy does not have stemming


The dataset for this lovely experiment:  
https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset

In [24]:
import pandas as pd

# Load the spam.csv dataset from Google Drive
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/NLP/spam.csv', encoding='latin-1')
print(df.keys())
print(df.head())
print('\n\n')
print(str(df['Unnamed: 2'].count()) + '/' + str(len(df)))
print(str(df['Unnamed: 3'].count()) + '/' + str(len(df)))
print(str(df['Unnamed: 4'].count()) + '/' + str(len(df)))
print('\n\n')
# Looks like few columns are not relevant for my porpose
df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)
print(df.keys())
df = df.rename(columns={'v1': 'Category', 'v2': 'Message'})
print(df.keys())
print(df.head())

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')
     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  



50/5572
12/5572
6/5572



Index(['v1', 'v2'], dtype='object')
Index(['Category', 'Message'], dtype='object')
  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a

In [25]:
# Print basic statistics on the data
print("Total number of SMS messages:", len(df))
print("Number of spam messages:", df['Category'].value_counts()['spam'])
print("Number of ham messages:", df['Category'].value_counts()['ham'])
print('\n\n')

# Average number of words per message
words_per_message = df['Message'].apply(lambda x: len(x.split()))
print("Average number of words per message:", words_per_message.mean())
print('\n\n')

# 5 most frequent words
all_words_series = pd.Series(' '.join(df['Message']).split())
word_frequencies = all_words_series.value_counts()
most_common_words = word_frequencies.head(5)
least_common_words = word_frequencies.tail(5)
print("5 most frequent words:")
for word, frequency in most_common_words.items():
    print(f"  {word}: {frequency}")
print('\n\n')

# Number of words that only appear once
words_appearing_once = sum(word_frequencies == 1)
print(f"Number of words that only appear once: {words_appearing_once}")
print(f"5 least frequent words:\n {least_common_words}")
print('\n\n')

Total number of SMS messages: 5572
Number of spam messages: 747
Number of ham messages: 4825



Average number of words per message: 15.494436468054559



5 most frequent words:
  to: 2134
  you: 1622
  I: 1466
  a: 1327
  the: 1197



Number of words that only appear once: 9268
5 least frequent words:
 AOM          1
Box61,M60    1
1ER          1
Ages         1
Rofl.        1
Name: count, dtype: int64





In [26]:
def analyze_tokens(tokens):
  # Calculate the number of items in all lists
  total_items = 0
  for token_list in tokens:
    total_items += len(token_list)
  print(f"Total number of tokens: {total_items}")

  # Calculate the most and least used words
  all_words = []
  for token_list in tokens:
    all_words.extend(token_list)
  word_frequencies = pd.Series(all_words).value_counts()
  most_common_words = word_frequencies.head(5)
  least_common_words = word_frequencies.tail(5)

  # Print the results
  print("5 most frequent words:")
  for word, frequency in most_common_words.items():
    print(f"  {word}: {frequency}")

  print("\n5 least frequent words:")
  for word, frequency in least_common_words.items():
    print(f"  {word}: {frequency}")



In [27]:
# High level libraries were imported on purpose, instead of individual modules, to see the dependencis
import nltk
import spacy
import time


# Download necessary NLTK data
nltk.download('punkt')
nltk.download('wordnet')

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [28]:
# The first original SMS message for reference
print("Original Text:\n",df['Message'].loc[0])

# Tokenize the SMS text using nltk
start_time = time.time()
nltk_tokens = [nltk.word_tokenize(message) for message in df['Message']]
end_time = time.time()
nltk_tokenization_time = end_time - start_time

print("NLTK tokenization:\n",nltk_tokens[0])
print("\n\n")

# Tokenize the SMS text using spaCy
start_time = time.time()
#spacy_tokens = [nlp(message) for message in df['Message']]
spacy_tokens = [[token for token in nlp(message)] for message in df['Message']]
end_time = time.time()

tokens_spacy = []
lemmas_spacy = []
# extract text and lemmas from spacy
for list_of_tokens in spacy_tokens:
  lst = [list_of_tokens[i].text for i in range(len(list_of_tokens))]
  tokens_spacy.append(lst)
  lst = [list_of_tokens[i].lemma_ for i in range(len(list_of_tokens))]
  lemmas_spacy.append(lst)

spacy_tokenization_time = end_time - start_time
print("spaCy tokenization:\n",spacy_tokens[0])
print("\n\n")


print("Time complexity of NLTK tokenization:", nltk_tokenization_time)
analyze_tokens(nltk_tokens)
print('\n\n')

print("Time complexity of spaCy tokenization:", spacy_tokenization_time)
analyze_tokens(tokens_spacy)

Original Text:
 Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
NLTK tokenization:
 ['Go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'Available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'Cine', 'there', 'got', 'amore', 'wat', '...']



spaCy tokenization:
 [Go, until, jurong, point, ,, crazy, .., Available, only, in, bugis, n, great, world, la, e, buffet, ..., Cine, there, got, amore, wat, ...]



Time complexity of NLTK tokenization: 1.2772295475006104
Total number of tokens: 104193
5 most frequent words:
  .: 4886
  to: 2148
  I: 1956
  you: 1888
  ,: 1871

5 least frequent words:
  creativity: 1
  typical: 1
  strongly: 1
  08717168528: 1
  bitching: 1



Time complexity of spaCy tokenization: 64.05909442901611
Total number of tokens: 103533
5 most frequent words:
  .: 4945
  to: 2148
  I: 1988
  you: 1878
  ,: 1857

5 least frequent words:
  merry: 1
  x'mas: 1
  recieve: 1
  24h

In [29]:
print("find differences:")
for i in range(len(nltk_tokens)):
  if nltk_tokens[i] != tokens_spacy[i]:
    print(nltk_tokens[i])
    print(tokens_spacy[i])
    print('\n\n')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
['Dunno', 'lei', '...', 'I', 'might', 'b', 'eatin', 'wif', 'my', 'frens', '...', 'If', 'Ì_', 'wan', 'to', 'eat', 'then', 'i', 'wait', '4', 'Ì_', 'lar']
['Dunno', 'lei', '...', 'I', 'might', 'b', 'eatin', 'wif', 'my', 'frens', '...', 'If', 'Ì', '_', 'wan', 'to', 'eat', 'then', 'i', 'wait', '4', 'Ì', '_', 'lar']



['FREE', 'entry', 'into', 'our', 'å£250', 'weekly', 'comp', 'just', 'send', 'the', 'word', 'WIN', 'to', '80086', 'NOW', '.', '18', 'T', '&', 'C', 'www.txttowin.co.uk']
['FREE', 'entry', 'into', 'our', 'å£250', 'weekly', 'comp', 'just', 'send', 'the', 'word', 'WIN', 'to', '80086', 'NOW', '.', '18', 'T&C', 'www.txttowin.co.uk']



['Say', 'this', 'slowly.', '?', 'GOD', ',', 'I', 'LOVE', 'YOU', '&', 'amp', ';', 'I', 'NEED', 'YOU', ',', 'CLEAN', 'MY', 'HEART', 'WITH', 'YOUR', 'BLOOD.Send', 'this', 'to', 'Ten', 'special', 'people', '&', 'amp', ';', 'u', 'c', 'miracle', 'tomorrow', ',', 'do', 'it', ',', 'pls', ',', 'pl

# Example of different tokenization for same SMS message:


NLTK Tokens: ['Where', 'are', 'you', '?', 'You', 'said', 'you', 'would', 'be', 'here', 'when', 'I', 'woke', '...', ':', '-', '(']


spaCy Tokens: ['Where', 'are', 'you', '?', 'You', 'said', 'you', 'would', 'be', 'here', 'when', 'I', 'woke', '...', ':-(']

This example demonstrates the difference between the two tokenizers. While NLTK is simple tokenizer that works fast and seperates the text to special signs, spaCy is more intelligent and realises that the :-( is a smile and keeps it as a token instead of seperating it to 3 tokens like NLTK. For this reason also NLTK has more tokens then spaCy. Also we can notice that in the 'least frequent words' of spaCy like '24hrs' or 'x'mas', which have a meaning as a token and not gettin seperated to '24' and 'hrs' or 'x' and 'mas'. This smartness comes with a price in a form of performance. SpaCy tokenization takes much longer than NLTK because it not just creating the tokens but a Doc object containing various information about the processed text such as tokens, lemmas, language used, POS (part of speech) etc.

In [30]:
# Lemmatize the SMS text using nltk
start_time = time.time()
nltk_lemmas = [[nltk.stem.WordNetLemmatizer().lemmatize(word) for word in tokens] for tokens in nltk_tokens]
end_time = time.time()
nltk_lemmatization_time = end_time - start_time
print("NLTK lemmatization:\n",nltk_lemmas[0])
print("\n\n")


# Lemmatize the SMS text using spaCy
start_time = time.time()
#spacy_lemmas = [[token for token in doc] for doc in spacy_tokens]
# Lemmatization already in the doc object and was extracted earlier
end_time = time.time()
spacy_lemmatization_time = end_time - start_time
print("spaCy lemmatization:\n",lemmas_spacy[0])
print("\n\n")

print("Time complexity of NLTK lemmatization:", nltk_lemmatization_time)
#nltk_lemma_df = pd.DataFrame({'messages': [item for sublist in nltk_lemmas for item in sublist]})
analyze_tokens(nltk_lemmas)

print("Time complexity of spaCy lemmatization:", spacy_lemmatization_time)
#spacy_lemma_df = pd.DataFrame({'messages': [item for sublist in spacy_lemmas for item in sublist]})
analyze_tokens(lemmas_spacy)

NLTK lemmatization:
 ['Go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'Available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'Cine', 'there', 'got', 'amore', 'wat', '...']



spaCy lemmatization:
 ['go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'Cine', 'there', 'get', 'amore', 'wat', '...']



Time complexity of NLTK lemmatization: 1.479539394378662
Total number of tokens: 104193
5 most frequent words:
  .: 4886
  to: 2148
  I: 1956
  you: 1888
  ,: 1871

5 least frequent words:
  85233: 1
  latelyxxx: 1
  donyt: 1
  hun-onbus: 1
  bitching: 1
Time complexity of spaCy lemmatization: 3.075599670410156e-05
Total number of tokens: 103533
5 most frequent words:
  .: 4945
  I: 3722
  be: 3260
  to: 2309
  you: 2217

5 least frequent words:
  problematic: 1
  unconscious: 1
  abnormally: 1
  9755: 1
  bitching: 1


# Difference in Lemmatization

By analysing the most frequint words that the two lemmatizers generated, we can notice then spCy reduce words like "I'm" to "I" and "am" which then reduced to its lemma - be. This kind of preprocessing is much more acurate for understanding the language.

Another not that the lemma of spacy been done already and was extracted earlier along with tokens, hence the computational time has really no meaning here.

In [31]:
print("Original Text:\n", df['Message'].loc[0])
# Stem the SMS text using nltk
start_time = time.time()
nltk_stems = [[nltk.stem.PorterStemmer().stem(word) for word in tokens] for tokens in nltk_tokens]
end_time = time.time()
nltk_stemming_time = end_time - start_time
print("NLTK stemming:\n",nltk_stems[0])
print("\n\n")

# Stem the SMS text using spaCy - as been noted before spacy does not have stemming functionality

print("Time complexity of NLTK stemming:", nltk_stemming_time)
nltk_stem_df = pd.DataFrame({'messages': [item for sublist in nltk_stems for item in sublist]})
analyze_tokens(nltk_stems)


Original Text:
 Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
NLTK stemming:
 ['go', 'until', 'jurong', 'point', ',', 'crazi', '..', 'avail', 'onli', 'in', 'bugi', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'cine', 'there', 'got', 'amor', 'wat', '...']



Time complexity of NLTK stemming: 2.679251194000244
Total number of tokens: 104193
5 most frequent words:
  .: 4886
  i: 2900
  to: 2241
  you: 2228
  ,: 1871

5 least frequent words:
  age16.150ppermesssubscript: 1
  eggs-pert: 1
  potato: 1
  headû_: 1
  piti: 1


Stemming chops typicly the ending of a word in an attempt to reduce it to root form. As we can see "crazy" became "crazi" and "Avaliable" became "avail".

Depends on application, stemming may be a usefull preprocess to preform, but important to understand that the output can be consist of not meaningfull, not to say made up, words.

# Web Scraping

fun way to find textual data online

In [32]:
# prompt: Use BeautifulSoup to scrape text data from a wikipidia NLP page in English.
# Perform tokenization, lemmatization, and stemming on the scraped text with nltk.
# Print word statistics on the scraped data before and after text processing.

import requests
from bs4 import BeautifulSoup

# Download the nltk data
nltk.download('stopwords')

# Define the URL to scrape
url = "https://en.wikipedia.org/wiki/Natural_language_processing"

# Fetch the content from the URL
response = requests.get(url)

# Parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')

# Extract the text from the page
text = soup.get_text()

# Tokenize the text
tokens = nltk.word_tokenize(text)

# Lemmatize the tokens
lemmatized_tokens = [nltk.stem.WordNetLemmatizer().lemmatize(token) for token in tokens]

# Stem the tokens
stemmed_tokens = [nltk.stem.PorterStemmer().stem(token) for token in tokens]

# Calculate the number of tokens, lemmas, and stems
num_tokens = len(tokens)
num_lemmas = len(lemmatized_tokens)
num_stems = len(stemmed_tokens)

# Print the word statistics
print("Number of tokens:", num_tokens)
print("Number of lemmas:", num_lemmas)
print("Number of stems:", num_stems)

# Print the most frequent tokens, lemmas, and stems
most_frequent_tokens = nltk.FreqDist(tokens).most_common(10)
most_frequent_lemmas = nltk.FreqDist(lemmatized_tokens).most_common(10)
most_frequent_stems = nltk.FreqDist(stemmed_tokens).most_common(10)

print("Most frequent tokens:", most_frequent_tokens)
print("Most frequent lemmas:", most_frequent_lemmas)
print("Most frequent stems:", most_frequent_stems)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Number of tokens: 9109
Number of lemmas: 9109
Number of stems: 9109
Most frequent tokens: [(',', 505), ('.', 474), ('of', 267), ('the', 255), ('(', 209), (')', 207), ('and', 148), ('a', 145), ('in', 107), ('to', 106)]
Most frequent lemmas: [(',', 505), ('.', 474), ('of', 267), ('the', 255), ('(', 209), (')', 207), ('a', 192), ('and', 148), ('in', 107), ('to', 106)]
Most frequent stems: [(',', 505), ('.', 474), ('the', 291), ('of', 267), ('(', 209), (')', 207), ('a', 158), ('and', 148), ('languag', 121), ('in', 118)]


In [33]:
# Install hebspacy
#!pip install hebspacy
# The library fails to install. As far as i could find out, there is a lisence isuese with spaCy and hebrew models
# so currently they are not avaliabe.

In [34]:
# The model 'he_core_news_sm' not avaliable in spacy so its not possiable to try the preprocessing on right to left language
# without training my own specific model for it

# # Load the Hebrew language model
# nlp = spacy.load('he_core_news_sm')

# # Process a sample text
# text = "גנן גדל דגן בגן שנמצא באפקה"
# doc = nlp(text)

# # Print tokens and their POS tags
# for token in doc:
#     print(token.text, token.lemma_, token.pos_)