In [1]:
# Import necessary libraries
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Sample text
text = "Text preprocessing is an important step in natural language processing. It involves tokenization, filtration, script validation, stop word removal, and stemming."

# Step 1 - Tokenization: break the raw text into word and punctuation tokens.
tokens = word_tokenize(text)

# Step 2 - Filtration: delete every character that is not an ASCII letter
# (a-z, A-Z) or digit (0-9) from each token, then lowercase what remains.
# The substitution replaces each unwanted character with the empty string '',
# so a token made only of punctuation collapses to '' here and is discarded
# by the script-validation step below.
non_alphanumeric = re.compile(r'[^a-zA-Z0-9]')
filtered_tokens = [non_alphanumeric.sub('', raw_token).lower() for raw_token in tokens]

# Step 3 - Script validation: keep a token only if it consists entirely of
# Latin letters. Empty strings and tokens containing digits do not match.
latin_word = re.compile('^[a-zA-Z]+$')
latin_tokens = list(filter(latin_word.match, filtered_tokens))

# Step 4 - Stop word removal: drop common English words. The stop words are
# held in a set so each membership test is a constant-time lookup.
stop_words = set(stopwords.words('english'))
filtered_tokens_no_stop = [word for word in latin_tokens if word not in stop_words]

# Step 5 - Stemming: reduce each remaining word to its stem with the Porter stemmer.
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens_no_stop))

# Show the output of every stage, one heading and one value per line.
print(
    "Original Text:", text,
    "\nTokenization:", tokens,
    "\nFiltration:", filtered_tokens,
    "\nScript Validation:", latin_tokens,
    "\nStop Word Removal:", filtered_tokens_no_stop,
    "\nStemming:", stemmed_tokens,
    sep="\n",
)


Original Text:
Text preprocessing is an important step in natural language processing. It involves tokenization, filtration, script validation, stop word removal, and stemming.

Tokenization:
['Text', 'preprocessing', 'is', 'an', 'important', 'step', 'in', 'natural', 'language', 'processing', '.', 'It', 'involves', 'tokenization', ',', 'filtration', ',', 'script', 'validation', ',', 'stop', 'word', 'removal', ',', 'and', 'stemming', '.']

Filtration:
['text', 'preprocessing', 'is', 'an', 'important', 'step', 'in', 'natural', 'language', 'processing', '', 'it', 'involves', 'tokenization', '', 'filtration', '', 'script', 'validation', '', 'stop', 'word', 'removal', '', 'and', 'stemming', '']

Script Validation:
['text', 'preprocessing', 'is', 'an', 'important', 'step', 'in', 'natural', 'language', 'processing', 'it', 'involves', 'tokenization', 'filtration', 'script', 'validation', 'stop', 'word', 'removal', 'and', 'stemming']

Stop Word Removal:
['text', 'preprocessing', 'important', 's

In [13]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# text = ' Veritatis obcaecati tenetur iure eius earum ut molestias architecto voluptate aliquam nihil, eveniet aliquid culpa officia aut! Impedit sit sunt quaerat, odit, tenetur error, harum nesciunt ipsum debitis quas aliquid.'

text = 'This is assumed to be a list of tokens that have been previously processed and cleaned, likely by removing non-alphanumeric characters and converting the tokens to lowercase.'

# Tokenization: split the text into word and punctuation tokens.
tokens = word_tokenize(text)

# Filtration: strip everything except ASCII letters/digits and lowercase the
# rest. Punctuation-only tokens become '' and are removed by script validation.
filtered_tokens = [re.sub(r'[^a-zA-Z0-9]', '', token).lower() for token in tokens]

# Script validation: keep only tokens made up entirely of Latin letters.
latin_characters = [token for token in filtered_tokens if re.match('^[a-zA-Z]+$', token)]

# Stop word removal: must consume the validated tokens (not `filtered_tokens`),
# otherwise the empty strings left behind by punctuation leak into the result.
# `stop_words` stays the plain list NLTK returns because the `print(stop_words)`
# cell displays it; a frozenset copy is used for fast membership tests.
stop_words = stopwords.words('english')
stop_word_set = frozenset(stop_words)
no_stop_words = [token for token in latin_characters if token not in stop_word_set]

# Stemming: must consume the stop-word-free tokens so the final output reflects
# every earlier stage instead of re-processing the raw filtered list.
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in no_stop_words]

print(f'Text: {text}\n')
print(f'Tokens - \n{tokens} \n')
print(f'Filtered tokens - \n{filtered_tokens}\n')
print(f'Script Validation - \n{latin_characters}\n')
print(f'Stop words removal - \n{no_stop_words}\n')
print(f'Stemmed tokens - \n{stemmed_tokens}\n')

Text: This is assumed to be a list of tokens that have been previously processed and cleaned, likely by removing non-alphanumeric characters and converting the tokens to lowercase.

Tokens - 
['This', 'is', 'assumed', 'to', 'be', 'a', 'list', 'of', 'tokens', 'that', 'have', 'been', 'previously', 'processed', 'and', 'cleaned', ',', 'likely', 'by', 'removing', 'non-alphanumeric', 'characters', 'and', 'converting', 'the', 'tokens', 'to', 'lowercase', '.'] 

Filtered tokens - 
['this', 'is', 'assumed', 'to', 'be', 'a', 'list', 'of', 'tokens', 'that', 'have', 'been', 'previously', 'processed', 'and', 'cleaned', '', 'likely', 'by', 'removing', 'nonalphanumeric', 'characters', 'and', 'converting', 'the', 'tokens', 'to', 'lowercase', '']

Script Validation - 
['this', 'is', 'assumed', 'to', 'be', 'a', 'list', 'of', 'tokens', 'that', 'have', 'been', 'previously', 'processed', 'and', 'cleaned', 'likely', 'by', 'removing', 'nonalphanumeric', 'characters', 'and', 'converting', 'the', 'tokens', 'to

In [11]:
# Inspect the English stop-word collection. NOTE: `stop_words` is not defined
# in this cell; one of the preprocessing cells above must have been run first.
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '