In [1]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

NLTK is a powerful library for natural language processing (NLP) in Python. It provides a wide range of tools and resources for tasks such as tokenization, stemming, lemmatization, and part-of-speech tagging. NLTK is widely used in the field of NLP and is available for free.

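To use NLTK, you need to install it first, for example by running `pip install nltk` in your terminal or command prompt. NLTK also needs a few data packages (the tokenizer models, the stop-word list, and WordNet), which the next cell downloads: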

In [None]:
# Download necessary NLTK data (only the packages that are missing).
# NOTE: nltk.data.path is the list of directories NLTK searches for data; it is
# not an "is the data installed" flag, so guarding the downloads with
# `if not nltk.data.path` would skip them. Each resource is probed individually.
_NLTK_RESOURCES = {
    # download package id -> resource path looked up by nltk.data.find
    'punkt': 'tokenizers/punkt',
    'stopwords': 'corpora/stopwords',
    'wordnet': 'corpora/wordnet',
    'omw-1.4': 'corpora/omw-1.4',
}
for _package, _resource in _NLTK_RESOURCES.items():
    try:
        nltk.data.find(_resource)
    except LookupError:
        # Resource not present in any search directory: fetch it now.
        nltk.download(_package)

In [None]:
# Define the preprocessing function
def preprocess_text(text):
    """Lowercase, clean, tokenize, stem and lemmatize an English text.

    Args:
        text: The raw input. Anything that is falsy or not a ``str`` is
            treated as "no input".

    Returns:
        A 4-tuple ``(filtered_tokens, stemmed_words, lemmatized_words, text)``:

        * ``filtered_tokens`` -- tokens that are not English stop words.
          Non-alphabetic tokens are kept here.
        * ``stemmed_words`` -- Porter stem of every entry in
          ``filtered_tokens`` (same length, same order).
        * ``lemmatized_words`` -- WordNet lemma of the purely alphabetic
          entries of ``filtered_tokens`` only, so it may be shorter than the
          other two lists.
        * ``text`` -- the lowercased text with each run of non-ASCII
          characters replaced by a single space.

        For missing or non-string input the result is ``([], [], [], "")``.
    """
    # Guard clause: nothing to process for empty or non-string input.
    if not (text and isinstance(text, str)):
        return [], [], [], ""

    # Normalise case, then swap every run of non-ASCII characters for a space.
    text = re.sub(r'[^\x00-\x7F]+', ' ', text.lower())

    # Tokenize first, then drop English stop words.
    raw_tokens = word_tokenize(text)
    english_stops = set(stopwords.words('english'))
    filtered_tokens = []
    for token in raw_tokens:
        if token not in english_stops:
            filtered_tokens.append(token)

    # Stem every remaining token.
    porter = PorterStemmer()
    stemmed_words = [porter.stem(token) for token in filtered_tokens]

    # Lemmatize only the tokens made up entirely of letters.
    wordnet = WordNetLemmatizer()
    alphabetic_tokens = [token for token in filtered_tokens if token.isalpha()]
    lemmatized_words = [wordnet.lemmatize(token) for token in alphabetic_tokens]

    return filtered_tokens, stemmed_words, lemmatized_words, text
