In [3]:
import gensim
from gensim.utils import simple_preprocess
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import spacy
import nltk

# Fetch the NLTK stopword corpus that the filter below relies on
nltk.download('stopwords')

# Shared, module-level resources used by preprocess_text:
#   stop_words - English stopwords as a set, for fast membership tests
#   stemmer    - Porter stemmer instance
#   nlp        - spaCy pipeline (small English model) used for lemmas
stop_words = {*stopwords.words('english')}
stemmer = PorterStemmer()
nlp = spacy.load("en_core_web_sm")

# Text preprocessing function
def preprocess_text(text):
    """Tokenize *text* and derive stemmed and lemmatized variants.

    The text is tokenized with gensim's ``simple_preprocess`` (accent
    stripping enabled) and tokens found in the module-level ``stop_words``
    set are discarded. The surviving tokens are then processed twice:
    stemmed one by one with the Porter stemmer, and re-joined with single
    spaces and run through the spaCy pipeline to collect lemmas.

    Returns a dict with three lists:
        "original_tokens"   - tokens left after stopword removal
        "stemmed_tokens"    - Porter stem of each remaining token
        "lemmatized_tokens" - lemma of each spaCy token whose ``is_alpha``
                              flag is set (so this list may differ in
                              length from the other two)
    """
    kept = []
    for candidate in simple_preprocess(text, deacc=True):
        if candidate in stop_words:
            continue
        kept.append(candidate)

    stems = list(map(stemmer.stem, kept))

    # spaCy works on a string, so rebuild one from the filtered tokens.
    parsed = nlp(" ".join(kept))
    lemmas = [tok.lemma_ for tok in parsed if tok.is_alpha]

    return dict(
        original_tokens=kept,
        stemmed_tokens=stems,
        lemmatized_tokens=lemmas,
    )

# Load sample text from "sample_text.txt" in the working directory; if the
# file does not exist, fall back to a built-in example sentence.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default (which can mangle or reject non-ASCII text).
try:
    with open("sample_text.txt", "r", encoding="utf-8") as file:
        text = file.read()
except FileNotFoundError:
    text = "Natural Language Processing enables machines to process human language effectively."

# Run the preprocessing pipeline on the loaded text
processed_data = preprocess_text(text)

# Print each token list under its heading; the leading "\n" on the later
# headings yields the blank line separating the sections.
sections = (
    ("Original Tokens:", "original_tokens"),
    ("\nStemmed Tokens:", "stemmed_tokens"),
    ("\nLemmatized Tokens:", "lemmatized_tokens"),
)
for heading, key in sections:
    print(heading)
    print(processed_data[key])


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Original Tokens:
['natural', 'language', 'processing', 'enables', 'machines', 'process', 'human', 'language', 'effectively']

Stemmed Tokens:
['natur', 'languag', 'process', 'enabl', 'machin', 'process', 'human', 'languag', 'effect']

Lemmatized Tokens:
['natural', 'language', 'processing', 'enable', 'machine', 'process', 'human', 'language', 'effectively']
