In [None]:
!pip install gensim nltk spacy

In [None]:
import nltk
import gensim
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
from gensim.utils import simple_preprocess

# Fetch the NLTK data packages (tokenizer models, stopword lists, WordNet)
for _resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_resource)

# Load the small English spaCy pipeline. The model package has to be
# installed already (e.g. `python -m spacy download en_core_web_sm`);
# installing the `spacy` library alone does not provide it.
nlp = spacy.load("en_core_web_sm")

# Function to load and preprocess the text
def preprocess_text(file_path):
    """Load a text file and return its normalized tokens.

    The pipeline runs in this order: read and lowercase the file, tokenize
    with Gensim's ``simple_preprocess``, drop NLTK English stopwords, stem
    with NLTK's ``PorterStemmer``, then lemmatize the stems with the
    module-level spaCy pipeline ``nlp``.

    Args:
        file_path: Path of the text file to process. The file is decoded
            as UTF-8.

    Returns:
        A list of lemma strings, one for each token spaCy produces from
        the space-joined stems.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Step 1: Load the text file. The encoding is explicit so the result
    # does not depend on the platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read().lower()  # Convert text to lowercase for consistency

    # Step 2: Tokenization using Gensim's simple_preprocess (it also removes punctuation)
    tokenized_text = simple_preprocess(text)

    # Step 3: Remove stopwords using NLTK (a set gives O(1) membership tests)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokenized_text if word not in stop_words]

    # Step 4: Stemming using NLTK's PorterStemmer
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

    # Step 5: Lemmatization using spaCy. The stems are re-joined into one
    # string because the spaCy pipeline takes raw text and tokenizes it itself.
    # NOTE(review): the lemmatizer sees stems, not the original words, and
    # stems are not always dictionary words -- confirm this ordering is intended.
    doc = nlp(" ".join(stemmed_tokens))
    lemmatized_tokens = [token.lemma_ for token in doc]

    return lemmatized_tokens

# Input file for the demo run
file_path = 'sample.txt'  # Replace with your file path

# Run the full preprocessing pipeline on that file
preprocessed_text = preprocess_text(file_path)

# Show the heading and the resulting token list on separate lines
print("Preprocessed Tokens:", preprocessed_text, sep="\n")
