In [2]:
# Explicit %pip magic: installs into the running kernel's environment and does
# not depend on IPython's automagic setting or on `pip` being an unshadowed name.
%pip install gensim nltk

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [4]:
import gensim
from gensim.utils import simple_preprocess
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk

# Fetch the NLTK data packages used below (same order as they are needed).
for _package in ('wordnet', 'stopwords', 'omw-1.4'):
    nltk.download(_package)

# Shared English stemmer and WordNet lemmatizer used by preprocess_text.
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

# English stopwords held in a set for fast membership tests.
stop_words = {word for word in stopwords.words('english')}

def preprocess_text(text):
    """Turn raw text into a list of normalized tokens.

    The text is tokenized with Gensim's ``simple_preprocess`` (called with
    ``deacc=True``), tokens present in the module-level ``stop_words`` set are
    dropped, and every remaining token is stemmed with ``stemmer`` and then
    passed through ``lemmatizer``.

    Args:
        text: The raw input text.

    Returns:
        A list of processed tokens, in the order they appeared in ``text``.
    """
    normalized = []
    for word in simple_preprocess(text, deacc=True):
        # Stopword filtering is done on the raw token, before stemming.
        if word in stop_words:
            continue
        stem = stemmer.stem(word)
        # The lemmatizer is applied to the stem, not to the original token.
        normalized.append(lemmatizer.lemmatize(stem))
    return normalized

def read_file(file_path):
    """Return the entire contents of the UTF-8 text file at ``file_path``.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file's contents as a single string.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

def write_to_file(output_path, processed_data):
    """Write the tokens to ``output_path`` as one space-separated line.

    The file is opened in write mode with UTF-8 encoding, so any existing
    content is replaced.

    Args:
        output_path: Path of the file to write.
        processed_data: Iterable of string tokens to join with single spaces.
    """
    with open(output_path, mode='w', encoding='utf-8') as sink:
        joined = ' '.join(processed_data)
        sink.write(joined)

if __name__ == "__main__":
    # Path to the input text file
    input_file = "sample.txt"  # Replace with your file path
    output_file = "processed_sample.txt"

    try:
        # Read the raw text, normalize it into tokens, then save the result.
        raw_text = read_file(input_file)
        processed_text = preprocess_text(raw_text)
        write_to_file(output_file, processed_text)

        print(f"Processed text saved to {output_file}")
    except FileNotFoundError as e:
        # Report the path that actually failed to open: the error can come
        # from the write step (e.g. a missing output directory) as well as
        # from the read step. Fall back to input_file if no path is attached.
        print(f"Error: The file {e.filename or input_file} was not found.")
    except Exception as e:
        # Best-effort reporting: any other failure is printed, not re-raised.
        print(f"An error occurred: {e}")


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Vamshi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vamshi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Vamshi\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Processed text saved to processed_sample.txt
