# Write a program that pre-processes a text document, e.g. by removing stop words and applying stemming.

In [None]:
# Step 1: Import required libraries
import nltk  # NLTK (Natural Language Toolkit) library for text processing
from nltk.corpus import stopwords  # To get a list of stop words
from nltk.tokenize import word_tokenize  # To split text into individual words
from nltk.stem import PorterStemmer  # To reduce words to their root form

# Step 2: Download necessary NLTK resources
# word_tokenize() needs the Punkt tokenizer data and stopwords.words() needs the
# stop-word corpus. nltk.download() fetches a package into the local nltk_data
# directory and reports "already up-to-date" when nothing has to be fetched.
#
# - 'punkt':     the Punkt tokenizer models looked up by older NLTK releases.
# - 'punkt_tab': the Punkt data that newer NLTK releases look up instead; without
#                it word_tokenize() raises LookupError on a fresh installation.
# - 'stopwords': lists of common, low-value words (e.g. "the", "is", "in") that
#                are excluded so the analysis focuses on meaningful content.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

def preprocess_text(text):
    """
    Lowercase, tokenize, filter and stem a piece of text.

    The text is lowercased and split into tokens with NLTK's word_tokenize.
    A token is kept only if it is purely alphanumeric (which discards
    punctuation tokens) and is not an English stop word. Each kept token is
    then reduced to its stem with the Porter stemmer. The stop-word check
    runs on the unstemmed token.

    Parameters:
    - text (str): The input text to preprocess.

    Returns:
    - str: The stems of the kept tokens, in their original order, joined by
      single spaces. Empty string if no token survives the filter.
    """

    # Lowercasing first makes the stop-word comparison case-insensitive.
    lowered = text.lower()
    candidates = word_tokenize(lowered)

    porter = PorterStemmer()
    # stopwords.words() returns a list; a set gives constant-time membership tests.
    english_stops = set(stopwords.words("english"))

    stems = []
    for token in candidates:
        # Skip anything containing a non-alphanumeric character (punctuation etc.).
        if not token.isalnum():
            continue
        # Skip stop words such as "the", "is", "in".
        if token in english_stops:
            continue
        # e.g. "running" -> "run"
        stems.append(porter.stem(token))

    return ' '.join(stems)

# Step 4: Load the document from disk and run it through the pipeline
# UTF-8 is requested explicitly so decoding does not depend on the platform default.
with open("Text.txt", mode="r", encoding="utf-8") as file:
    text = file.read()  # whole file as a single string

# Remove stop words / punctuation and stem what remains
processed_text = preprocess_text(text)

# Show the result
print("Processed Text:", processed_text)


Processed Text: artifici intellig ai transform industri enabl new advanc healthcar financ transport ai help doctor make faster diagnos allow bank detect fraud even power car howev capabl rais ethic concern ai trust make critic decis prevent bia system machin learn branch ai allow comput learn data without explicit program technolog power tool like chatbot imag recognit softwar make digit interact natur despit benefit machin learn face challeng black box issu process becom hard explain ai autom advanc concern job displac aris role may autom ai also creat new career path data scienc technolog stay relev individu need focu skill ai easili replic creativ critic think


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
