Perform all of the following preprocessing tasks after reading the text file.

Text Cleaning :
Convert text to lowercase.

Remove special characters and punctuation.
Remove extra white spaces.


Tokenization :
Split text into individual words or tokens.

Stop Words Removal:
Identify and remove common stop words (e.g., "and", "the", "is").
Create a custom list based on the context.

Stemming:
Reduce words to their root form (e.g., "running" to "run").
Use stemming algorithms like Porter or Snowball.

Normalization:

Expand contractions (e.g., "don't" to "do not").
Eliminate URLs, email addresses, and other non-alphabetic elements (special characters).
Remove or handle numerical values if not relevant.

In [8]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
#import contractions

# Fetch the NLTK resources this cell relies on: the tokenizer models
# ('punkt', 'punkt_tab'), the stop-word corpus and the WordNet data.
# The download order matches the original cell.
for _nltk_package in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_nltk_package)

def read_file(file_path, encoding='utf-8'):
    """Return the full text content of *file_path*, or ``None`` on failure.

    The file is decoded with an explicit encoding so the result does not
    depend on the platform's locale default.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file (default UTF-8).

    Returns:
        The file content as a string, or ``None`` if the file is missing or
        cannot be read/decoded. An error message is printed in that case;
        no exception is propagated to the caller.
    """
    try:
        with open(file_path, 'r', encoding=encoding) as file:
            return file.read()
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return None
    except Exception as e:
        # Deliberately broad: callers rely on a None return instead of
        # exceptions (permission errors, decode errors, bad paths, ...).
        print(f"Error reading file: {e}")
        return None

class TextPreprocessor:
    """Text-cleaning pipeline that prints the result of every step.

    Each public method performs one preprocessing step, prints a numbered
    heading followed by its result, and returns that result. The numbers in
    the headings identify the step, not the order in which ``process_text``
    runs them.
    """

    # Built-in contraction table, used when the optional ``contractions``
    # package is unavailable or fails. Keys are lowercase; matching is
    # case-insensitive.
    _FALLBACK_CONTRACTIONS = {
        "don't": "do not",
        "doesn't": "does not",
        "won't": "will not",
        "can't": "cannot",
        "isn't": "is not",
        "it's": "it is",
        "i'm": "I am",
        "aren't": "are not",
    }

    # Whole-word pattern for the table above. Word boundaries stop "it's"
    # from matching inside words such as "rabbit's"; both the straight and
    # the typographic apostrophe (U+2019) are accepted.
    _FALLBACK_PATTERN = re.compile(
        r"\b(?:"
        + "|".join(
            word.replace("'", "['\u2019]")
            for word in sorted(_FALLBACK_CONTRACTIONS, key=len, reverse=True)
        )
        + r")\b",
        re.IGNORECASE,
    )

    def __init__(self, custom_stop_words=None):
        """Set up the stemmer, lemmatizer and stop-word set.

        Args:
            custom_stop_words: Optional iterable of extra, context-specific
                stop words. Defaults to ``{'ai', 'artificial',
                'intelligence'}``. Entries are lowercased because stop-word
                matching is done on lowercased tokens.
        """
        self.stemmer = PorterStemmer()
        # Not used by the pipeline below; kept as an attribute for callers
        # that prefer lemmatization over stemming.
        self.lemmatizer = WordNetLemmatizer()
        if custom_stop_words is None:
            custom_stop_words = {'ai', 'artificial', 'intelligence'}
        self.custom_stop_words = {word.lower() for word in custom_stop_words}
        self.stop_words = set(stopwords.words('english')) | self.custom_stop_words

    def convert_lowercase(self, text: str) -> str:
        """Return *text* converted to lowercase."""
        print("\n1. Converting to lowercase:")
        result = text.lower()
        print(result)
        return result

    def remove_special_chars(self, text: str) -> str:
        """Replace every character that is not an ASCII letter, whitespace
        or an apostrophe with a single space."""
        print("\n2. Removing special characters and punctuation:")
        # Keep apostrophes for contractions handling
        result = re.sub(r'[^a-zA-Z\s\']', ' ', text)
        print(result)
        return result

    def remove_extra_whitespace(self, text: str) -> str:
        """Collapse runs of whitespace to one space and trim both ends."""
        print("\n3. Removing extra whitespace:")
        result = ' '.join(text.split())
        print(result)
        return result

    def tokenize_text(self, text: str) -> list:
        """Split *text* into tokens with NLTK's ``word_tokenize``."""
        print("\n4. Tokenizing text:")
        tokens = word_tokenize(text)
        print(tokens)
        return tokens

    def remove_stop_words(self, tokens: list) -> list:
        """Return the tokens whose lowercase form is not in ``self.stop_words``."""
        print("\n5. Removing stop words:")
        filtered_tokens = [word for word in tokens if word.lower() not in self.stop_words]
        print(filtered_tokens)
        return filtered_tokens

    def apply_stemming(self, tokens: list) -> list:
        """Return the stem of every token, produced by ``self.stemmer``."""
        print("\n6. Applying stemming:")
        stemmed_tokens = [self.stemmer.stem(word) for word in tokens]
        print(stemmed_tokens)
        return stemmed_tokens

    def expand_contractions(self, text: str) -> str:
        """Expand contractions such as "don't" -> "do not".

        Uses the optional third-party ``contractions`` package when it can
        be imported; otherwise (or if it raises) falls back to the small
        built-in table ``_FALLBACK_CONTRACTIONS``.
        """
        print("\n7. Handling contractions:")
        result = None
        try:
            # Optional dependency: imported lazily so the class works
            # without it.
            import contractions
        except ImportError:
            contractions = None

        if contractions is not None:
            try:
                result = contractions.fix(text)
            except Exception:
                # Best effort: any failure in the package falls through to
                # the built-in table below.
                result = None

        if result is None:
            print("Using fallback contraction handling...")

            def _expand(match):
                # Normalise case and apostrophe style to find the table key.
                key = match.group(0).lower().replace("\u2019", "'")
                return self._FALLBACK_CONTRACTIONS[key]

            result = self._FALLBACK_PATTERN.sub(_expand, text)
        print(result)
        return result

    def remove_urls_emails(self, text: str) -> str:
        """Strip URLs, e-mail addresses and ``@username`` mentions from *text*.

        Surrounding whitespace is left untouched, so removed items may leave
        double spaces behind.
        """
        print("\n8. Removing URLs and emails:")
        # Remove URLs. The dot after "www" is escaped so ordinary words that
        # merely start with "www" are not deleted.
        text = re.sub(r'http\S+|www\.\S+', '', text)
        # Remove email addresses
        text = re.sub(r'\S+@\S+', '', text)
        # Remove usernames (e.g., @user1)
        text = re.sub(r'@\w+', '', text)
        print(text)
        return text

    def remove_numbers(self, text: str) -> str:
        """Delete every run of digits from *text*."""
        print("\n9. Removing numbers:")
        result = re.sub(r'\d+', '', text)
        print(result)
        return result

    def process_text(self, text: str) -> str:
        """Run the full pipeline on *text* and return the cleaned string.

        Order of operations: lowercase -> remove URLs/e-mails/mentions ->
        expand contractions -> remove special characters -> remove numbers
        -> collapse whitespace -> tokenize -> remove stop words -> stem.
        URLs and contractions are handled before special-character removal
        because that step would otherwise break them apart.

        Returns:
            The stemmed tokens joined by single spaces.
        """
        print("Original Text:")
        print(text)
        print("\nProcessing Steps:")

        # Text cleaning and normalization
        text = self.convert_lowercase(text)
        text = self.remove_urls_emails(text)
        text = self.expand_contractions(text)
        text = self.remove_special_chars(text)
        text = self.remove_numbers(text)
        text = self.remove_extra_whitespace(text)

        # Tokenization
        tokens = self.tokenize_text(text)

        # Stop words removal
        filtered_tokens = self.remove_stop_words(tokens)

        # Stemming
        stemmed_tokens = self.apply_stemming(filtered_tokens)

        print("\nFinal Processed Text:")
        final_text = ' '.join(stemmed_tokens)
        print(final_text)
        return final_text

# Main execution block: upload a text file in Google Colab, read it and run
# the full preprocessing pipeline on its content.
if __name__ == "__main__":
    # Colab-only helper; this import fails outside Google Colab.
    from google.colab import files
    print("Please upload your input file...")
    uploaded = files.upload()

    # Take the first uploaded entry as the file to process
    # (presumably `uploaded` maps filenames to contents - confirm against
    # the google.colab documentation).
    filename = next(iter(uploaded))

    # Read the content of the uploaded file (None if reading failed)
    text = read_file(filename)

    # Skip processing when reading failed (None) or the file is empty ("")
    if text:
        # Initialize preprocessor
        preprocessor = TextPreprocessor()

        # Run every preprocessing step; each step prints its own result
        processed_text = preprocessor.process_text(text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Please upload your input file...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Saving Assignment 2.txt to Assignment 2 (4).txt
Original Text:
"Artificial Intelligence (AI) has become a cornerstone of modern technology, influencing various industries from healthcare to finance. With advancements in machine learning, AI systems can now analyze massive datasets to uncover hidden patterns and provide insights that drive decision making. It don't need human engagement For example, AI powered algorithms @user1 are frequently used in financial markets to predict stock prices and in healthcare to ~! $$$ assist in diagnosing diseases with 45 high accuracy. However, as AI technology continues to evolve, it doesn't raises important @user2 questions about data privacy, the potential for job displacement, and the ethical implications of automating complex processes. In addition to these concerns, AI's impact on daily life is profound, with applications ranging from personalized recommendations on streaming services to smart home devices that make our 95 lives more convenient.