## Text Preprocessing Pipeline Code.
### This notebook contains the code for complete text preprocessing.
#### Steps followed in preprocessing:
    1. Decoding: removing encoding tags such as <SUBJECT LINE> and <END>
    2. Lower casing
    3. Convert digits to words
    4. Remove punctuation and other special characters
    5. Spelling corrections
    6. Remove stop words
    7. Stemming
    8. Lemmatization

In [1]:
# Install the libraries
# !pip install nltk num2words autocorrect

In [2]:
import re
from num2words import num2words
import nltk 
nltk.download('punkt') 
nltk.download('stopwords') 
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from autocorrect import Speller

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rmali\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rmali\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rmali\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
def remove_encoding(text):
    """
    Strip angle-bracket encoding tags from ``text`` and normalise its whitespace.

    Every ``<...>`` span with at least one character between the brackets
    (for example <SUBJECT LINE>, <END> or <BODY TEXT>) is swapped for a single
    space so that the words on either side of a tag stay separated. Runs of
    whitespace, including newlines, are then squeezed down to one space and
    both ends are trimmed, which leaves the result on a single line.

    Args:
        text (str): The input string that may contain encoding tags.

    Returns:
        str: The tag-free text on a single line.
    """
    tag_pattern = re.compile(r'<[^>]+>')
    whitespace_run = re.compile(r'\s+')

    # Tags become spaces first; the whitespace pass then absorbs any doubles
    # that this substitution leaves behind.
    without_tags = tag_pattern.sub(' ', text)
    return whitespace_run.sub(' ', without_tags).strip()

def digits_to_words(match):
    """
    Converts a matched numeric token to its corresponding words.
    Handles both ordinal and cardinal numbers.

    The matched text is lower-cased and every trailing ordinal suffix
    ('st', 'nd', 'rd', 'th') is stripped. If at least one suffix was present
    the number is spelled out as an ordinal, otherwise as a cardinal.
    Stripping *all* trailing suffixes (not just one) keeps a token such as
    "1stnd" from handing the non-numeric text "1st" to num2words.

    Args:
        match (re.Match): A regular expression match object whose full match
            (``match[0]``) is the numeric string, optionally followed by an
            ordinal suffix.

    Returns:
        str: The number converted to words, in either ordinal or cardinal form.

    Raises:
        Any exception num2words raises when the text left after removing the
        suffixes is not a number; it is propagated unchanged.
    """
    ordinal_suffixes = ('st', 'nd', 'rd', 'th')
    number = match[0].lower()

    # str.endswith accepts a tuple, so one test covers every suffix.
    is_ordinal = False
    while number.endswith(ordinal_suffixes):
        number = number[:-2]
        is_ordinal = True

    # `number_form` rather than `type`, so the builtin is not shadowed.
    number_form = 'ordinal' if is_ordinal else 'cardinal'
    return num2words(number, to=number_form)


def spelling_correction(text):
    """
    Run every whitespace-separated word of ``text`` through the spell checker.

    A fresh ``Speller`` is created for the call and applied to each word in
    turn; the corrected words are re-joined with single spaces, so the
    original spacing between words is not preserved.

    Args:
        text (str): The input text that may contain misspelled words.

    Returns:
        str: The corrected words joined by single spaces.
    """
    fix_word = Speller()
    corrected_words = map(fix_word, text.split())
    return " ".join(corrected_words)


def remove_stop_words(text):
    """
    Removes common English stop words from the input text.

    The text is split on whitespace and each word is looked up, lower-cased,
    in NLTK's English stop-word list. The lookup is therefore
    case-insensitive: "The" and "IS" are dropped just like "the" and "is".
    Words that are kept retain their original casing and are re-joined with
    single spaces.

    Args:
        text (str): The input text that may contain stop words.

    Returns:
        str: The text with stop words removed.
    """
    stop_words = frozenset(stopwords.words('english'))
    # Compare the lower-cased form so capitalised stop words are caught too.
    kept_words = [word for word in text.split() if word.lower() not in stop_words]
    return " ".join(kept_words)


def stemming(text):
    """
    Reduce each whitespace-separated word of ``text`` to its Porter stem.

    Args:
        text (str): The input text that may contain words to be stemmed.

    Returns:
        str: The stemmed words joined by single spaces.
    """
    porter = PorterStemmer()
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return " ".join(stems)


def lemmatizing(text):
    """
    Lemmatize each whitespace-separated word of ``text`` with WordNet.

    Each word is passed to ``WordNetLemmatizer.lemmatize`` with its default
    arguments, and the results are re-joined with single spaces.

    Args:
        text (str): The input text that may contain words to be lemmatized.

    Returns:
        str: The lemmatized words joined by single spaces.
    """
    to_lemma = WordNetLemmatizer().lemmatize
    return " ".join(map(to_lemma, text.split()))

In [4]:
def text_preprocessing(input_text, verbose=True):
    """
    Applies a series of text preprocessing steps to the input text, including encoding removal,
    case normalization, digit-to-word conversion, punctuation removal, spelling correction,
    stop word removal, stemming, and lemmatization.

    Args:
        input_text (str): The raw text that needs to be preprocessed.
        verbose (bool): When True (the default) the intermediate text is
            printed after every step. Pass False to run the pipeline silently,
            e.g. when processing many documents.

    Returns:
        str: The fully preprocessed text.
    """
    def _report(heading, text):
        # Intermediate output is a debugging aid; keep it optional.
        if verbose:
            print(heading, text)

    # Step 1: Decoding or removing encoding
    output = remove_encoding(input_text)
    _report("\nAfter decoding or removing encoding:\n    ", output)

    # Step 2: Lower casing
    output = output.lower()
    _report("\nAfter lower casing:\n    ", output)

    # Step 3: Convert digits to words
    # The regex matches consecutive digits followed by at most ONE optional
    # ordinal suffix. (Separate optional groups would also accept stacked
    # suffixes such as "1stnd", which cannot be converted to a number.)
    # The text is already lower case, so no IGNORECASE flag is needed.
    # num2words hyphenates compound numbers ("twenty-one"); the hyphen is
    # turned into a space here so that step 4 does not glue the two words
    # together into "twentyone".
    output = re.sub(
        r'\d+(?:st|nd|rd|th)?',
        lambda number: digits_to_words(number).replace('-', ' '),
        output,
    )
    _report("\nAfter converting digits to words\n    ", output)

    # Step 4: Remove punctuations and other special characters
    output = re.sub('[^ A-Za-z0-9]+', '', output)
    _report("\nAfter removing punctuations and other special characters\n    ", output)

    # Step 5: Spelling corrections
    output = spelling_correction(output)
    _report("\nAfter spelling corrections:\n    ", output)

    # Step 6: Remove stop words
    output = remove_stop_words(output)
    _report("\nAfter removing stop words:\n    ", output)

    # Step 7: Stemming
    output = stemming(output)
    _report("\nAfter stemming:\n    ", output)

    # Step 8: Lemmatizing
    output = lemmatizing(output)
    _report("\nAfter lemmatization:\n    ", output)

    return output

In [5]:
# Sample record used to demonstrate the pipeline. It contains encoding tags
# (<SUBJECT LINE>, <BODY TEXT>, <END>), a cardinal ("2") and two ordinals
# ("1st", "2nd"), punctuation, and the misspellings "pairoll" / "healtcare".
# The string is not a raw string, so the \n escape becomes a real newline.
raw_text = """
    "<SUBJECT LINE> Employees details<END><BODY TEXT>Attached are 2 files,\n1st one is pairoll, 2nd is healtcare!<END>"
    """
# Show the untouched input first so it can be compared with the final result.
print(f"Raw Input Text:\n {raw_text}")
print("\n\nPreprocessing............")
# text_preprocessing prints the intermediate text after each of its steps.
preprocessed_text = text_preprocessing(raw_text)
print(f"\n................................\nThis is the preprocessed text:\n  {preprocessed_text}")

Raw Input Text:
 
    "<SUBJECT LINE> Employees details<END><BODY TEXT>Attached are 2 files,
1st one is pairoll, 2nd is healtcare!<END>"
    


Preprocessing............

After decoding or removing encoding:
     " Employees details Attached are 2 files, 1st one is pairoll, 2nd is healtcare! "

After lower casing:
     " employees details attached are 2 files, 1st one is pairoll, 2nd is healtcare! "

After converting digits to words
     " employees details attached are two files, first one is pairoll, second is healtcare! "

After removing punctuations and other special characters
      employees details attached are two files first one is pairoll second is healtcare 

After spelling corrections:
     employees details attached are two files first one is payroll second is healthcare

After removing stop words:
     employees details attached two files first one payroll second healthcare

After stemming:
     employe detail attach two file first one payrol second healthcar

After lemma