<a href="https://colab.research.google.com/github/Teja3993/NLP_Lab/blob/main/NLP_Lab_Exercise_7_Stemming_using_various_Stemmers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# --- Experiment 7: Stemming using Various Stemmers ---

import nltk
from nltk.stem import PorterStemmer, LancasterStemmer, RegexpStemmer, SnowballStemmer

def run_stemming_experiment():
    """Print a comparison of four NLTK stemmers on small word lists.

    For each stemmer a heading is printed, the stemmer is constructed, and
    one ``word ---> stem`` line is printed per sample word. The function
    takes no arguments and returns nothing; all results go to stdout.
    """
    print("=== STEMMING ALGORITHMS COMPARISON ===\n")

    # Each entry is (heading, zero-argument stemmer factory, sample words).
    # The factory is only called after its heading has been printed, so the
    # stemmers are created one at a time, in display order.
    sections = (
        (
            "--- 1. Porter Stemmer (Standard) ---",
            PorterStemmer,
            # Dataset from PDF
            ['Connects', 'Connecting', 'Connections', 'Connected',
             'Connection', 'Connectings', 'Connect'],
        ),
        (
            "\n--- 2. Snowball Stemmer (Porter 2.0) ---",
            # Snowball must be told which language to stem.
            lambda: SnowballStemmer(language='english'),
            ['generous', 'generate', 'generously', 'generation'],
        ),
        (
            "\n--- 3. Lancaster Stemmer (Aggressive) ---",
            LancasterStemmer,
            ['eating', 'eats', 'eaten', 'puts', 'putting'],
        ),
        (
            "\n--- 4. Regular Expression Stemmer (Custom Rules) ---",
            # Strip a trailing 'ing', 's' or 'able' ($ anchors the match to
            # the end of the word). min=4 leaves words shorter than four
            # characters untouched.
            lambda: RegexpStemmer('ing$|s$|able$', min=4),
            ['mass', 'was', 'bee', 'computer', 'advisable'],
        ),
    )

    for heading, make_stemmer, samples in sections:
        print(heading)
        stemmer = make_stemmer()
        for token in samples:
            print(f"{token} ---> {stemmer.stem(token)}")

# Run the experiment: this call executes every stemmer section above and
# prints each word alongside its stem to stdout.
run_stemming_experiment()

=== STEMMING ALGORITHMS COMPARISON ===

--- 1. Porter Stemmer (Standard) ---
Connects ---> connect
Connecting ---> connect
Connections ---> connect
Connected ---> connect
Connection ---> connect
Connectings ---> connect
Connect ---> connect

--- 2. Snowball Stemmer (Porter 2.0) ---
generous ---> generous
generate ---> generat
generously ---> generous
generation ---> generat

--- 3. Lancaster Stemmer (Aggressive) ---
eating ---> eat
eats ---> eat
eaten ---> eat
puts ---> put
putting ---> put

--- 4. Regular Expression Stemmer (Custom Rules) ---
mass ---> mas
was ---> was
bee ---> bee
computer ---> computer
advisable ---> advis
