In [None]:
############################################## Stemming and Lemmatization ######################################################

In [35]:
# Import necessary libraries from nltk for natural language processing.
# nltk.stem contains modules for stemming, and nltk.tokenize contains modules for tokenization.
# PorterStemmer is a specific stemming algorithm, and word_tokenize is a function for splitting text into words.
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Download the 'punkt' tokenizer models. This is required for word_tokenize to work.
# 'punkt' is a pre-trained model that helps in splitting text into sentences and words.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [36]:
# Initialize the PorterStemmer.
# This creates an instance of the stemmer object that will be used to perform stemming on words.
stemmer = PorterStemmer()

In [37]:
# Define a sample sentence to demonstrate tokenization and stemming.
# This sentence will be used as input for the subsequent NLP operations.
sentence= "The runner were running in a race and they ran very fast"

In [38]:
# Demonstrate stemming on a single word.
# The stemmer.stem() method reduces the word "history" to its root form "histori".
# This shows how the PorterStemmer algorithm works on individual words.
stemmer.stem("history")

'histori'

In [43]:
# Tokenize the sample sentence into individual words.
# word_tokenize from nltk.tokenize splits the sentence string into a list of words.
# This is a necessary step to process text word by word.
tokens= word_tokenize(sentence)
# Display the list of tokens.
tokens

['The',
 'runner',
 'were',
 'running',
 'in',
 'a',
 'race',
 'and',
 'they',
 'ran',
 'very',
 'fast']

In [44]:
# Apply stemming to each word in the tokenized list.
# This uses a list comprehension to iterate through the 'tokens' and apply the stemmer.stem() method.
stemmer_word = [stemmer.stem(word) for word in tokens]
# Print the list of stemmed words.
print(stemmer_word)

['the', 'runner', 'were', 'run', 'in', 'a', 'race', 'and', 'they', 'ran', 'veri', 'fast']


In [45]:
# Import necessary libraries for lemmatization.
# nltk.stem provides WordNetLemmatizer, and nltk.corpus provides WordNet data.
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
# Download the 'wordnet' corpus, which is needed for lemmatization.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [46]:
# Initialize the WordNetLemmatizer.
# This creates an instance of the lemmatizer object.
lemmatizer = WordNetLemmatizer()

In [47]:
# Apply lemmatization to each word in the tokens list, specifying the part of speech as verb ("v").
# This uses a list comprehension to iterate and apply lemmatizer.lemmatize() with the specified pos.
lemmatizer_word = [lemmatizer.lemmatize(word,pos="v") for word in tokens]

In [48]:
# Print the original tokens and the lemmatized words for comparison.
print(tokens)
print(lemmatizer_word)

['The', 'runner', 'were', 'running', 'in', 'a', 'race', 'and', 'they', 'ran', 'very', 'fast']
['The', 'runner', 'be', 'run', 'in', 'a', 'race', 'and', 'they', 'run', 'very', 'fast']


In [49]:
# Define a sample text containing HTML tags.
text = "<html><body><p> Movie 1</p><p> Actor - Aamir Khan</p><p> Click here to <a href='http://google.com'>download</a></p></body></html>"

# Define a function to remove HTML tags using regular expressions.
import re
def remove_html_tags(text):
    # Define a regex pattern to match HTML tags.
    pattern = re.compile('<.*?>')
    # Use the sub() method to replace the matched tags with an empty string.
    return pattern.sub(r'',text)

# Print the text after removing HTML tags.
print(remove_html_tags(text))

 Movie 1 Actor - Aamir Khan Click here to download


In [50]:
# Define a sample text containing a URL to demonstrate URL removal.
text = "<html><body><p> Movie 1</p><p> Actor - Aamir Khan</p><p> Click here to <a href='http://google.com'>download</a></p></body></html>"

# Define a function to remove URLs from text using regular expressions.
import re # Import the regular expression module.
def remove_url(text):
    # Define a regex pattern to match URLs (http, https, or www).
    pattern = re.compile(r'https?://\S+|www\.\S+')
    # Use sub() to replace matched URLs with an empty string.
    return pattern.sub(r'',text)

# Print the text after removing the URL.
print(remove_url(text))

<html><body><p> Movie 1</p><p> Actor - Aamir Khan</p><p> Click here to <a href='


In [40]:
# Define a dictionary of common chat words and their full expansions.
# This dictionary is used to convert informal chat abbreviations into their formal forms
# to improve text readability and consistency for natural language processing tasks.
chat_words = {
    'AFAIK':'As Far As I Know',
    'AFK':'Away From Keyboard',
    'ASAP':'As Soon As Possible',
    "FYI": "For Your Information",
    "ASAP": "As Soon As Possible",
    "BRB": "Be Right Back",
    "BTW": "By The Way",
    "OMG": "Oh My God",
    "IMO": "In My Opinion",
    "LOL": "Laugh Out Loud",
    "TTYL": "Talk To You Later",
    "GTG": "Got To Go",
    "TTYT": "Talk To You Tomorrow",
    "IDK": "I Don't Know",
    "TMI": "Too Much Information",
    "IMHO": "In My Humble Opinion",
    "ICYMI": "In Case You Missed It",
    "AFAIK": "As Far As I Know",
    "BTW": "By The Way",
    "FAQ": "Frequently Asked Questions",
    "TGIF": "Thank God It's Friday",
    "FYA": "For Your Action",
    "ICYMI": "In Case You Missed It",
}

In [41]:
# Define a function called chat_conversion that takes a text string as input.
# This function iterates through the words in the input text and replaces
# any chat abbreviations found in the chat_words dictionary with their full forms.
def chat_conversion(text):
    new_text = [] # Initialize an empty list to store the converted words.
    # Split the input text into individual words using spaces as delimiters
    # and iterate through each word.
    for w in text.split():
        # Convert the current word to uppercase and check if it exists as a key
        # in the chat_words dictionary.
        if w.upper() in chat_words:
            # If the uppercase word is found in the dictionary, append its corresponding
            # full form (value) to the new_text list.
            new_text.append(chat_words[w.upper()])
        else:
            # If the word is not found in the dictionary, append the original word
            # to the new_text list without any changes.
            new_text.append(w)
    # Join all the words in the new_text list back into a single string,
    # separated by spaces, and return the resulting string.
    return " ".join(new_text)

# Demonstrate the usage of the chat_conversion function with example inputs
# and print the converted text.
print(chat_conversion("ASAP"))
print(chat_conversion("FAQ"))

As Soon As Possible
Frequently Asked Questions


In [42]:
# Demonstrate the usage of the chat_conversion function with a sentence
# that includes a chat word ("ASAP") and print the converted sentence.
print(chat_conversion('Do this work ASAP'))

Do this work As Soon As Possible
