#### Understanding List Comprehension
- [word for word in tokens if word.lower() not in stop_words]
- This is called List Comprehension in Python, which is a concise way to create a new list by 
iterating over an existing iterable and applying a condition.

#### Example: 
new_list = [expression for item in iterable if condition]

In [2]:
# Without a list comprehension: walk the basket with an explicit loop and
# keep only the fruits whose name is longer than five characters.
basket = ["apple", "banana", "orange", "kiwi", "grape"]
long_fruits = []
for fruit in basket:
    if len(fruit) <= 5:
        continue  # short name, skip it
    long_fruits.append(fruit)

print(long_fruits)

['banana', 'orange']


In [3]:
# With List Comprehension: the loop, the filter and the append collapse
# into a single expression.
basket = ["apple", "banana", "orange", "kiwi", "grape"]
long_fruits = [
    name
    for name in basket
    if len(name) > 5
]
print(long_fruits)

['banana', 'orange']


In [4]:
# Another example: a comprehension without a condition transforms every item.
numbers = [1, 2, 3, 4, 5]
doubled = [2 * value for value in numbers]
print(doubled)
[2, 4, 6, 8, 10]

[2, 4, 6, 8, 10]


[2, 4, 6, 8, 10]

#### Removing Punctuation
Punctuation is often noise in text processing tasks, so we will remove it.

In [None]:
import string

# Remove punctuation
# A token is dropped when it consists solely of punctuation characters.
# Stripping punctuation from both ends leaves an empty string exactly in
# that case, so multi-character tokens such as "..." or "``" are removed too
# (a plain `word not in string.punctuation` substring test would keep them).
reviews_no_punct = [
    [word for word in tokens if word.strip(string.punctuation)]
    for tokens in filtered_tokens
]
print(reviews_no_punct)

#### Removing Special Characters: 
Special characters such as “#”, “@”, or emoji may need to be removed or handled separately, depending on their relevance to the task.

In [None]:
import re

# Removing special characters
def remove_special_chars(tokens):
    """Strip every non-alphanumeric character from each token.

    Tokens made up only of special characters become empty once stripped;
    those are dropped so the result never contains "" entries.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A list of the cleaned, non-empty tokens in their original order.
    """
    stripped = (re.sub(r'[^A-Za-z0-9]+', '', word) for word in tokens)
    return [word for word in stripped if word]
# Clean every token list in turn by mapping the helper over them.
reviews_cleaned = list(map(remove_special_chars, filtered_tokens))
print(reviews_cleaned)

In [None]:
from nltk.stem import PorterStemmer # Stemming algorithm
from nltk.tokenize import word_tokenize # For breaking text into tokens

# The Porter Stemmer is a popular algorithm for stemming in English.
stemmer = PorterStemmer()

# Sample Text
text = "I am loving the process of learning and understanding NLP concepts."

# Tokenize the Text
tokens = word_tokenize(text)
print("Tokens:", tokens)

# Stem every token by mapping the stemmer over the token list
stemmed_words = list(map(stemmer.stem, tokens))
print("Stemmed Words:", stemmed_words)

#### Wordnet:
- `wordnet` refers to WordNet, a large lexical database of English developed at Princeton University. It groups words into sets of synonyms called synsets and records the semantic relationships between those sets.

In [None]:
# Import the Required Libraries
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

# We need WordNet and Punkt for tokenization and lemmatization
import nltk  # the download calls below go through the `nltk` namespace

nltk.download('wordnet') # For lexical database
nltk.download('omw-1.4') # Optional for extended multilingual support
nltk.download('punkt') # For tokenization

In [None]:
# Initialize the Lemmatizer
# One WordNetLemmatizer instance is created and bound to `lemmatizer`.
lemmatizer = WordNetLemmatizer()

In [None]:
# Sample text: two short sentences used as input for tokenization
text = "The leaves on the tree were falling. She was running quickly but got tired."

# Break sentence into words
# `tokens` holds the result of word_tokenize and is printed for inspection.
tokens = word_tokenize(text)
print("Tokens:", tokens)

In [None]:
# Apply lemmatization to every token (no POS argument is passed)
lemmatized_words = list(map(lemmatizer.lemmatize, tokens))
print("Lemmatized Words:", lemmatized_words)

#### Lemmatization with POS Tagging:
- When paired with Part-of-Speech (POS) tagging, the lemmatizer gains context about how the word is used (noun, verb, adjective, etc.), resulting in more accurate transformations.

In [None]:
#Lemmatization (with POS tagging)

# Import necessary libraries
from nltk.corpus import wordnet # for WordNet-compatible POS tags
from nltk.tag import pos_tag # to assign POS tags to words
from nltk.stem import WordNetLemmatizer # from NLTK for lemmatization

# Function to map POS tags to WordNet tags
def get_wordnet_pos(word):
    """Map the POS tag of a single word to the matching WordNet POS constant.

    The word is tagged on its own with ``pos_tag`` and only the first letter
    of the resulting tag is used for the lookup.

    Args:
        word: The token to tag.

    Returns:
        ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB`` or ``wordnet.ADV``;
        ``wordnet.NOUN`` when the tag's first letter is not one of J/N/V/R.
    """
    tag = pos_tag([word])[0][1][0].upper()  # Get the POS tag's first letter

    # dict = {"key": value, "key": value, ...}
    tag_dict = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    return tag_dict.get(tag, wordnet.NOUN)  # Default to noun if tag is not in the dictionary

#### Lemmatization with POS tagging

In [None]:
# Sample text
text = "The leaves are falling quickly from the trees, and the children are happily playing."

# Tokenize the text
tokens = word_tokenize(text)

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize each token, passing the WordNet POS looked up for that token
lemmatized_tokens = [
    lemmatizer.lemmatize(token, get_wordnet_pos(token))
    for token in tokens
]

# Show the tokens before and after lemmatization
print("Original Tokens:", tokens)
print("Lemmatized Tokens:", lemmatized_tokens)

In [None]:
# Look up the WordNet synsets for "run"; `synonyms` holds the list that
# wordnet.synsets returns.
synonyms = wordnet.synsets("run")
# Print the definition of the first synset in that list.
print(synonyms[0].definition())

# Finds the synsets for the word "run" and displays the definition of the first one
a score in baseball made by a runner touching all four bases safely

# Import libraries
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, download
from nltk.corpus import wordnet

In [None]:
# Sample text
text = "The leaves are falling quickly from the trees, and the children are happily playing."

# Step 1: Tokenization
tokens = word_tokenize(text)
print("Tokens:", tokens)

# Step 2: Stop Word Removal (tokens are lower-cased only for the lookup)
stop_words = set(stopwords.words('english'))
filtered_tokens = [
    token
    for token in tokens
    if token.lower() not in stop_words
]
print("Filtered:", filtered_tokens)

# Step 3: Stemming
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))
print("Stemmed:", stemmed_tokens)

# Step 4: Lemmatization (without POS tagging)
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = list(map(lemmatizer.lemmatize, filtered_tokens))
print("Lemmatized (without POS):", lemmatized_tokens)

# Step 5: Lemmatization (with POS tagging)
def get_wordnet_pos(word):
    """Map the POS tag of a single word to the matching WordNet POS constant.

    The word is tagged on its own with ``pos_tag`` and only the first letter
    of the resulting tag is used for the lookup.

    Args:
        word: The token to tag.

    Returns:
        ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB`` or ``wordnet.ADV``;
        ``wordnet.NOUN`` when the tag's first letter is not one of J/N/V/R.
    """
    tag = pos_tag([word])[0][1][0].upper()  # Get the POS tag
    tag_dict = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    return tag_dict.get(tag, wordnet.NOUN)  # Default to noun

# Lemmatize each filtered token with the WordNet POS looked up for it
lemmatized_tokens_with_pos = [
    lemmatizer.lemmatize(token, get_wordnet_pos(token))
    for token in filtered_tokens
]
print("Lemmatized (with POS):", lemmatized_tokens_with_pos)