# Case Folding

## Using Python's `.lower()` Method

In [1]:
text = "The Stock Market is Rising, and Investors Are Excited!"

# str.lower() returns a new, fully lowercased string; `text` itself is unchanged.
lower_text = text.lower()
print(lower_text)  # Output: "the stock market is rising, and investors are excited!"


the stock market is rising, and investors are excited!


# Stop-word Removal

## Using NLTK

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used by the cells below (already-present packages
# are reported as up to date rather than downloaded again):
#   - 'stopwords': the word lists read through `stopwords.words('english')`
#   - 'punkt':     Punkt tokenizer models used by `word_tokenize` on older NLTK
#   - 'punkt_tab': the same models in the format that current NLTK releases
#                  load; without it `word_tokenize` raises LookupError there
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vitali\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Vitali\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
text = "The stock market is going up significantly in recent days."

# Tokenization
tokens = word_tokenize(text)

# Load the stop-word list once and keep it as a set: calling
# `stopwords.words('english')` inside the comprehension would re-read the list
# and scan it linearly for every single token.
english_stop_words = set(stopwords.words('english'))

# Remove stop words. Tokens are lowercased only for the comparison, so the
# surviving tokens keep their original casing. Punctuation is not a stop word,
# which is why the trailing '.' stays in the result.
filtered_words = [word for word in tokens if word.lower() not in english_stop_words]

print(filtered_words)
# Output: ['stock', 'market', 'going', 'significantly', 'recent', 'days', '.']

['stock', 'market', 'going', 'significantly', 'recent', 'days', '.']


## Using SpaCy

In [4]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---- ----------------------------------- 1.3/12.8 MB 6.8 MB/s eta 0:00:02
     --------- ------------------------------ 2.9/12.8 MB 6.8 MB/s eta 0:00:02
     ------------- -------------------------- 4.2/12.8 MB 6.8 MB/s eta 0:00:02
     ------------------ --------------------- 5.8/12.8 MB 6.8 MB/s eta 0:00:02
     ---------------------- ----------------- 7.1/12.8 MB 6.7 MB/s eta 0:00:01
     -------------------------- ------------- 8.4/12.8 MB 6.8 MB/s eta 0:00:01
     ------------------------------- -------- 10.0/12.8 MB 6.8 MB/s eta 0:00:01
     ------------------------------------ --- 11.5/12.8 MB 6.7 MB/s eta 0:00:01
     ---------------------------------------  12.6/12.8 MB 6.7 MB/s eta 0:00:01
     ---------------------------------

In [5]:
import spacy

In [6]:
# Load the small English pipeline and run it over the example sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp("The stock market is going up significantly in recent days.")

# Remove stop words: keep the text of every token spaCy does not flag as a
# stop word. Punctuation tokens are not flagged, so the final '.' is kept.
filtered_words = [token.text for token in doc if not token.is_stop]

print(filtered_words)
# Output: ['stock', 'market', 'going', 'significantly', 'recent', 'days', '.']

['stock', 'market', 'going', 'significantly', 'recent', 'days', '.']


# Punctuation Handling

## Removing Punctuation (Using Regex)

In [7]:
import re

# Matches every character that is neither a word character nor whitespace.
# `\w` also matches "_", so the underscore is listed explicitly; otherwise it
# would be the one punctuation character left in the text.
PUNCTUATION_PATTERN = re.compile(r'[^\w\s]|_')


def remove_punctuation(raw_text):
    """Return `raw_text` with all punctuation characters deleted.

    Letters, digits and whitespace are left exactly as they are, so the
    spacing of the input is preserved.
    """
    return PUNCTUATION_PATTERN.sub('', raw_text)


text = "Stock prices are rising! Investors are excited."

# Remove punctuation
clean_text = remove_punctuation(text)

print(clean_text)
# Output: "Stock prices are rising Investors are excited"

Stock prices are rising Investors are excited


## Removing Punctuation (Using NLTK)

In [8]:
from nltk.tokenize import word_tokenize
import string

text = "Stock prices are rising! Investors are excited."

# Tokenize and remove punctuation
tokens = word_tokenize(text)

# `string.punctuation` is a plain str, so `word not in string.punctuation`
# would be a substring test: multi-character tokens such as '...' or '--' are
# not substrings of it and would slip through. Compare character by character
# against a set instead, dropping every token made up solely of punctuation.
punctuation_chars = set(string.punctuation)
filtered_tokens = [
    word for word in tokens
    if not all(char in punctuation_chars for char in word)
]

print(filtered_tokens)
# Output: ['Stock', 'prices', 'are', 'rising', 'Investors', 'are', 'excited']

['Stock', 'prices', 'are', 'rising', 'Investors', 'are', 'excited']


## Keeping Punctuation for Sentiment Analysis

In [9]:
text = "This is amazing!!!"

# Keeping exclamation marks: retain purely alphanumeric tokens and tokens that
# consist only of punctuation. The per-character check (rather than a substring
# test against the `string.punctuation` str) also keeps multi-character
# punctuation tokens such as '...'.
punctuation_chars = set(string.punctuation)
tokens = [
    word for word in word_tokenize(text)
    if all(char in punctuation_chars for char in word) or word.isalnum()
]

print(tokens)
# word_tokenize splits the run of exclamation marks into one token per mark.
# Output: ['This', 'is', 'amazing', '!', '!', '!']


['This', 'is', 'amazing', '!', '!', '!']
