In [14]:
# NLP: Tokenization & Stopword Removal using spaCy. spaCy's tokenizer is more advanced than NLTK's.
# While NLTK's word_tokenize simply splits on whitespace and punctuation, spaCy's hybrid (rule-based plus exception-based) approach splits "spaCy—it's" into ["spaCy", "—", "it", "'s"] (or similar), and correctly preserves URLs, emoticons, and any multi-word tokens you have configured.

import spacy
from spacy.lang.char_classes import HYPHENS
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex

nlp = English()  # blank English pipeline: tokenizer only, no trained components

# Goal: keep hyphenated words (e.g. "state-of-the-art") as a single token.
#
# An earlier version of this cell tried to drop the hyphen rule by comparing
# every default infix pattern against a hand-written regex string. That string
# is not one of spaCy's default patterns, so nothing was removed and the
# tokenizer behaved exactly like the default one.
#
# spaCy's default hyphen rule embeds HYPHENS (a "|"-joined list of dash-like
# strings) as a "(?:...)" group. Instead of guessing the full pattern text,
# find that group and rewrite it without the characters that should stay
# inside words. Every other alternative in the rule (for example the em dash)
# keeps splitting as before.
KEEP_INSIDE_WORDS = {"-", "\u2013", "~"}  # hyphen-minus, en dash, tilde
hyphen_group = f"(?:{HYPHENS})"
still_split_on = "|".join(h for h in HYPHENS.split("|") if h not in KEEP_INSIDE_WORDS)

infixes = [
    pattern.replace(hyphen_group, f"(?:{still_split_on})")
    for pattern in nlp.Defaults.infixes
]

# Fail loudly if the installed spaCy version formats its hyphen rule
# differently; otherwise the customization would silently do nothing again.
assert infixes != list(nlp.Defaults.infixes), (
    "hyphen infix rule not found in nlp.Defaults.infixes; "
    "tokenizer customization was not applied"
)

# Compile the adjusted patterns and install them on the pipeline's tokenizer.
infix_re = compile_infix_regex(infixes)
nlp.tokenizer.infix_finditer = infix_re.finditer

text = "I'm loving spaCy‚Äîit's awesome! Visit https://example.com."

doc = nlp(text)
print([token.text for token in doc])

['I', "'m", 'loving', 'spaCy', '‚Äî', 'it', "'s", 'awesome', '!', 'Visit', 'https://example.com', '.']


In [None]:
# When fewer tokens are preferred
# Typical cases: quick text classification, keyword search or indexing, simpler BoW/TF-IDF features.
# Higher speed compared to spaCy.
 
import nltk
from nltk.tokenize import word_tokenize
# word_tokenize needs the Punkt sentence-tokenizer data. Older NLTK releases
# load it from the "punkt" package; newer releases (3.9+) look for "punkt_tab"
# and raise LookupError when only "punkt" is installed. Requesting both keeps
# this cell runnable on a fresh environment with either version.
# NOTE(review): on an NLTK version whose index has no "punkt_tab",
# nltk.download is expected to log an error and return False rather than
# raise — confirm on the oldest version you support.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)

#  Sample text
text = "I'm loving spaCy‚Äîit's awesome! Visit https://example.com."

#  Tokenize using NLTK
tokens = word_tokenize(text)

#  Output
print("üîπ NLTK Tokens:")
print(tokens)


üîπ NLTK Tokens:
['I', "'m", 'loving', 'spaCy‚Äîit', "'s", 'awesome', '!', 'Visit', 'https', ':', '//example.com', '.']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sumit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
