## Tokenization

In [1]:
!pip install nltk spacy

Defaulting to user installation because normal site-packages is not writeable


In [2]:
!python -m spacy download en_core_web_sm

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     -- ------------------------------------- 0.8/12.8 MB 3.7 MB/s eta 0:00:04
     ---- ----------------------------------- 1.6/12.8 MB 3.2 MB/s eta 0:00:04
     ------- -------------------------------- 2.4/12.8 MB 3.3 MB/s eta 0:00:04
     ---------- ----------------------------- 3.4/12.8 MB 3.9 MB/s eta 0:00:03
     ------------- -------------------------- 4.2/12.8 MB 3.7 MB/s eta 0:00:03
     --------------- ------------------------ 5.0/12.8 MB 3.7 MB/s eta 0:00:03
     ------------------- -------------------- 6.3/12.8 MB 4.1 MB/s eta 0:00:02
     ---------------------- ----------------- 7.3

In [None]:
# Install spaCy by invoking pip as a module of the interpreter running this
# kernel (sys.executable), so the package lands in the kernel's environment.
import sys
!{sys.executable} -m pip install spacy


In [None]:
# Fetch the en_core_web_sm pipeline using the interpreter running this kernel.
# Relies on `sys` having been imported by an earlier cell.
!{sys.executable} -m spacy download en_core_web_sm


In [None]:
# Smoke test: load the en_core_web_sm pipeline and report success.
# The message is only printed if spacy.load() returns without raising.
import spacy
nlp = spacy.load("en_core_web_sm")
print("Model loaded successfully!")


In [5]:
import nltk
from nltk.tokenize import word_tokenize
import spacy

In [6]:
# word_tokenize needs the Punkt sentence-tokenizer data. NLTK 3.9+ looks up
# the 'punkt_tab' resource instead of the older pickled 'punkt' package, so
# fetch both to keep the notebook runnable whichever NLTK version pip installs.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
# Sample input shared by the NLTK and spaCy tokenizer cells below; it contains
# an exclamation mark, a possessive ('s) and a contraction (Let's).
text = "Hello! This is Sanika's NLP assignment. Let's compare tokenizers."

In [12]:
# Tokenize the sample text with NLTK's word_tokenize and show the result
# under a label (label and token list are printed on separate lines).
nltk_tokens = word_tokenize(text)
print("NLTK Tokens:", nltk_tokens, sep="\n")

NLTK Tokens:
['Hello', '!', 'This', 'is', 'Sanika', "'s", 'NLP', 'assignment', '.']


In [13]:
# spaCy Tokenizer
# Load the en_core_web_sm pipeline and run it over the sample text; the
# resulting `doc` is iterated token by token in the next cell.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

In [14]:
# Collect the text of each token in `doc`, then print it under a label
# (the leading newline separates it visually from earlier output).
spacy_tokens = [tok.text for tok in doc]
print("\nspaCy Tokens:", spacy_tokens, sep="\n")



spaCy Tokens:
['Hello', '!', 'This', 'is', 'Sanika', "'s", 'NLP', 'assignment', '.']


## Stopwords Removal

In [15]:
from nltk.corpus import stopwords
nltk.download('stopwords')

# Sample sentence and its tokens.
text = "This is a simple example to demonstrate stopwords removal in NLP."
tokens = word_tokenize(text)

# English stopword list as a set for fast membership checks.
stop_words = set(stopwords.words('english'))

# Keep a token only if its lowercased form is not a stopword, so capitalized
# words such as "This" are filtered as well. Punctuation tokens are kept.
filtered_text = list(filter(lambda tok: tok.lower() not in stop_words, tokens))

print("Before:", tokens)
print("After:", filtered_text)


Before: ['This', 'is', 'a', 'simple', 'example', 'to', 'demonstrate', 'stopwords', 'removal', 'in', 'NLP', '.']
After: ['simple', 'example', 'demonstrate', 'stopwords', 'removal', 'NLP', '.']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Lemmatization and Stemming

In [17]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('wordnet')

# Sample sentence and its tokens, shared by both normalizers below.
text = "The students are studying and writing studies."
tokens = word_tokenize(text)

# Stemming: apply the Porter stemmer to every token.
ps = PorterStemmer()
stemmed = list(map(ps.stem, tokens))

# Lemmatization: lemmatize() is called with the token only, so no
# part-of-speech is supplied and the lemmatizer's default POS applies
# (noun, per the NLTK docs).
lemmatizer = WordNetLemmatizer()
lemmatized = list(map(lemmatizer.lemmatize, tokens))

print("Original:", tokens)
print("Stemmed:", stemmed)
print("Lemmatized:", lemmatized)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Original: ['The', 'students', 'are', 'studying', 'and', 'writing', 'studies', '.']
Stemmed: ['the', 'student', 'are', 'studi', 'and', 'write', 'studi', '.']
Lemmatized: ['The', 'student', 'are', 'studying', 'and', 'writing', 'study', '.']


## Handling Punctuation, Special Characters

In [21]:
# Clean text with regex + the emoji library
import re
import emoji

text = "Hey @sanika!!! This is amazing  #NLP #AI 2026!!!"

print("Original:", text)

# Remove mentions & hashtags
text = re.sub(r'@\w+', '', text)
text = re.sub(r'#\w+', '', text)

# Remove emojis BEFORE stripping punctuation. The punctuation pattern below
# matches every character that is neither a word character nor whitespace,
# which includes emoji code points. Running it first would leave
# replace_emoji() with nothing to do, and would strip only the non-word parts
# of multi-code-point emoji (e.g. a keycap sequence would leave its digit).
text = emoji.replace_emoji(text, replace='')

# Remove punctuation (anything that is not a word character or whitespace)
text = re.sub(r'[^\w\s]', '', text)

print("Cleaned:", text)


Original: Hey @sanika!!! This is amazing  #NLP #AI 2026!!!
Cleaned: Hey  This is amazing    2026


In [20]:
!pip install emoji

Defaulting to user installation because normal site-packages is not writeable
Collecting emoji
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.15.0-py3-none-any.whl (608 kB)
   ---------------------------------------- 0.0/608.4 kB ? eta -:--:--
   ---------------------------------------- 0.0/608.4 kB ? eta -:--:--
   ----------------- ---------------------- 262.1/608.4 kB ? eta -:--:--
   ---------------------------------------- 608.4/608.4 kB 1.9 MB/s  0:00:00
Installing collected packages: emoji
Successfully installed emoji-2.15.0


## Lowercasing and Normalization

In [22]:
# Input with mixed casing plus leading, trailing and repeated whitespace.
text = "   This   Is   MIXED Case   TEXT.   "

# Lowercase, squeeze every whitespace run to a single space, then trim both
# ends.
text = re.sub(r'\s+', ' ', text.lower()).strip()

print(text)


this is mixed case text.


## Regex for Text Cleaning

In [23]:
# Extract e-mail addresses from free text.
text = "Contact us at sanika@gmail.com or thynktech@company.co.in for details."

# local-part @ first-label followed by one or more ".label" groups.
# Requiring at least one label character after each dot keeps a
# sentence-ending period out of the match ("... a@b.com." -> "a@b.com").
EMAIL_PATTERN = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+')

emails = EMAIL_PATTERN.findall(text)

print("Emails Found:", emails)


Emails Found: ['sanika@gmail.com', 'thynktech@company.co.in']


## Remove Numbers 

In [24]:
# Sentence containing two digit runs.
text = "AI 2026 will change 500 industries."

# Delete every run of digits; the spaces around each number are left in place.
no_numbers = re.compile(r'\d+').sub('', text)
print(no_numbers)


AI  will change  industries.


## Replace Multiple Spaces

In [28]:
# Sentence with irregular runs of spaces between words.
text = "This    is    too     many   spaces."

# Replace each run of whitespace with a single space.
cleaned = re.compile(r'\s+').sub(' ', text)
print(cleaned)


This is too many spaces.
