In [1]:
import spacy

In [2]:
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     ---- ----------------------------------- 1.3/12.8 MB 5.1 MB/s eta 0:00:03
     -------- ------------------------------- 2.6/12.8 MB 5.5 MB/s eta 0:00:02
     ---------- ----------------------------- 3.4/12.8 MB 5.0 MB/s eta 0:00:02
     --------------- ------------------------ 5.0/12.8 MB 5.7 MB/s eta 0:00:02
     ------------------ --------------------- 5.8/12.8 MB 5.7 MB/s eta 0:00:02
     -------------------- ------------------- 6.6/12.8 MB 5.0 MB/s eta 0:00:02
     ----------------------- ---------------- 7.6/12.8 MB 5.1 MB/s eta 0:00:02
     -------------------------- ------------- 8.4/12.8 

In [3]:
nlp = spacy.load("en_core_web_sm")

In [4]:
text = "Natural Language Processing is very powerful!"

doc = nlp(text)
tokens = [token.text for token in doc]

print("Original Text:")
print(text)

print("spaCy Tokens:")
print(tokens)

Original Text:
Natural Language Processing is very powerful!
spaCy Tokens:
['Natural', 'Language', 'Processing', 'is', 'very', 'powerful', '!']


In [5]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


In [7]:
import nltk


In [8]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
text = "This is an example sentence to demonstrate stopwords removal in NLP"

tokens = word_tokenize(text)
stop_words = set(stopwords.words('english'))

filtered_tokens = [w for w in tokens if w.lower() not in stop_words]

print("Before:", tokens)
print("After:", filtered_tokens)


Before: ['This', 'is', 'an', 'example', 'sentence', 'to', 'demonstrate', 'stopwords', 'removal', 'in', 'NLP']
After: ['example', 'sentence', 'demonstrate', 'stopwords', 'removal', 'NLP']


In [10]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [11]:
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

words = ["studies", "running", "better", "children"]

print("Word | Stemmed | Lemmatized")
for word in words:
    print(word, "|", stemmer.stem(word), "|", lemmatizer.lemmatize(word))


Word | Stemmed | Lemmatized
studies | studi | study
running | run | running
better | better | better
children | children | child


In [13]:
import re
import emoji


In [14]:
text = "Wow!!! I love NLP üòçüî• #AI @OpenAI"

# Remove emojis
text_no_emoji = emoji.replace_emoji(text, replace='')

# Remove hashtags, mentions, punctuation
clean_text = re.sub(r'[@#]\w+|[^\w\s]', '', text_no_emoji)

print("Original Text:")
print(text)

print("\nCleaned Text:")
print(clean_text)


Original Text:
Wow!!! I love NLP üòçüî• #AI @OpenAI

Cleaned Text:
Wow I love NLP   


In [15]:
text = "   NLP    Is     AWESOME   "

normalized_text = re.sub(r'\s+', ' ', text.strip().lower())

print("Original Text:")
print(text)

print("\nNormalized Text:")
print(normalized_text)


Original Text:
   NLP    Is     AWESOME   

Normalized Text:
nlp is awesome


In [16]:
text = "Contact us at admin@gmail.com or support@yahoo.com. Call 9876543210"

# Extract emails
emails = re.findall(r'\S+@\S+', text)

# Remove numbers
text_no_numbers = re.sub(r'\d+', '', text)

# Replace multiple spaces
clean_text = re.sub(r'\s+', ' ', text_no_numbers)

print("Emails Found:")
print(emails)

print("\nCleaned Text:")
print(clean_text)


Emails Found:
['admin@gmail.com', 'support@yahoo.com.']

Cleaned Text:
Contact us at admin@gmail.com or support@yahoo.com. Call 
