<a href="https://colab.research.google.com/github/SfurtiR/Natural-Language-Processing/blob/main/Text_Preprocessing_in_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**1. Tokenization**

**Import Libraries**

In [3]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import spacy

**Sample Text**

In [None]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

text = "Natural Language Processing is amazing! It allows computers to understand human language."


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


**Tokenization (Splitting text into words and sentences)**

In [None]:
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [None]:
sentences = sent_tokenize(text)
print("Sentence Tokenization:", sentences)

# Word Tokenization
words = word_tokenize(text)
print("Word Tokenization:", words)


Sentence Tokenization: ['Natural Language Processing is amazing!', 'It allows computers to understand human language.']
Word Tokenization: ['Natural', 'Language', 'Processing', 'is', 'amazing', '!', 'It', 'allows', 'computers', 'to', 'understand', 'human', 'language', '.']


In [None]:
pos_tag(words)

[('Natural', 'JJ'),
 ('Language', 'NNP'),
 ('Processing', 'NNP'),
 ('is', 'VBZ'),
 ('amazing', 'JJ'),
 ('!', '.'),
 ('It', 'PRP'),
 ('allows', 'VBZ'),
 ('computers', 'NNS'),
 ('to', 'TO'),
 ('understand', 'VB'),
 ('human', 'JJ'),
 ('language', 'NN'),
 ('.', '.')]

**Removing Stopwords**

In [None]:
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word.lower() not in stop_words]
print("After Stopword Removal:", filtered_words)


After Stopword Removal: ['Natural', 'Language', 'Processing', 'amazing', '!', 'allows', 'computers', 'understand', 'human', 'language', '.']


**Lemmatization (Reducing words to their base form)**

In [None]:
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
print("After Lemmatization:", lemmatized_words)


After Lemmatization: ['Natural', 'Language', 'Processing', 'amazing', '!', 'allows', 'computer', 'understand', 'human', 'language', '.']


**Named Entity Recognition (NER) Using spaCy**




In [4]:
nlp = spacy.load("en_core_web_sm")


In [5]:
doc = nlp(text)
if doc.ents:
  for ent in doc.ents:
    print(ent.text+' - ' +str(ent.start_char) +' - '+ str(ent.end_char) +' - '+ent.label_+ ' - '+str(spacy.explain(ent.label_)) )
else: print("No named entities found.")


NLP - 0 - 3 - ORG - Companies, agencies, institutions, etc.
Artifical Intelligent - 15 - 36 - ORG - Companies, agencies, institutions, etc.


**Lowercasing**

In [None]:
text = "Natural Language Processing is AMAZING!"
lower_text = text.lower()
print(lower_text)


natural language processing is amazing!


**Removing Punctuation & Special Characters**

In [None]:
import string

text = "Hello, world! NLP is fun."
clean_text = text.translate(str.maketrans('', '', string.punctuation))
print(clean_text)


Hello world NLP is fun


**Stemming**

In [None]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
words = ["running", "flies", "easily", "fairly"]
stemmed_words = [stemmer.stem(word) for word in words]
print(stemmed_words)

['run', 'fli', 'easili', 'fairli']


**Part-of-Speech (POS) Tagging**

In [None]:
nltk.download('averaged_perceptron_tagger')

text = "John is playing football."
words = word_tokenize(text)
pos_tags = nltk.pos_tag(words)
print(pos_tags)

[('John', 'NNP'), ('is', 'VBZ'), ('playing', 'VBG'), ('football', 'NN'), ('.', '.')]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
pip install emoji

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/590.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.8/590.6 kB[0m [31m4.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1


**Handling Emojis & Emoticons**

In [None]:
import emoji

text = "I love NLP! 😊"
print(emoji.demojize(text))

I love NLP! :smiling_face_with_smiling_eyes:


**Expanding Contractions**

In [None]:
pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (118 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.3/118.3 kB[0m 

In [None]:
from contractions import fix

text = "I'm learning NLP. You can't stop me!"
expanded_text = fix(text)
print(expanded_text)


I am learning NLP. You cannot stop me!


**Spelling Correction**

In [1]:
from textblob import TextBlob
text = "NLP is part of Artifical Intelligent."

corrected_text = TextBlob(text).correct()
print(corrected_text)


NLP is part of Artificial Intelligent.
