# Self Practice15 -Perform Text Preprocessing on your text​

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

In [2]:
# Download necessary NLTK data files
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\AbdulAziz\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AbdulAziz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\AbdulAziz\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\AbdulAziz\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\AbdulAziz\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\AbdulAziz\AppData\Roaming\nltk_data...
[nltk_data]   Pack

True

In [3]:
# Load spacy model for NER
nlp = spacy.load("en_core_web_sm")

In [4]:
# Sample text
text = """
Natural Language Processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and humans through natural language.
The ultimate objective of NLP is to read, decipher, understand, and make sense of human languages in a manner that is valuable.
"""

In [5]:
# Tokenization
tokens = word_tokenize(text)
tokens

['Natural',
 'Language',
 'Processing',
 '(',
 'NLP',
 ')',
 'is',
 'a',
 'field',
 'of',
 'artificial',
 'intelligence',
 'that',
 'focuses',
 'on',
 'the',
 'interaction',
 'between',
 'computers',
 'and',
 'humans',
 'through',
 'natural',
 'language',
 '.',
 'The',
 'ultimate',
 'objective',
 'of',
 'NLP',
 'is',
 'to',
 'read',
 ',',
 'decipher',
 ',',
 'understand',
 ',',
 'and',
 'make',
 'sense',
 'of',
 'human',
 'languages',
 'in',
 'a',
 'manner',
 'that',
 'is',
 'valuable',
 '.']

In [6]:
# Removing Stop Words
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
filtered_tokens

['Natural',
 'Language',
 'Processing',
 '(',
 'NLP',
 ')',
 'field',
 'artificial',
 'intelligence',
 'focuses',
 'interaction',
 'computers',
 'humans',
 'natural',
 'language',
 '.',
 'ultimate',
 'objective',
 'NLP',
 'read',
 ',',
 'decipher',
 ',',
 'understand',
 ',',
 'make',
 'sense',
 'human',
 'languages',
 'manner',
 'valuable',
 '.']

In [7]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
lemmatized_tokens

['Natural',
 'Language',
 'Processing',
 '(',
 'NLP',
 ')',
 'field',
 'artificial',
 'intelligence',
 'focus',
 'interaction',
 'computer',
 'human',
 'natural',
 'language',
 '.',
 'ultimate',
 'objective',
 'NLP',
 'read',
 ',',
 'decipher',
 ',',
 'understand',
 ',',
 'make',
 'sense',
 'human',
 'language',
 'manner',
 'valuable',
 '.']

In [8]:
# Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
stemmed_tokens

['natur',
 'languag',
 'process',
 '(',
 'nlp',
 ')',
 'field',
 'artifici',
 'intellig',
 'focus',
 'interact',
 'comput',
 'human',
 'natur',
 'languag',
 '.',
 'ultim',
 'object',
 'nlp',
 'read',
 ',',
 'deciph',
 ',',
 'understand',
 ',',
 'make',
 'sens',
 'human',
 'languag',
 'manner',
 'valuabl',
 '.']

In [9]:
# Parts of Speech Tagging
pos_tags = nltk.pos_tag(tokens)
pos_tags

[('Natural', 'JJ'),
 ('Language', 'NNP'),
 ('Processing', 'NNP'),
 ('(', '('),
 ('NLP', 'NNP'),
 (')', ')'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('field', 'NN'),
 ('of', 'IN'),
 ('artificial', 'JJ'),
 ('intelligence', 'NN'),
 ('that', 'WDT'),
 ('focuses', 'VBZ'),
 ('on', 'IN'),
 ('the', 'DT'),
 ('interaction', 'NN'),
 ('between', 'IN'),
 ('computers', 'NNS'),
 ('and', 'CC'),
 ('humans', 'NNS'),
 ('through', 'IN'),
 ('natural', 'JJ'),
 ('language', 'NN'),
 ('.', '.'),
 ('The', 'DT'),
 ('ultimate', 'JJ'),
 ('objective', 'NN'),
 ('of', 'IN'),
 ('NLP', 'NNP'),
 ('is', 'VBZ'),
 ('to', 'TO'),
 ('read', 'VB'),
 (',', ','),
 ('decipher', 'RB'),
 (',', ','),
 ('understand', 'NN'),
 (',', ','),
 ('and', 'CC'),
 ('make', 'VB'),
 ('sense', 'NN'),
 ('of', 'IN'),
 ('human', 'JJ'),
 ('languages', 'NNS'),
 ('in', 'IN'),
 ('a', 'DT'),
 ('manner', 'NN'),
 ('that', 'WDT'),
 ('is', 'VBZ'),
 ('valuable', 'JJ'),
 ('.', '.')]

In [10]:
# Named Entity Recognition
doc = nlp(text)
named_entities = [(ent.text, ent.label_) for ent in doc.ents]
named_entities

[('Natural Language Processing (NLP', 'WORK_OF_ART'), ('NLP', 'ORG')]

In [11]:
# Count Vectorizer
vectorizer = CountVectorizer()
vectorizer.fit_transform([text])
vocab = vectorizer.vocabulary_
vocab

{'natural': 17,
 'language': 13,
 'processing': 22,
 'nlp': 18,
 'is': 12,
 'field': 5,
 'of': 20,
 'artificial': 1,
 'intelligence': 10,
 'that': 25,
 'focuses': 6,
 'on': 21,
 'the': 26,
 'interaction': 11,
 'between': 2,
 'computers': 3,
 'and': 0,
 'humans': 8,
 'through': 27,
 'ultimate': 29,
 'objective': 19,
 'to': 28,
 'read': 23,
 'decipher': 4,
 'understand': 30,
 'make': 15,
 'sense': 24,
 'human': 7,
 'languages': 14,
 'in': 9,
 'manner': 16,
 'valuable': 31}