In [60]:
# import Libraries 

import nltk
import spacy
from nltk.corpus import stopwords
from nltk.util import ngrams

In [61]:
# Sample text

df=open("C:/Users/Pramoda A S/Desktop/AIML Documents/nlpEx.txt")
text=df.read()

print("Original Text:")
print(text)


Original Text:
ML model monitoring is the practice of tracking the performance of ML models in production to identify potential issues that can add negative business value. These practices help proactively monitor prediction quality issues, data relevance, model accuracy, and bias.

ML monitoring constitutes the subset of AI observability where it showcases a bigger picture with testing, validation, explainability, and exploring unforeseen failure modes.
The performance of ML models starts degrading over time. It can be due to data inconsistencies, skews, and drifts, making deployed models inaccurate and irrelevant. Appropriate ML monitoring helps identify precisely when the model performance started diminishing. Such proactive monitoring helps take required actions like retraining models or replacing models. It helps foster usersâ€™ trust in ML systems.



In [78]:
# 1. Tokenization (NLTK)

tokens = nltk.word_tokenize(text)
print("1. Tokens:")
print(tokens)


1. Tokens:
['ML', 'model', 'monitoring', 'is', 'the', 'practice', 'of', 'tracking', 'the', 'performance', 'of', 'ML', 'models', 'in', 'production', 'to', 'identify', 'potential', 'issues', 'that', 'can', 'add', 'negative', 'business', 'value', '.', 'These', 'practices', 'help', 'proactively', 'monitor', 'prediction', 'quality', 'issues', ',', 'data', 'relevance', ',', 'model', 'accuracy', ',', 'and', 'bias', '.', 'ML', 'monitoring', 'constitutes', 'the', 'subset', 'of', 'AI', 'observability', 'where', 'it', 'showcases', 'a', 'bigger', 'picture', 'with', 'testing', ',', 'validation', ',', 'explainability', ',', 'and', 'exploring', 'unforeseen', 'failure', 'modes', '.', 'The', 'performance', 'of', 'ML', 'models', 'starts', 'degrading', 'over', 'time', '.', 'It', 'can', 'be', 'due', 'to', 'data', 'inconsistencies', ',', 'skews', ',', 'and', 'drifts', ',', 'making', 'deployed', 'models', 'inaccurate', 'and', 'irrelevant', '.', 'Appropriate', 'ML', 'monitoring', 'helps', 'identify', 'precis

In [79]:
# 2. Stopword Removal

stop_words = set(stopwords.words("english"))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words and word.isalnum()]
print("2. After Stopword Removal:")
print(filtered_tokens)


2. After Stopword Removal:
['ML', 'model', 'monitoring', 'practice', 'tracking', 'performance', 'ML', 'models', 'production', 'identify', 'potential', 'issues', 'add', 'negative', 'business', 'value', 'practices', 'help', 'proactively', 'monitor', 'prediction', 'quality', 'issues', 'data', 'relevance', 'model', 'accuracy', 'bias', 'ML', 'monitoring', 'constitutes', 'subset', 'AI', 'observability', 'showcases', 'bigger', 'picture', 'testing', 'validation', 'explainability', 'exploring', 'unforeseen', 'failure', 'modes', 'performance', 'ML', 'models', 'starts', 'degrading', 'time', 'due', 'data', 'inconsistencies', 'skews', 'drifts', 'making', 'deployed', 'models', 'inaccurate', 'irrelevant', 'Appropriate', 'ML', 'monitoring', 'helps', 'identify', 'precisely', 'model', 'performance', 'started', 'diminishing', 'proactive', 'monitoring', 'helps', 'take', 'required', 'actions', 'like', 'retraining', 'models', 'replacing', 'models', 'helps', 'foster', 'trust', 'ML', 'systems']


In [80]:
# 3. POS Tagging (spaCy)

doc = nlp(text)
print("3. POS Tagging:")
for token in doc:
    print(f"{token.text:<15} {token.pos_:<10} {token.tag_}")


3. POS Tagging:
ML              PROPN      NNP
model           NOUN       NN
monitoring      NOUN       NN
is              AUX        VBZ
the             DET        DT
practice        NOUN       NN
of              ADP        IN
tracking        VERB       VBG
the             DET        DT
performance     NOUN       NN
of              ADP        IN
ML              PROPN      NNP
models          NOUN       NNS
in              ADP        IN
production      NOUN       NN
to              PART       TO
identify        VERB       VB
potential       ADJ        JJ
issues          NOUN       NNS
that            PRON       WDT
can             AUX        MD
add             VERB       VB
negative        ADJ        JJ
business        NOUN       NN
value           NOUN       NN
.               PUNCT      .
These           DET        DT
practices       NOUN       NNS
help            AUX        VBP
proactively     ADV        RB
monitor         VERB       VB
prediction      NOUN       NN
quality         

In [84]:
# 4. Named Entity Recognition (NER)

print("4. Named Entities:")
for ent in doc.ents:
    print(f"{ent.text:<20} -> {ent.label_}")


# ORG → Organization (companies, institutions, agencies, etc.)
# Example: Google, UNICEF, Microsoft

# GPE → Geo-Political Entity (countries, cities, states, regions)
# Example: India, New York, Asia

4. Named Entities:
ML                   -> ORG
ML                   -> ORG
ML                   -> ORG
AI                   -> GPE
ML                   -> ORG
ML                   -> ORG


In [82]:
#5. N-gram Generation (Bigrams & Trigrams)
bigrams = list(ngrams(filtered_tokens, 2))
trigrams = list(ngrams(filtered_tokens, 3))

In [83]:
print("5. Bigrams:\n")
print(bigrams)
print("\n\n5. Trigrams:\n")
print(trigrams)

5. Bigrams:

[('ML', 'model'), ('model', 'monitoring'), ('monitoring', 'practice'), ('practice', 'tracking'), ('tracking', 'performance'), ('performance', 'ML'), ('ML', 'models'), ('models', 'production'), ('production', 'identify'), ('identify', 'potential'), ('potential', 'issues'), ('issues', 'add'), ('add', 'negative'), ('negative', 'business'), ('business', 'value'), ('value', 'practices'), ('practices', 'help'), ('help', 'proactively'), ('proactively', 'monitor'), ('monitor', 'prediction'), ('prediction', 'quality'), ('quality', 'issues'), ('issues', 'data'), ('data', 'relevance'), ('relevance', 'model'), ('model', 'accuracy'), ('accuracy', 'bias'), ('bias', 'ML'), ('ML', 'monitoring'), ('monitoring', 'constitutes'), ('constitutes', 'subset'), ('subset', 'AI'), ('AI', 'observability'), ('observability', 'showcases'), ('showcases', 'bigger'), ('bigger', 'picture'), ('picture', 'testing'), ('testing', 'validation'), ('validation', 'explainability'), ('explainability', 'exploring'),

In [52]:
# NLP Demonstration Program
# Covers Tokenization, Stopword Removal, POS Tagging, NER, and N-grams

import nltk
import spacy
from nltk.corpus import stopwords
from nltk.util import ngrams

# Download required resources (only first time)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
spacy.cli.download("en_core_web_sm")

# Load Spacy model for NER
nlp = spacy.load("en_core_web_sm")

# Sample text
text = "Apple is looking at buying U.K. startup for $1 billion. Steve Jobs founded Apple in 1976."

print("Original Text:")
print(text)
print("-" * 50)

# 1. Tokenization
tokens = nltk.word_tokenize(text)
print("1. Tokenization:")
print(tokens)
print("-" * 50)

# 2. Stopword Removal
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words and word.isalpha()]
print("2. Stopword Removal:")
print(filtered_tokens)
print("-" * 50)

# 3. POS Tagging
pos_tags = nltk.pos_tag(filtered_tokens)
print("3. POS Tagging:")
print(pos_tags)
print("-" * 50)

# 4. Named Entity Recognition (NER) using Spacy
doc = nlp(text)
print("4. Named Entity Recognition (NER):")
for ent in doc.ents:
    print(ent.text, "->", ent.label_)
print("-" * 50)

# 5. N-grams Generation (Bigrams and Trigrams)
print("5. N-grams:")
bigrams = list(ngrams(filtered_tokens, 2))
trigrams = list(ngrams(filtered_tokens, 3))
print("Bigrams:", bigrams)
print("Trigrams:", trigrams)
print("-" * 50)

[nltk_data] Downloading package punkt to C:\Users\Pramoda A
[nltk_data]     S\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Pramoda A
[nltk_data]     S\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Pramoda A S\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Original Text:
Apple is looking at buying U.K. startup for $1 billion. Steve Jobs founded Apple in 1976.
--------------------------------------------------
1. Tokenization:
['Apple', 'is', 'looking', 'at', 'buying', 'U.K.', 'startup', 'for', '$', '1', 'billion', '.', 'Steve', 'Jobs', 'founded', 'Apple', 'in', '1976', '.']
--------------------------------------------------
2. Stopword Removal:
['Apple', 'looking', 'buying', 'startup', 'billion', 'Steve', 'Jobs', 'founded', 'Apple']
--------------------------------------------------
3. POS Tagging:
[('Apple', 'NNP'), ('looking', 'VBG'), ('buying', 'VBG'), ('startup', 'NN'), ('billio

In [53]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
import spacy

# Load the spaCy English model
nlp = spacy.load('en_core_web_sm')

# Sample text
text = "Apple is looking to buy U.K. startup for $1 billion."

# Tokenization
tokens = word_tokenize(text)
print("Tokens:", tokens)

# Stopword removal
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
print("Filtered Tokens:", filtered_tokens)

# POS Tagging
pos_tags = nltk.pos_tag(tokens)
print("POS Tags:", pos_tags)

# NER
doc = nlp(text)
print("Named Entities:")
for ent in doc.ents:
    print(ent.text, ent.label_)

# N-grams generation
bigrams = list(ngrams(tokens, 2))
trigrams = list(ngrams(tokens, 3))
print("Bigrams:", bigrams)
print("Trigrams:", trigrams)

Tokens: ['Apple', 'is', 'looking', 'to', 'buy', 'U.K.', 'startup', 'for', '$', '1', 'billion', '.']
Filtered Tokens: ['Apple', 'looking', 'buy', 'U.K.', 'startup', '$', '1', 'billion', '.']
POS Tags: [('Apple', 'NNP'), ('is', 'VBZ'), ('looking', 'VBG'), ('to', 'TO'), ('buy', 'VB'), ('U.K.', 'NNP'), ('startup', 'NN'), ('for', 'IN'), ('$', '$'), ('1', 'CD'), ('billion', 'CD'), ('.', '.')]
Named Entities:
Apple ORG
U.K. GPE
$1 billion MONEY
Bigrams: [('Apple', 'is'), ('is', 'looking'), ('looking', 'to'), ('to', 'buy'), ('buy', 'U.K.'), ('U.K.', 'startup'), ('startup', 'for'), ('for', '$'), ('$', '1'), ('1', 'billion'), ('billion', '.')]
Trigrams: [('Apple', 'is', 'looking'), ('is', 'looking', 'to'), ('looking', 'to', 'buy'), ('to', 'buy', 'U.K.'), ('buy', 'U.K.', 'startup'), ('U.K.', 'startup', 'for'), ('startup', 'for', '$'), ('for', '$', '1'), ('$', '1', 'billion'), ('1', 'billion', '.')]
