In [29]:
# to install nltk
!pip install nltk



In [30]:
# natural language processing toolkit
import nltk

In [31]:
# to access and use the pre-trained sentence tokenizer provided by NLTK in Python code to tokenize text into sentences.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

**Tokenization**

In [32]:
text = "A confusion matrix is a matrix that summarizes the performance of a machine learning model on a set of test data.\
       It is often used to measure the performance of classification models, which aim to predict a categorical label for each input instance.\
       The matrix displays the number of true positives (TP),\
       true negatives (TN), false positives (FP), and false negatives (FN) produced by the model on the test data."

# Word tokenization
words = nltk.word_tokenize(text)
print("Word Tokenization:", words)

# Sentence tokenization
sentences = nltk.sent_tokenize(text)
print("Sentence Tokenization:", sentences)

Word Tokenization: ['A', 'confusion', 'matrix', 'is', 'a', 'matrix', 'that', 'summarizes', 'the', 'performance', 'of', 'a', 'machine', 'learning', 'model', 'on', 'a', 'set', 'of', 'test', 'data', '.', 'It', 'is', 'often', 'used', 'to', 'measure', 'the', 'performance', 'of', 'classification', 'models', ',', 'which', 'aim', 'to', 'predict', 'a', 'categorical', 'label', 'for', 'each', 'input', 'instance', '.', 'The', 'matrix', 'displays', 'the', 'number', 'of', 'true', 'positives', '(', 'TP', ')', ',', 'true', 'negatives', '(', 'TN', ')', ',', 'false', 'positives', '(', 'FP', ')', ',', 'and', 'false', 'negatives', '(', 'FN', ')', 'produced', 'by', 'the', 'model', 'on', 'the', 'test', 'data', '.']
Sentence Tokenization: ['A confusion matrix is a matrix that summarizes the performance of a machine learning model on a set of test data.', 'It is often used to measure the performance of classification models, which aim to predict a categorical label for each input instance.', 'The matrix displ

In [33]:
another_text = "Confusion matrix is a very popular measure used while solving classification problems.\
               It can be applied to binary classification as well as for multiclass classification problems."

# Word tokenization
other_words = nltk.word_tokenize(another_text)
print("Word Tokenization:", other_words)

Word Tokenization: ['Confusion', 'matrix', 'is', 'a', 'very', 'popular', 'measure', 'used', 'while', 'solving', 'classification', 'problems', '.', 'It', 'can', 'be', 'applied', 'to', 'binary', 'classification', 'as', 'well', 'as', 'for', 'multiclass', 'classification', 'problems', '.']


**Segmentation**

In [34]:
# to determine a suitable segment length by considering the average length of words in the text
words = text.split()
total_word_length = sum(len(word) for word in words)
average_word_length = total_word_length / len(words)
print("Average Word Length:", average_word_length)

Average Word Length: 4.9


In [35]:
# Simple segmentation of a text into segments of fixed length
def segment_text(text, segment_length):
    segments = [text[i:i + segment_length] for i in range(0, len(text), segment_length)]
    return segments

segment_length = 24

segments = segment_text(text, segment_length)
print("Text Segmentation:", segments)


Text Segmentation: ['A confusion matrix is a ', 'matrix that summarizes t', 'he performance of a mach', 'ine learning model on a ', 'set of test data.       ', 'It is often used to meas', 'ure the performance of c', 'lassification models, wh', 'ich aim to predict a cat', 'egorical label for each ', 'input instance.       Th', 'e matrix displays the nu', 'mber of true positives (', 'TP),       true negative', 's (TN), false positives ', '(FP), and false negative', 's (FN) produced by the m', 'odel on the test data.']


In [36]:
# Other way
import re
def sentence_segmentation(text):
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    return sentences

# Sample corpus
my_text = "Confusion matrix is a very popular measure used while solving classification problems. It can be applied to binary classification as well as for multiclass classification problems."


# Perform sentence segmentation
segmented_sentences = sentence_segmentation(my_text)

# Print segmented sentences
for index, sentence in enumerate(segmented_sentences, 1):
    print(f"Sentence {index}: {sentence}")


Sentence 1: Confusion matrix is a very popular measure used while solving classification problems.
Sentence 2: It can be applied to binary classification as well as for multiclass classification problems.


**POS-Tagging**

In [37]:
# Required for parts of speech tagging
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [38]:
# Tokens
tokens =  ['A', 'confusion', 'matrix', 'is', 'a', 'matrix', 'that', 'summarizes',
          'the', 'performance', 'of', 'a', 'machine', 'learning', 'model', 'on', 'a', 'set', 'of', 'test',
          'data', '.', 'It', 'is', 'often', 'used', 'to', 'measure', 'the', 'performance', 'of', 'classification',
          'models', ',', 'which', 'aim', 'to', 'predict', 'a', 'categorical', 'label', 'for', 'each', 'input', 'instance', '.',
          'The', 'matrix', 'displays', 'the', 'number', 'of', 'true', 'positives', '(', 'TP', ')', ',', 'true', 'negatives', '(', 'TN', ')',
          ',', 'false', 'positives', '(', 'FP', ')', ',', 'and', 'false', 'negatives', '(', 'FN', ')', 'produced', 'by', 'the', 'model', 'on',
          'the', 'test', 'data', '.']

# Parts of speech tagging
tagged = nltk.pos_tag(tokens)

# Print tagged tokens/words
print("POS-Tagging : " , tagged)

POS-Tagging :  [('A', 'DT'), ('confusion', 'NN'), ('matrix', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('matrix', 'NN'), ('that', 'WDT'), ('summarizes', 'VBZ'), ('the', 'DT'), ('performance', 'NN'), ('of', 'IN'), ('a', 'DT'), ('machine', 'NN'), ('learning', 'VBG'), ('model', 'NN'), ('on', 'IN'), ('a', 'DT'), ('set', 'NN'), ('of', 'IN'), ('test', 'NN'), ('data', 'NNS'), ('.', '.'), ('It', 'PRP'), ('is', 'VBZ'), ('often', 'RB'), ('used', 'VBN'), ('to', 'TO'), ('measure', 'VB'), ('the', 'DT'), ('performance', 'NN'), ('of', 'IN'), ('classification', 'NN'), ('models', 'NNS'), (',', ','), ('which', 'WDT'), ('aim', 'VBP'), ('to', 'TO'), ('predict', 'VB'), ('a', 'DT'), ('categorical', 'JJ'), ('label', 'NN'), ('for', 'IN'), ('each', 'DT'), ('input', 'NN'), ('instance', 'NN'), ('.', '.'), ('The', 'DT'), ('matrix', 'NN'), ('displays', 'VBZ'), ('the', 'DT'), ('number', 'NN'), ('of', 'IN'), ('true', 'JJ'), ('positives', 'NNS'), ('(', '('), ('TP', 'NNP'), (')', ')'), (',', ','), ('true', 'JJ'), ('negatives

**Lemmatization**

In [39]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [40]:
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()
text = "My name is Evelyn. I am studying in pucit. I like to do programming. I have participated in many competitions."

tokens = nltk.word_tokenize(text)
# Print Lemmatized words/tokens
for token in tokens:
  print("Lemmatized_words : ", wordnet_lemmatizer.lemmatize(token , pos = "v"))

Lemmatized_words :  My
Lemmatized_words :  name
Lemmatized_words :  be
Lemmatized_words :  Evelyn
Lemmatized_words :  .
Lemmatized_words :  I
Lemmatized_words :  be
Lemmatized_words :  study
Lemmatized_words :  in
Lemmatized_words :  pucit
Lemmatized_words :  .
Lemmatized_words :  I
Lemmatized_words :  like
Lemmatized_words :  to
Lemmatized_words :  do
Lemmatized_words :  program
Lemmatized_words :  .
Lemmatized_words :  I
Lemmatized_words :  have
Lemmatized_words :  participate
Lemmatized_words :  in
Lemmatized_words :  many
Lemmatized_words :  competitions
Lemmatized_words :  .


In [41]:
wordnet_lemmatizer = WordNetLemmatizer()
tokens = ["playing", "programming", "jumped", "ate"]
# Print Lemmatized words/tokens
for token in tokens:
  print("Lemmatized_words : ", wordnet_lemmatizer.lemmatize(token , pos = "v"))

Lemmatized_words :  play
Lemmatized_words :  program
Lemmatized_words :  jump
Lemmatized_words :  eat


**Stemming**

In [42]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

# Sample text
text = "My name is Evelyn. I am studying in pucit. I like to do programming. I have participated in many competitions."

# Tokenize the text into words
tokens = nltk.word_tokenize(text)

for token in tokens:
  print("Stemmed_words : ", stemmer.stem(token))


Stemmed_words :  my
Stemmed_words :  name
Stemmed_words :  is
Stemmed_words :  evelyn
Stemmed_words :  .
Stemmed_words :  i
Stemmed_words :  am
Stemmed_words :  studi
Stemmed_words :  in
Stemmed_words :  pucit
Stemmed_words :  .
Stemmed_words :  i
Stemmed_words :  like
Stemmed_words :  to
Stemmed_words :  do
Stemmed_words :  program
Stemmed_words :  .
Stemmed_words :  i
Stemmed_words :  have
Stemmed_words :  particip
Stemmed_words :  in
Stemmed_words :  mani
Stemmed_words :  competit
Stemmed_words :  .


In [43]:
stemmer = PorterStemmer()
tokens = ["playing", "programming", "jumped", "ate"]
for token in tokens:
  print("Stemmed_words : ", stemmer.stem(token))


Stemmed_words :  play
Stemmed_words :  program
Stemmed_words :  jump
Stemmed_words :  ate


**Levenshtein distance**

In [44]:
!pip install Levenshtein



In [45]:
from Levenshtein import distance

# calculating Levenshtein distance for converting one list to another list
list1 = ['A', 'confusion', 'matrix', 'is', 'a', 'matrix', 'that', 'summarizes',
          'the', 'performance', 'of', 'a', 'machine', 'learning', 'model', 'on', 'a', 'set', 'of', 'test',
          'data', '.', 'It', 'is', 'often', 'used', 'to', 'measure', 'the', 'performance', 'of', 'classification',
          'models', ',', 'which', 'aim', 'to', 'predict', 'a', 'categorical', 'label', 'for', 'each', 'input', 'instance', '.',
          'The', 'matrix', 'displays', 'the', 'number', 'of', 'true', 'positives', '(', 'TP', ')', ',', 'true', 'negatives', '(', 'TN', ')',
          ',', 'false', 'positives', '(', 'FP', ')', ',', 'and', 'false', 'negatives', '(', 'FN', ')', 'produced', 'by', 'the', 'model', 'on',
          'the', 'test', 'data', '.']
list2 = ['Confusion', 'matrix', 'is', 'a', 'very', 'popular', 'measure', 'used', 'while', 'solving',
         'classification', 'problems', '.', 'It', 'can', 'be', 'applied', 'to', 'binary', 'classification',
         'as', 'well', 'as', 'for', 'multiclass', 'classification', 'problems', '.']

distance_between_lists = distance(list1, list2)
print("Levenshtein Distance:", distance_between_lists)

Levenshtein Distance: 76


In [46]:
# calculating Levenshtein distance for converting one word to another word
word1 = "make"
word2 = "made"

distance_between_words = distance(word1, word2)
print("Levenshtein Distance:", distance_between_words)

Levenshtein Distance: 1


**Edit Distance**

In [47]:
# calculating Edit distance for converting one word to another word
word1 = "make"
word2 = "made"
distance_between_words = nltk.edit_distance(word1, word2)
print("Edit Distance:", distance_between_words)

Edit Distance: 1


**Byte pair encoding**

In [48]:
!pip install bpemb



In [49]:
from bpemb import BPEmb
# Load pre-trained BPEmb model
bpemb_en = BPEmb(lang="en", vs=10000)

text = "A confusion matrix is a matrix that summarizes the performance of a machine learning model on a set of test data.\
       It is often used to measure the performance of classification models, which aim to predict a categorical label for each input instance.\
       The matrix displays the number of true positives (TP),\
       true negatives (TN), false positives (FP), and false negatives (FN) produced by the model on the test data."
# Apply byte pair encoding to text
encoded_text = bpemb_en.encode(text)
print("Byte Pair Encoded Text:", encoded_text)

Byte Pair Encoded Text: ['▁a', '▁conf', 'usion', '▁matrix', '▁is', '▁a', '▁matrix', '▁that', '▁sum', 'mar', 'izes', '▁the', '▁performance', '▁of', '▁a', '▁machine', '▁learning', '▁model', '▁on', '▁a', '▁set', '▁of', '▁test', '▁data', '.', '▁it', '▁is', '▁often', '▁used', '▁to', '▁measure', '▁the', '▁performance', '▁of', '▁classification', '▁models', ',', '▁which', '▁aim', '▁to', '▁pred', 'ict', '▁a', '▁categ', 'or', 'ical', '▁label', '▁for', '▁each', '▁input', '▁instance', '.', '▁the', '▁matrix', '▁displays', '▁the', '▁number', '▁of', '▁true', '▁pos', 'it', 'ives', '▁(', 't', 'p', '),', '▁true', '▁neg', 'atives', '▁(', 't', 'n', '),', '▁false', '▁pos', 'it', 'ives', '▁(', 'f', 'p', '),', '▁and', '▁false', '▁neg', 'atives', '▁(', 'f', 'n', ')', '▁produced', '▁by', '▁the', '▁model', '▁on', '▁the', '▁test', '▁data', '.']


In [50]:
# Another way
import re
from collections import defaultdict

def get_stats(vocab):
	"""
	Given a vocabulary (dictionary mapping words to frequency counts), returns a
	dictionary of tuples representing the frequency count of pairs of characters
	in the vocabulary.
	"""
	pairs = defaultdict(int)
	for word, freq in vocab.items():
		symbols = word.split()
		for i in range(len(symbols)-1):
			pairs[symbols[i],symbols[i+1]] += freq
	return pairs

def merge_vocab(pair, v_in):
	"""
	Given a pair of characters and a vocabulary, returns a new vocabulary with the
	pair of characters merged together wherever they appear.
	"""
	v_out = {}
	bigram = re.escape(' '.join(pair))
	p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
	for word in v_in:
		w_out = p.sub(''.join(pair), word)
		v_out[w_out] = v_in[word]
	return v_out

def get_vocab(data):
	"""
	Given a list of strings, returns a dictionary of words mapping to their frequency
	count in the data.
	"""
	vocab = defaultdict(int)
	for line in data:
		for word in line.split():
			vocab[' '.join(list(word)) + ' </w>'] += 1
	return vocab

def byte_pair_encoding(data, n):
	"""
	Given a list of strings and an integer n, returns a list of n merged pairs
	of characters found in the vocabulary of the input data.
	"""
	vocab = get_vocab(data)
	for i in range(n):
		pairs = get_stats(vocab)
		best = max(pairs, key=pairs.get)
		vocab = merge_vocab(best, vocab)
	return vocab

# Example usage:
corpus = "A confusion matrix is a matrix that summarizes the performance of a machine learning model on a set of test data.\
       #It is often used to measure the performance of classification models, which aim to predict a categorical label for each input instance.\
       #The matrix displays the number of true positives (TP),\
       #true negatives (TN), false positives (FP), and false negatives (FN) produced by the model on the test data."
data = corpus.split('.')

n = 150
bpe_pairs = byte_pair_encoding(data, n)
bpe_pairs

{'A</w>': 1,
 'confusion</w>': 1,
 'matrix</w>': 3,
 'is</w>': 2,
 'a</w>': 4,
 'that</w>': 1,
 'summarizes</w>': 1,
 'the</w>': 5,
 'performance</w>': 2,
 'of</w>': 4,
 'machine</w>': 1,
 'learning</w>': 1,
 'model</w>': 2,
 'on</w>': 2,
 'set</w>': 1,
 'test</w>': 2,
 'data</w>': 2,
 '#It</w>': 1,
 'often</w>': 1,
 'used</w>': 1,
 'to</w>': 2,
 'measure</w>': 1,
 'classification</w>': 1,
 'models,</w>': 1,
 'which</w>': 1,
 'aim</w>': 1,
 'predict</w>': 1,
 'categorical</w>': 1,
 'label</w>': 1,
 'for</w>': 1,
 'each</w>': 1,
 'input</w>': 1,
 'instance</w>': 1,
 '#The</w>': 1,
 'di s p la y s</w>': 1,
 'n u m b er </w>': 1,
 'true</w>': 1,
 'positives</w>': 2,
 '(T P),</w>': 1,
 '# true</w>': 1,
 'negatives</w>': 2,
 '(T N ),</w>': 1,
 'false</w>': 2,
 '(F P),</w>': 1,
 'a n d </w>': 1,
 '(F N ) </w>': 1,
 'pr od u c ed</w>': 1,
 'b y </w>': 1}

**Language Models & Perplexity**

In [51]:

from nltk import trigrams
from nltk.tokenize import word_tokenize
from collections import defaultdict

# Sample text for training and testing
train_text = "My name is Ansha."
test_text = "My name is Ansha Sarwar."

# Tokenize text into words
train_tokens = word_tokenize(train_text.lower())
test_tokens = word_tokenize(test_text.lower())

# Create trigrams from the training data
trigram_model = defaultdict(lambda: defaultdict(lambda: 0))
for word1, word2, word3 in trigrams(train_tokens, pad_right=True, pad_left=True):
    trigram_model[(word1, word2)][word3] += 1

# Convert trigram counts to probabilities
for word1_word2 in trigram_model:
    total_count = float(sum(trigram_model[word1_word2].values()))
    for word3 in trigram_model[word1_word2]:
        trigram_model[word1_word2][word3] /= total_count

# Calculate perplexity for the test data
log_prob = 0.0
word_count = 0
for i in range(len(test_tokens) - 2):
    word1 = test_tokens[i]
    word2 = test_tokens[i + 1]
    word3 = test_tokens[i + 2]
    if (word1, word2) in trigram_model:
        if word3 in trigram_model[(word1, word2)]:
            log_prob += -1 * (trigram_model[(word1, word2)][word3])
            word_count += 1

perplexity = 2 ** (log_prob / word_count)
print(f"Perplexity: {perplexity}")


Perplexity: 0.5


In [52]:
# Other way
import nltk
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import Laplace

# Sample text
text = "My name is Ansha."
sentences = nltk.sent_tokenize(text)

# Tokenize the sentences into words
tokenized_text = [nltk.word_tokenize(sentence) for sentence in sentences]

# Convert the tokenized text into n-grams (you can set n to your desired value)
n = 3  # Change this to set the n-gram size

# Create n-gram model and vocabulary
train_data, padded_vocab = padded_everygram_pipeline(n, tokenized_text)
lm = Laplace(n)  # Using Laplace smoothing
lm.fit(train_data, padded_vocab)

# Test sentence
test_sentence = "My name is Ansha Sarwar."
test_tokens = nltk.word_tokenize(test_sentence)

# Calculate perplexity
test_data = list(nltk.ngrams(test_tokens, n))  # Convert to a list of n-grams
perplexity = lm.perplexity(test_data)
print("Perplexity:", perplexity)

Perplexity: 6.179301431721354


In [53]:
# Other way
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [54]:
# Load pre-trained GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

text = "A confusion matrix is a matrix that summarizes the performance of a machine learning model on a set of test data.\
       It is often used to measure the performance of classification models, which aim to predict a categorical label for each input instance.\
       The matrix displays the number of true positives (TP),\
       true negatives (TN), false positives (FP), and false negatives (FN) produced by the model on the test data."

# Tokenize the input text and convert it into PyTorch tensors
inputs = tokenizer.encode(text, return_tensors='pt')

# Generate outputs using the GPT-2 model and calculate the loss
outputs = model(inputs, labels=inputs)
loss = outputs.loss

# Calculate perplexity from the computed loss
perplexity = 2 ** loss.item()

print("Perplexity:", perplexity)


vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Perplexity: 6.712799839565755


**Regular Expression Matching**

In [56]:
import re

# Regular expression patterns
patterns = [
    re.compile(r'[a-zA-Z]+'),  # Alphabetic strings
    re.compile(r'^[a-z]*b$'),  # Lowercase alphabetic strings ending in 'b'
    re.compile(r'^(b+(ab+)+)$'),  # Strings with 'a' surrounded by 'b's
    re.compile(r'\b([a-zA-Z]+)\s+\1\b'),  # Strings with two consecutive repeated words
    re.compile(r'^\d+\s\w+$'),  # Strings starting with an integer and ending with a word
    re.compile(r'\bgrotto\b.*\braven\b|\braven\b.*\bgrotto\b'),  # Strings with 'grotto' and 'raven'
    re.compile(r'^\W*(\w+)$')  # First word of an English sentence
]

# Test strings
tests = [
    ['hye', '123', '123ai', 'Evelyn', 'wow!!'],
    ['gab', 'Evelyn', 'avab', 'Evelyn!'],
    ['baba', 'babababa', 'abab', 'Uffff!'],
    ['Evelyn Evelyn', 'hye hye', 'the tree', 'good good bad'],
    ['505 Evelyn', '05 RollNo', '5Evelyn', 'not 505 Evelyn', 'Yes! Yes!'],
    ['grotto raven', 'raven grotto', 'Grottos and ravens', 'Wow! Wow!'],
    ['My name is Evelyn', '....so cool', 'Wow!']
]

# Check each string against respective pattern
for idx, pattern in enumerate(patterns, start=1):
    print(f"Pattern {idx}:")
    for test in tests[idx - 1]:
        if pattern.match(test):
            print(f"'{test}' matches the pattern.")
        else:
            print(f"'{test}' does not match the pattern.")
    print()


Pattern 1:
'hye' matches the pattern.
'123' does not match the pattern.
'123ai' does not match the pattern.
'Evelyn' matches the pattern.
'wow!!' matches the pattern.

Pattern 2:
'gab' matches the pattern.
'Evelyn' does not match the pattern.
'avab' matches the pattern.
'Evelyn!' does not match the pattern.

Pattern 3:
'baba' does not match the pattern.
'babababa' does not match the pattern.
'abab' does not match the pattern.
'Uffff!' does not match the pattern.

Pattern 4:
'Evelyn Evelyn' matches the pattern.
'hye hye' matches the pattern.
'the tree' does not match the pattern.
'good good bad' matches the pattern.

Pattern 5:
'505 Evelyn' matches the pattern.
'05 RollNo' matches the pattern.
'5Evelyn' does not match the pattern.
'not 505 Evelyn' does not match the pattern.
'Yes! Yes!' does not match the pattern.

Pattern 6:
'grotto raven' matches the pattern.
'raven grotto' matches the pattern.
'Grottos and ravens' does not match the pattern.
'Wow! Wow!' does not match the pattern.

P