In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
def tokenize(corpus):
    tokens = []
    temp = ""
    for char in corpus:
        if char != " ":
            temp += char
        else:
            if temp:
                tokens.append(temp)
                temp = ""
    if temp:
        tokens.append(temp)
    return tokens

In [3]:
def unigram(corpus):
  tokens = tokenize(corpus)
  distinct_tokens = set(tokens)
  uni_dict = {}
  for token in distinct_tokens:
    uni_dict.update({token: corpus.count(token)})
  return uni_dict

In [4]:
def plot_unigram(corpus):
  tokens = tokenize(corpus)
  plt.title("Unigram Distribution")
  sns.countplot(x=tokens)
  plt.figure(figsize=(10, 12))
  plt.show()
  return

In [5]:
def bi_tokenize(corpus):
  tokens = tokenize(corpus)
  bi_tokens = []
  for i in range(len(tokens) - 1):
      bi_tokens.append(tokens[i] + " " + tokens[i + 1])
  return bi_tokens

In [6]:
def bigram(corpus):
  bi_tokens = bi_tokenize(corpus)
  distinct_bi_tokens = set(bi_tokens)
  bi_dict = {}
  for token in distinct_bi_tokens:
    bi_dict.update({token: corpus.count(token)})
  return bi_dict

In [7]:
def plot_bigram(corpus):
  tokens = bi_tokenize(corpus)
  plt.title("Bigram Distribution")
  sns.countplot(x=tokens)
  plt.figure(figsize=(10, 12))
  plt.show()
  return

In [8]:
def tri_tokenize(corpus):
  tokens = tokenize(corpus)
  tri_tokens = []
  for i in range(len(tokens) - 2):
    tri_tokens.append(tokens[i] + " " + tokens[i + 1] + " " + tokens[i + 2])
  return tri_tokens

In [9]:
def trigram(corpus):
  tri_tokens = tri_tokenize(corpus)
  distinct_tri_tokens = set(tri_tokens)
  tri_dict = {}
  for token in distinct_tri_tokens:
    tri_dict.update({token: corpus.count(token)})
  return tri_dict

In [10]:
def plot_trigram(corpus):
  tokens = tri_tokenize(corpus)
  plt.title("Trigram Distribution")
  sns.countplot(x=tokens)
  plt.figure(figsize=(10, 12))
  plt.show()
  return

In [11]:
def calculate_probability(sentence, corpus):
  tokens = tokenize(sentence)
  check = bi_tokenize(corpus)
  bi_tokens = bi_tokenize(sentence)
  bi = bigram(corpus)
  uni = unigram(corpus)
  prob = []
  for token in bi_tokens:
    if token in check:
      prob.append(float(bi[token]/ uni[tokenize(token)[0]]))
  return prob[len(prob) - 1]

In [12]:
def predict(sentence, corpus):
  tokens = tokenize(corpus)
  probabilities = []
  for token in tokens:
    probabilities.append(calculate_probability(sentence + " " + token, corpus))
  ind = probabilities.index(max(probabilities))
  return tokens[ind]

In [13]:
def create_bag_of_words(corpus):
    tokens = tokenize(corpus)
    vocabulary = set(tokens)
    for words in tokens:
        vocabulary.update(words)
    vocabulary = sorted(list(vocabulary))
    bow_vectors = []
    for words in tokens:
        vector = [0] * len(vocabulary)
        for word in words:
            if word in vocabulary:
                word_index = vocabulary.index(word)
                vector[word_index] += 1
        bow_vectors.append(vector)
    return vocabulary, bow_vectors

In [14]:
corpus = "The cat is sitting"
vocabulary, bow_vectors = create_bag_of_words(corpus)
for i, vector in enumerate(bow_vectors):
    print(f"Token {i+1} BoW Vector: {vector}")

Token 1 BoW Vector: [1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0]
Token 2 BoW Vector: [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
Token 3 BoW Vector: [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0]
Token 4 BoW Vector: [0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 1, 1, 0, 2]


In [15]:
corpus = "the cat sat on the mat the dog barked loudly the cat chased the mouse the dog sat near the cat the mouse ran quickly"
sentence = "the dog barked"
predict(sentence, corpus)

'loudly'

In [16]:
corpus = "the cat sat on the mat the dog barked loudly the cat chased the mouse the dog sat near the cat the mouse ran quickly"
sentence = "the dog barked"
calculate_probability(sentence, corpus)

0.5

In [17]:
import nltk
from collections import Counter
from nltk import word_tokenize, bigrams, trigrams
nltk.download("punkt")

text = "the cat sat on the mat the dog barked loudly the cat chased the mouse the dog sat near the cat the mouse ran quickly"
t = word_tokenize(text.lower())
u, b, g = Counter(t), Counter(bigrams(t)), Counter(trigrams(t))

sent = "the cat".split()
prob_uni = (u[sent[0]]/len(t))*(u[sent[1]]/len(t))
prob_bi = b[tuple(sent)]/u[sent[0]]
next_word = max([(w2,c) for (w1,w2),c in b.items() if w1==sent[-1]], key=lambda x:x[1])[0]

print("Unigram P:", prob_uni, "Bigram P:", prob_bi, "Next:", next_word)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [18]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download("punkt")

corpus = "The cat is sitting"

# Tokenize
tokens = word_tokenize(corpus.lower())
vocabulary = sorted(set(tokens))

# Build BoW for each token
bow_vectors = []
for word in tokens:
    vector = [0] * len(vocabulary)
    vector[vocabulary.index(word)] = 1
    bow_vectors.append(vector)

# Print results
print("Vocabulary:", vocabulary)
for i, vector in enumerate(bow_vectors, 1):
    print(f"Token {i} BoW Vector: {vector}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
import nltk
nltk.download()


NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> 

---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> all


    Downloading collection 'all'
       | 
       | Downloading package abc to /root/nltk_data...
       |   Unzipping corpora/abc.zip.
       | Downloading package alpino to /root/nltk_data...
       |   Unzipping corpora/alpino.zip.
       | Downloading package averaged_perceptron_tagger to
       |     /root/nltk_data...
       |   Package averaged_perceptron_tagger is already up-to-date!
       | Downloading package averaged_perceptron_tagger_eng to
       |     /root/nltk_data...
       |   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
       | Downloading package averaged_perceptron_tagger_ru to
       |     /root/nltk_data...
       |   Unzipping taggers/averaged_perceptron_tagger_ru.zip.
       | Downloading package averaged_perceptron_tagger_rus to
       |     /root/nltk_data...
       |   Unzipping taggers/averaged_perceptron_tagger_rus.zip.
       | Downloading package basque_grammars to /root/nltk_data...
       |   Unzipping grammars/basque_grammars.zip.
       | 


---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


True

In [19]:
sentence = "the teacher explained the difficult concepts to the students during the lecture"
tokens = nltk.word_tokenize(sentence)
pos_tags = nltk.pos_tag(tokens)
grammar = r"""
  NP: {<DT>?<JJ>*<NN.*>}   # Noun Phrase
  VP: {<VB.*><NP|PP>*}     # Verb Phrase
  PP: {<IN><NP>}           # Prepositional Phrase
"""
cp = nltk.RegexpParser(grammar)
tree = cp.parse(pos_tags)
for subtree in tree.subtrees():
    if subtree.label() in ["NP", "VP", "PP"]:
        print(f"{subtree.label()}: {' '.join(word for word, tag in subtree.leaves())}")

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************
