In [1]:
# Colab-only setup: mount Google Drive at /content/drive so that the corpus
# path under /content/drive/MyDrive used by the training cells can be opened.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
"""
import numpy as np: Imports the NumPy library for numerical computations.
import string: Imports the string module for string operations.
np.random.seed(1234): This line sets the random seed for NumPy's random number generator.
Setting the seed ensures that the random numbers generated are reproducible.
"""
import numpy as np
import string

# Seed NumPy's global RNG (the one sample_word draws from) so a fresh
# top-to-bottom run reproduces the same sampled text.
np.random.seed(1234)

In [17]:
"""
initial = {}: Creates an empty dictionary to store the frequency of words appearing at the beginning of a phrase.
first_order = {}: Creates an empty dictionary to store the frequency of the second word given the first word.
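second_order = {}: Creates an empty dictionary to store the frequency of the third word given the first two words.
third_order = {}: Creates an empty dictionary to store the frequency of the next word given the previous three words.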
"""
# Containers for the Markov model; every one starts out empty and is filled
# by the training cells further down.
initial = dict()       # first word of a line -> count
first_order = dict()   # first word -> second words seen after it
second_order = dict()  # (word, word) -> next words seen after that pair
third_order = dict()   # (word, word, word) -> next words seen after that triple

In [4]:
"""
This block defines a function called remove_punctuation that takes a string s as input.
It removes punctuation from the input string using the translate method and returns the modified string.
"""
def remove_punctuation(s):
    """Return a copy of `s` with every character in `string.punctuation` deleted.

    Characters outside `string.punctuation` (letters, digits, whitespace,
    non-ASCII symbols) are passed through untouched.
    """
    # Mapping a character to None tells str.translate to drop it.
    deletions = str.maketrans(dict.fromkeys(string.punctuation))
    return s.translate(deletions)

In [5]:
"""
This block defines a function called add2dict which takes a dictionary d, a key k, and a value v as input.
If the key k is not already in the dictionary d, it creates a new list for that key.
It then appends the value v to the list associated with the key k in the dictionary d.
"""
def add2dict(d, k, v):
  """Append `v` to the list stored under `d[k]`, creating the list on first use.

  `d` is modified in place; nothing is returned.
  """
  d.setdefault(k, []).append(v)

# [cat, cat, dog, dog, dog, dog, dog, mouse, ...]

In [6]:
"""
This block reads a text file line by line.
For each line, it removes punctuation, converts it to lowercase, and splits it into individual words (tokens).
It then iterates through the tokens to build the dictionaries initial, first_order, and second_order.
"""
# Count the raw statistics for the second-order model, one corpus line at a time:
#   initial      - how often each word opens a line
#   first_order  - second words observed after each opening word
#   second_order - words (or the 'END' marker) observed after each word pair
# The file is opened in a `with` block so the handle is closed as soon as
# counting finishes instead of being left to the garbage collector.
#
# NOTE(review): a line consisting of a single token is counted in `initial`
# but produces no `first_order` entry, so generate() can raise KeyError if it
# samples an opening word that only ever appeared alone - confirm against the corpus.
with open('/content/drive/MyDrive/Colab Notebooks2/NLP/Lab5/robert_frost (1).txt') as corpus:
  for line in corpus:
    tokens = remove_punctuation(line.rstrip().lower()).split()

    T = len(tokens)
    for i in range(T):
      t = tokens[i]
      if i == 0:
        # measure the distribution of the first word
        initial[t] = initial.get(t, 0.) + 1
      else:
        t_1 = tokens[i-1]
        if i == T - 1:
          # last token of the line: the pair ending here is followed by END
          add2dict(second_order, (t_1, t), 'END')
        if i == 1:
          # second word is conditioned on the first word only
          add2dict(first_order, t_1, t)
        else:
          # every later word is conditioned on the two words before it
          t_2 = tokens[i-2]
          add2dict(second_order, (t_2, t_1), t)

In [7]:
"""
These blocks normalize the frequency counts in the initial, first_order, and second_order dictionaries to obtain probabilities.
They convert the lists of words into dictionaries of probabilities, where each word is associated with its probability of occurrence.
"""
# normalize the distributions
# Turn the opening-word counts into probabilities, updating `initial` in place.
initial_total = sum(initial.values())
for first_word in list(initial):
    initial[first_word] = initial[first_word] / initial_total

In [8]:
"""
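list2pdict function: This function takes a list of observed words and returns a dictionary mapping each distinct word to its relative frequency in that list.
sample_word function: This function takes a dictionary of probabilities d as input.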
It randomly selects a word from the dictionary based on the probabilities assigned to each word.
generate function: This function generates text by sampling words based on the probabilities learned from the input text file.
It uses the sample_word function to select words and build sentences.
"""
# convert [cat, cat, cat, dog, dog, dog, dog, mouse, ...]
# into {cat: 0.5, dog: 0.4, mouse: 0.1}

def list2pdict(ts):
  """Convert a list of observations into a {value: relative frequency} dict.

  Keys appear in order of first occurrence in `ts`; an empty list yields
  an empty dict.
  """
  total = len(ts)
  tally = {}
  for item in ts:
    tally[item] = tally.get(item, 0.) + 1
  return {item: count / total for item, count in tally.items()}

In [9]:

# Swap each opening word's list of observed followers for a probability table.
for opening_word in list(first_order):
  first_order[opening_word] = list2pdict(first_order[opening_word])

In [10]:
# Swap each word pair's list of observed followers for a probability table.
for pair in list(second_order):
  second_order[pair] = list2pdict(second_order[pair])

In [11]:
def sample_word(d):
  """Draw one key from `d`, treating its values as probabilities.

  A single uniform number is taken from NumPy's global RNG and compared
  against the running sum of the values, in the dict's iteration order.

  Args:
    d: mapping of token -> probability; the values are expected to sum to 1.

  Returns:
    The first key whose cumulative probability exceeds the drawn number.

  Raises:
    ValueError: if `d` is empty or its values do not sum to (approximately) 1.
  """
  p0 = np.random.random()
  cumulative = 0
  t = None
  for t, p in d.items():
    cumulative += p
    if p0 < cumulative:
      return t
  # Floating-point accumulation can leave the total a hair below 1 (ten 0.1s
  # add up to 0.9999999999999999), so a draw right at the top of [0, 1) can
  # fall through the loop. In that case the last key is the correct answer.
  if d and abs(cumulative - 1.0) < 1e-9:
    return t
  raise ValueError(
      "sample_word needs a non-empty distribution summing to 1; "
      "got %d entries summing to %r" % (len(d), cumulative))

In [12]:
def generate():
  """Print four lines sampled from the second-order Markov model.

  Each line opens with a word drawn from `initial`, takes its second word
  from `first_order`, and then keeps drawing from `second_order` (keyed by
  the two most recent words) until the 'END' marker comes up.
  """
  for _ in range(4):  # four lines per call
    older = sample_word(initial)
    newer = sample_word(first_order[older])
    words = [older, newer]

    # Slide a two-word window along the line until END is drawn.
    upcoming = sample_word(second_order[(older, newer)])
    while upcoming != 'END':
      words.append(upcoming)
      older, newer = newer, upcoming
      upcoming = sample_word(second_order[(older, newer)])
    print(' '.join(words))

In [15]:
"""
Calls the generate() function to start the text generation process.
"""
# Prints four sampled lines; the exact text depends on the state of NumPy's global RNG.
generate()

i know
up to pass a winter eve
to make them out
and then someone


Purpose and Importance:

This code implements a statistical language model, specifically a second-order Markov model, for text generation. Its core function is to learn patterns from an input text and then use these patterns to generate new text that resembles the original text in style and structure.

Why It's Required:

Learning Language Patterns: The code analyzes the input text to understand the frequency and sequence of words. It builds probability distributions for words appearing at the beginning of sentences, following specific words, and following pairs of words. This learning process captures the underlying language patterns present in the text.

Generating Realistic Text: Using the learned probabilities, the code can generate new text by sampling words based on their likelihood of occurrence in a given context. This allows it to create sentences that are grammatically plausible and stylistically similar to the input text.

Applications: Statistical language models like this have various applications, including:

Text prediction: Suggesting the next word as you type, like in smartphone keyboards.
Machine translation: Helping to translate text from one language to another by considering word sequences and probabilities.
Speech recognition: Converting spoken language into text by predicting the most likely words based on acoustic signals.
Text summarization: Creating condensed versions of text by identifying and extracting the most important information.
Chatbots: Building conversational agents that can generate more human-like and contextually relevant responses.
Usage:

The code takes a text file as input, which is used to train the language model. After training, it can generate new text by:

Randomly selecting a starting word based on the probabilities in the initial dictionary.
Choosing the next word based on the probabilities in the first_order dictionary, given the previous word.
Continuing to generate words based on the probabilities in the second_order dictionary, given the previous two words.
The process ends when the model generates the 'END' token, indicating the end of a sentence.
Importance:

Statistical language models are important tools for understanding and working with natural language data. They provide a way to capture the statistical regularities of language, allowing us to perform tasks such as text generation, prediction, and analysis. By learning from data, these models can help us to build systems that can understand and interact with human language in a more intelligent and meaningful way.

This notebook demonstrates a simple yet powerful technique for text generation. By learning from the style and structure of Robert Frost's poems, the model can generate new text that resembles his writing. While the generated text might not be perfect poetry, it shows the potential of statistical language models to learn and mimic the patterns of human language.

In [13]:
# Increase the model order (from second-order to third-order) for our first-semester project report


In [18]:
# Build the third-order table: (word, word, word) -> words observed next.
#
# Only `third_order` is filled here. `initial`, `first_order` and `second_order`
# were already counted and converted to probability dicts by the earlier cells;
# counting into them a second time would call .append on those probability
# dicts (AttributeError inside add2dict) and would pile raw counts on top of the
# normalized values in `initial`.
#
# Two-word lines need no entry here either: their end-of-line marker is already
# stored in `second_order` under the (first word, second word) pair, which is
# exactly what generate() consults for the third word.
third_order = {}  # start clean so re-running this cell never appends to normalized entries

with open('/content/drive/MyDrive/Colab Notebooks2/NLP/Lab5/robert_frost (1).txt') as corpus:
  for line in corpus:
    tokens = remove_punctuation(line.rstrip().lower()).split()

    T = len(tokens)
    for i in range(2, T):
      if i == T - 1:
        # the triple that closes the line is followed by END
        add2dict(third_order, (tokens[i-2], tokens[i-1], tokens[i]), 'END')
      if i >= 3:
        # every word from the fourth onward is conditioned on the three before it
        add2dict(third_order, (tokens[i-3], tokens[i-2], tokens[i-1]), tokens[i])
In [19]:
# Swap each word triple's list of observed followers for a probability table.
for triple in list(third_order):
  third_order[triple] = list2pdict(third_order[triple])

In [20]:
def generate(n_lines=4):
  """Print lines sampled from the third-order Markov model.

  Each line opens with a word drawn from `initial` and a second word drawn
  from `first_order`. The third word comes from `third_order` under the key
  (None, first, second) when that key exists, otherwise from `second_order`.
  Every later word is drawn from `third_order` keyed by the three most recent
  words, falling back to `second_order` keyed by the two most recent ones.
  A line stops as soon as the 'END' marker is drawn, including when it is
  drawn as the third word, so the marker itself is never printed.

  Args:
    n_lines: number of lines to print (defaults to 4).

  Raises:
    KeyError: if a sampled context is missing from the lookup tables.
  """
  for _ in range(n_lines):
    # opening word, then the second word given the first
    w0 = sample_word(initial)
    w1 = sample_word(first_order[w0])
    sentence = [w0, w1]

    # third word
    if (None, w0, w1) in third_order:
      w2 = sample_word(third_order[(None, w0, w1)])
    else:
      w2 = sample_word(second_order[(w0, w1)])

    # Check for END *before* appending: a two-word line draws END right here.
    while w2 != 'END':
      sentence.append(w2)
      if (w0, w1, w2) in third_order:
        w3 = sample_word(third_order[(w0, w1, w2)])
      else:
        w3 = sample_word(second_order[(w1, w2)])
      # slide the three-word window forward by one
      w0, w1, w2 = w1, w2, w3
    print(' '.join(sentence))