<a href="https://colab.research.google.com/github/PriyadarseenyPasayat/WE-Module3/blob/main/Markov_Text_Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<div class="markdown-google-sans">
  <h1>Welcome to Colab!</h1>
</div>

<!-- TODO(b/319266067) remove temporary advert after a few weeks. -->
<div class="markdown-google-sans">
  <h2>(New) Try the Gemini API</h2>
  <ul>
  <li><a href="https://makersuite.google.com/app/apikey">Generate a Gemini API key</a></li>
  <li><a href="https://colab.research.google.com/github/googlecolab/colabtools/blob/main/notebooks/Talk_to_Gemini_with_Google%27s_Speech_to_Text_API.ipynb?utm_medium=link&utm_campaign=gemini">Talk to Gemini with the Speech-to-Text API</a></li>
  <li><a href="https://colab.research.google.com/github/google/generative-ai-docs/blob/main/site/en/tutorials/python_quickstart.ipynb?utm_medium=link&utm_campaign=gemini">Gemini API: Quickstart with Python</a></li>
  <li><a href="https://colab.research.google.com/notebooks/snippets/gemini.ipynb?utm_medium=link&utm_campaign=gemini">Gemini API code sample</a></li>
  <li><a href="https://colab.research.google.com/github/googlecolab/colabtools/blob/main/notebooks/Learning_with_Gemini_and_ChatGPT.ipynb?utm_medium=link&utm_campaign=gemini">Compare Gemini with ChatGPT</a></li>  
  <li><a href="https://colab.google/notebooks/?utm_medium=link&utm_campaign=gemini">More notebooks</a></li>
  </ul>
</div>


<div class="markdown-google-sans">

<a name="machine-learning-examples"></a>

### Featured examples

</div>

- [NeMo Voice Swap](https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/VoiceSwapSample.ipynb): Use Nvidia's NeMo conversational AI Toolkit to swap a voice in an audio fragment with a computer generated one.

- [Retraining an Image Classifier](https://tensorflow.org/hub/tutorials/tf2_image_retraining): Build a Keras model on top of a pre-trained image classifier to distinguish flowers.
- [Text Classification](https://tensorflow.org/hub/tutorials/tf2_text_classification): Classify IMDB movie reviews as either *positive* or *negative*.
- [Style Transfer](https://tensorflow.org/hub/tutorials/tf2_arbitrary_image_stylization): Use deep learning to transfer style between images.
- [Multilingual Universal Sentence Encoder Q&A](https://tensorflow.org/hub/tutorials/retrieval_with_tf_hub_universal_encoder_qa): Use a machine learning model to answer questions from the SQuAD dataset.
- [Video Interpolation](https://tensorflow.org/hub/tutorials/tweening_conv3d): Predict what happened in a video between the first and the last frame.


In [None]:
import random
from collections import defaultdict
import re

def generate_text(filename, start_word, output_length):
    # Read text from file
    with open(filename, 'r', encoding='utf-8') as file:
        text = file.read()

    # Preprocess text: remove punctuation and split into words
    words = re.findall(r'\b\w+\b', text.lower())

    # Create a dictionary to store word frequencies
    word_freq = defaultdict(int)
    for i in range(len(words) - 1):
        word_freq[(words[i], words[i+1])] += 1

    # Generate text
    output = [start_word]
    current_word = start_word
    for _ in range(int(output_length) - 1):
        next_word_candidates = [word for word in words if (current_word, word) in word_freq]
        print(next_word_candidates)
        if not next_word_candidates:
            break
        next_word = random.choices(next_word_candidates, weights=[word_freq[(current_word, w)] for w in next_word_candidates])[0]
        output.append(next_word)
        current_word = next_word

    return ' '.join(output)

# Example usage:
filename = 'HISTORY.txt'
start_word = 'the'
output_length = 100
generated_text = generate_text(filename, start_word, output_length)
print(generated_text)


['idle', 'file', 'release', 'previous', 'idle', 'dark', 'idle', 'idlefork', 'release', 'revitalisation', 'idlefork', 'release', 'python', 'earlier', 'python', 'idlefork', 'release', 'earlier', 'idle', 'current', 'cvs', 'idle', 'idle', 'release', 'shell', 'window', 'debugger', 'case', 'previous', 'idle', 'release', 'release', 'point', 'idlefork', 'idlefork', 'cvs', 'two', 'idlefork', 'user', 'trunk', 'idlefork', 'release', 'first', 'first', 'release', 'idle', 'idle', 'vpython', 'idle', 'idlefork', 'cvs', 'idle', 'list', 'python', 'idle', 'idle', 'execbinding', 'module', 'standard', 'rpc', 'ondemandoutputwindow', 'idle', 'file', 'execbinding', 'module', 'user', 'traceback', 'ondemandoutputwindow', 'class', 'user', 'output', 'previous', 'window', 'idle', 'window', 'window', 'file', 'window', 'nonexistent', 'window', 'window', 'need', 'execbinding', 'copy', 'idle', 'rpc', 'command', 'copy', 'menus', 'excessively', 'menu', 'python', 'windows', 'win32api', 'module', 'python', 'idle', 'help',

In [None]:
import random
from collections import defaultdict
import re

class MarkovTextGenerator:
    def __init__(self, filename):
        self.filename = filename
        self.word_freq = defaultdict(int)
        self.words = []

    def _preprocess_text(self):
        with open(self.filename, 'r', encoding='utf-8') as file:
            text = file.read()
        self.words = re.findall(r'\b\w+\b', text.lower())

    def _compute_word_frequencies(self, chain_length):
        for i in range(len(self.words) - chain_length):
            chain = tuple(self.words[i:i+chain_length])
            next_word = self.words[i+chain_length]
            self.word_freq[(chain, next_word)] += 1

    def generate_text(self, start_words, output_length, chain_length):
        if not self.words:
            self._preprocess_text()
        if not self.word_freq:
            self._compute_word_frequencies(chain_length)

        output = list(start_words)
        current_chain = tuple(start_words)
        for _ in range(int(output_length) - len(start_words)):
            next_word_candidates = [word for word in self.words if (current_chain, word) in self.word_freq]
            if not next_word_candidates:
                break
            next_word = random.choices(next_word_candidates, weights=[self.word_freq[(current_chain, w)] for w in next_word_candidates])[0]
            output.append(next_word)
            current_chain = tuple(output[-chain_length:])

        return ' '.join(output)

# Example usage:
filename = 'HISTORY.txt'
markov_generator = MarkovTextGenerator(filename)
start_words = ['the', 'python']
output_length = 50
chain_length = 2
generated_text = markov_generator.generate_text(start_words, output_length, chain_length)
print(generated_text)
markov_generator.word_freq


the python shell window a k a interactive interpreter debugger not complete but you can use idle pyw to avoid popping up a dos console if you have problems or suggestions you should be able to run a script this runs the script in the class and path browsers tp


defaultdict(int,
            {(('idle', 'history'), 'this'): 1,
             (('history', 'this'), 'file'): 1,
             (('this', 'file'), 'contains'): 1,
             (('file', 'contains'), 'the'): 1,
             (('contains', 'the'), 'release'): 1,
             (('the', 'release'), 'messages'): 1,
             (('release', 'messages'), 'for'): 1,
             (('messages', 'for'), 'previous'): 1,
             (('for', 'previous'), 'idle'): 1,
             (('previous', 'idle'), 'releases'): 1,
             (('idle', 'releases'), 'as'): 1,
             (('releases', 'as'), 'you'): 1,
             (('as', 'you'), 'read'): 1,
             (('you', 'read'), 'on'): 1,
             (('read', 'on'), 'you'): 1,
             (('on', 'you'), 'go'): 1,
             (('you', 'go'), 'back'): 1,
             (('go', 'back'), 'to'): 1,
             (('back', 'to'), 'the'): 1,
             (('to', 'the'), 'dark'): 1,
             (('the', 'dark'), 'ages'): 1,
             (('dark', 'ages'), 'of

In [13]:
import random
from collections import defaultdict, Counter
import string

def preprocess_text(text):
    """Removes punctuation from the text."""
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

def build_transition_matrix(corpus, order=1):
    """Builds a transition matrix with frequency of occurrence."""
    transition_matrix = defaultdict(Counter)
    words = corpus.split()
    for i in range(len(words) - order):
        current_sequence = tuple(words[i:i+order])
        next_word = words[i + order]
        transition_matrix[current_sequence][next_word] += 1
    print(transition_matrix)
    return transition_matrix

def generate_text1(transition_matrix, start_sequence, output_length):
    """Generates text using the transition matrix."""
    current_sequence = start_sequence
    generated_text = list(current_sequence)

    for _ in range(output_length - len(start_sequence)):
        next_word_counter = transition_matrix[current_sequence]
        if next_word_counter:
            total_occurrences = sum(next_word_counter.values())
            probabilities = [count / total_occurrences for count in next_word_counter.values()]
            next_word = random.choices(list(next_word_counter.keys()), weights=probabilities)[0]
        else:
            # If transition matrix is empty, choose a random word from the entire vocabulary
            next_word = random.choice(list(transition_matrix.keys())[0])

        generated_text.append(next_word)
        current_sequence = tuple(generated_text[-len(start_sequence):])

    return ' '.join(generated_text)


# Example usage:
corpus = "The cat sat on the mat, the dog barked!"
corpus = preprocess_text(corpus)
transition_matrix = build_transition_matrix(corpus, order=2)
output_text = generate_text1(transition_matrix, ('The', 'cat'), 10)
print(output_text)


defaultdict(<class 'collections.Counter'>, {('The', 'cat'): Counter({'sat': 1}), ('cat', 'sat'): Counter({'on': 1}), ('sat', 'on'): Counter({'the': 1}), ('on', 'the'): Counter({'mat': 1}), ('the', 'mat'): Counter({'the': 1}), ('mat', 'the'): Counter({'dog': 1}), ('the', 'dog'): Counter({'barked': 1})})
[ ' T h e ' ,   ' c a t ' ,   ' s a t ' ,   ' o n ' ,   ' t h e ' ,   ' m a t ' ,   ' t h e ' ,   ' d o g ' ,   ' b a r k e d ' ,   ' c a t ' ]


In [17]:
import random
from collections import defaultdict, Counter
import string

def preprocess_text(text):
    """Removes punctuation from the text."""
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

def build_transition_matrix(corpus, order=1):
    """Builds a transition matrix with frequency of occurrence."""
    transition_matrix = defaultdict(Counter)
    words = corpus.split()
    for i in range(len(words) - order):
        current_sequence = tuple(words[i:i+order])
        next_word = words[i + order]
        transition_matrix[current_sequence][next_word] += 1
    print(transition_matrix)
    return transition_matrix

def generate_text(transition_matrix, start_sequence, output_length):
    """Generates text using the transition matrix."""
    current_sequence = start_sequence
    generated_text = list(current_sequence)

    for _ in range(output_length - len(start_sequence)):
        next_word_counter = transition_matrix[current_sequence]
        if next_word_counter:
            total_occurrences = sum(next_word_counter.values())
            probabilities = [count / total_occurrences for count in next_word_counter.values()]
            next_word = random.choices(list(next_word_counter.keys()), weights=probabilities)[0]
        else:
            # If transition matrix is empty, choose a random word from the entire vocabulary
            next_word = random.choice(list(transition_matrix.keys())[0])

        generated_text.append(next_word)
        current_sequence = tuple(generated_text[-len(start_sequence):])

    print(generated_text)
    return ' '.join(generated_text)


corpus = "The cat sat on the mat, the dog barked!"
corpus = preprocess_text(corpus)
transition_matrix = build_transition_matrix(corpus, order=2)
output_text = generate_text(transition_matrix, ('The', 'cat'), 10)
print(output_text)

defaultdict(<class 'collections.Counter'>, {('The', 'cat'): Counter({'sat': 1}), ('cat', 'sat'): Counter({'on': 1}), ('sat', 'on'): Counter({'the': 1}), ('on', 'the'): Counter({'mat': 1}), ('the', 'mat'): Counter({'the': 1}), ('mat', 'the'): Counter({'dog': 1}), ('the', 'dog'): Counter({'barked': 1})})
['The', 'cat', 'sat', 'on', 'the', 'mat', 'the', 'dog', 'barked', 'The']
The cat sat on the mat the dog barked The


In [20]:
# Test case 1: Basic test with a small corpus
corpus1 = "The cat sat on the mat, the dog barked!"
corpus1 = preprocess_text(corpus1)
transition_matrix1 = build_transition_matrix(corpus1, order=2)
output_text1 = generate_text(transition_matrix1, ('The', 'cat'), 10)
print("Test case 1 output:", output_text1)


defaultdict(<class 'collections.Counter'>, {('The', 'cat'): Counter({'sat': 1}), ('cat', 'sat'): Counter({'on': 1}), ('sat', 'on'): Counter({'the': 1}), ('on', 'the'): Counter({'mat': 1}), ('the', 'mat'): Counter({'the': 1}), ('mat', 'the'): Counter({'dog': 1}), ('the', 'dog'): Counter({'barked': 1})})
['The', 'cat', 'sat', 'on', 'the', 'mat', 'the', 'dog', 'barked', 'cat']
Test case 1 output: The cat sat on the mat the dog barked cat


In [19]:
# Test case 2: Test with a larger corpus and higher order transitions
corpus2 = "I am Groot. I am Groot. I am Groot."
corpus2 = preprocess_text(corpus2)
transition_matrix2 = build_transition_matrix(corpus2, order=2)
output_text2 = generate_text(transition_matrix2, ('I', 'am'), 10)
print("Test case 2 output:", output_text2)



defaultdict(<class 'collections.Counter'>, {('I', 'am'): Counter({'Groot': 3}), ('am', 'Groot'): Counter({'I': 2}), ('Groot', 'I'): Counter({'am': 2})})
['I', 'am', 'Groot', 'I', 'am', 'Groot', 'I', 'am', 'Groot', 'I']
Test case 2 output: I am Groot I am Groot I am Groot I


In [22]:
# Test case 3: Test with a corpus containing special characters
corpus3 = "Hello, world! This is a test string.Hello, world! This is a test string.Hello,Hello, world! This is a test string. world! This is a test string.Hello, world! This is a test string.Hello, world! This is a test string."
corpus3 = preprocess_text(corpus3)
transition_matrix3 = build_transition_matrix(corpus3, order=1)
output_text3 = generate_text(transition_matrix3, ('Hello',), 10)
print("Test case 3 output:", output_text3)

defaultdict(<class 'collections.Counter'>, {('Hello',): Counter({'world': 1}), ('world',): Counter({'This': 6}), ('This',): Counter({'is': 6}), ('is',): Counter({'a': 6}), ('a',): Counter({'test': 6}), ('test',): Counter({'stringHello': 3, 'string': 2, 'stringHelloHello': 1}), ('stringHello',): Counter({'world': 3}), ('stringHelloHello',): Counter({'world': 1}), ('string',): Counter({'world': 1})})
['Hello', 'world', 'This', 'is', 'a', 'test', 'string', 'world', 'This', 'is']
Test case 3 output: Hello world This is a test string world This is
