In [1]:
#installing the required libraries
!pip install nltk
!pip install spacy
!pip install markovify
!pip install -m spacy download en

Collecting markovify
  Downloading markovify-0.9.4.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting unidecode (from markovify)
  Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hBuilding wheels for collected packages: markovify
  Building wheel for markovify (setup.py) ... [?25ldone
[?25h  Created wheel for markovify: filename=markovify-0.9.4-py3-none-any.whl size=18606 sha256=2389a6771f7dca201dd2884da64c6f63c6763f6e8e755c5d63b7c39da49b81c6
  Stored in directory: /root/.cache/pip/wheels/ca/8c/c5/41413e24c484f883a100c63ca7b3b0362b7c6f6eb6d7c9cc7f
Successfully built markovify
Installing collected packages: unidecode, markovify
Successfully installed markovify-0.9.4 unidecode-1.3.8

Usage:   
  pip install [options] <requirement specifier> [package-

## **Loading Data:**

In [3]:
import pandas as pd
import re
import markovify
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

import pandas as pd
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns

# Connect to the SQLite database
conn = sqlite3.connect('/kaggle/input/wikibooks-dataset/wikibooks.sqlite')
try:
    # Load the first 1000 rows of the English table into a pandas dataframe
    df = pd.read_sql_query("SELECT * FROM en LIMIT 1000", conn)
finally:
    # The connection is only needed for this one query; close it even if the
    # query fails so the database file handle is not leaked for the kernel's lifetime.
    conn.close()

# Concatenate the non-null 'body_text' entries into a single space-separated string
data = ' '.join(df['body_text'].dropna())

# Function for text cleaning
def text_cleaner(text):
    """Normalise raw text into a single whitespace-collapsed line.

    Applies three regex substitutions in order: drop numeric citation markers
    such as ``[1]``, turn newlines into spaces, then collapse every run of
    whitespace into one space. Leading and trailing whitespace is stripped
    from the result.
    """
    substitutions = (
        (r'\[\d+\]', ''),   # citation references like [1], [2], ...
        (r'\n', ' '),       # newline characters -> spaces
        (r'\s+', ' '),      # runs of whitespace -> a single space
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# Clean the dataset: strip citation markers and collapse whitespace in the concatenated corpus
cleaned_data = text_cleaner(data)


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## **Tokenization and Model Building:**

In [4]:
# Tokenize the cleaned data into a flat list of tokens with NLTK's word_tokenize
# (relies on the 'punkt' resource downloaded in the import cell)
tokens = word_tokenize(cleaned_data)

# Function to build Markov chain model
def build_markov_chain(tokens, n=2):
    """Build an order-``n`` Markov chain lookup table from a token sequence.

    Each key is ``n`` consecutive tokens joined by single spaces. Its value is
    the list of every token that immediately followed that window, in order of
    appearance and with duplicates kept.
    """
    successors = {}
    window_count = len(tokens) - n
    for start in range(window_count):
        end = start + n
        window = ' '.join(tokens[start:end])
        # setdefault creates the list on first sight of a window, then appends
        successors.setdefault(window, []).append(tokens[end])
    return successors

# Build the Markov chain model: keys are pairs of consecutive tokens (n=2)
markov_chain = build_markov_chain(tokens, n=2)


## **Generating Text:**

In [7]:
import random
# Function to generate text using Markov chain
def generate_text(markov_chain, seed, length=100, rng=None):
    """Generate text by walking a Markov chain from a seed phrase.

    Args:
        markov_chain: Mapping from a space-joined window of words to the list
            of words observed after that window.
        seed: Starting phrase. It is split on whitespace and its word count is
            used as the window size for every lookup, so it must contain as
            many words as the chain's keys for anything to be generated.
        length: Maximum number of words in the result (seed words count).
        rng: Optional object exposing ``choice`` (e.g. a seeded
            ``random.Random``) for reproducible output. When omitted, the
            module-level ``random`` generator is used, as before.

    Returns:
        The seed words followed by the generated words, joined by single
        spaces. Generation stops early when the current window has no known
        successor.
    """
    chooser = random if rng is None else rng
    words = seed.split()
    order = len(words)  # window size is loop-invariant: compute it once

    while len(words) < length:
        key = ' '.join(words[-order:])
        successors = markov_chain.get(key)
        if not successors:
            # Unknown window (or an empty successor list): dead end, stop here
            break
        words.append(chooser.choice(successors))

    return ' '.join(words)

# Fix the global RNG state so the sampled text is identical on every
# Restart & Run All (generate_text draws from the module-level `random`).
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Seed phrase for text generation; two words, matching the n=2 chain keys
seed = "machine learning"

# Generate text using the Markov chain model
generated_text = generate_text(markov_chain, seed, length=50)
print("Generated Text:")
print(generated_text)


Generated Text:
machine learning algorithms for public-key encryption . Read a book , worked to prevent workplace harassment cases reported to the room . Overhead projectors are becoming outdated and contain links to more wide use by consumers in the Moslem World today . This book is on the shelf materials and


In [6]:
import spacy
# Load the SpaCy English model 'en_core_web_sm'; POSifiedText below calls it to
# tokenize sentences and tag each token's part of speech
nlp = spacy.load('en_core_web_sm')

# Define a subclass of markovify.Text that uses SpaCy for part-of-speech tagging
class POSifiedText(markovify.Text):
    """markovify.Text variant whose chain states are ``word::POS`` strings.

    Sentences are tokenized with the module-level SpaCy pipeline ``nlp`` and
    every token is tagged with its part of speech, so the chain distinguishes
    the same spelling used in different grammatical roles.
    """

    def word_split(self, sentence):
        """Tokenize ``sentence`` with SpaCy and return ``'<orth>::<POS>'`` strings."""
        return ['::'.join((word.orth_, word.pos_)) for word in nlp(sentence)]

    def word_join(self, words):
        """Strip the trailing ``::<POS>`` tag from each state and join with spaces."""
        # Split on the LAST '::' only: a token that itself contains '::' (for
        # example a bare '::' token from code in the corpus) would otherwise be
        # truncated, or reduced to an empty string, by split('::')[0].
        sentence = ' '.join(word.rsplit('::', 1)[0] for word in words)
        return sentence

# Fit the POS-aware Markov model on the cleaned corpus, using 3-token states
generator_wikibooks = POSifiedText(cleaned_data, state_size=3)

# Sample five unconstrained sentences, one per output line
print("Generated Sentences:")
print(*(generator_wikibooks.make_sentence() for i in range(5)), sep="\n")

# Sample five more sentences, each capped at 100 characters
print("\nGenerated Short Sentences (max 100 characters):")
print(*(generator_wikibooks.make_short_sentence(max_chars=100) for i in range(5)), sep="\n")

Generated Sentences:
What types of information will best illuminate what you are left with is the verb 's i - form  puli .
?  is used to brew a potion that is used in this book mention could not be available , which means JMOOC is for self - directed learning .
A weapon is defined by similarities with one or more operating systems together allowing for communication between the islands in the Pacific .
The next phase involved the task to establish a chain of communications on our behalf through the United States was not engaged in any major war .
The simplest smudging tool is the finger , although care should be used in each successive time the for loop is run through .

Generated Short Sentences (max 100 characters):
My real job is to take a specific course of action , and vice versa .
They are much smaller than the prior .
The foundation on which the Imperial post office will be represented by a hollow dot .
The assembly language code was then fed into the maw of combat .
For example