In [13]:
import nltk
from nltk.book import FreqDist
from nltk.corpus import names, stopwords

import random
from typing import Optional

In [32]:
# get stopwords list
stop_words = stopwords.words('english')
# open text file
with open('warlordofmars.txt', 'r') as f:
    text = f.read()

words_list = [w for w in text.split()]


In [33]:
def filter_stopwords(words_list: list, stop_words: list):
    """
    Return list of words after removing stopwords
    """
    return [w for w in words_list if not w.lower() in stop_words]

In [34]:
def freq_words_top50(words_list: list, stop_words: list):
    """
    Return 50 most frequently occuring words words without stopwords.
    """
    # return filtered text
    words_list = filter_stopwords(words_list, stop_words)
    # create freqency distribution
    fd = FreqDist(words_list)
    # get only words
    fd_words = list(fd.keys())

    return fd_words[:50]

filter_stopwords(words_list, stop_words)

['Warlord',
 'Mars',
 'Edgar',
 'Rice',
 'Burroughs',
 'CONTENTS',
 'River',
 'Iss',
 'Mountains',
 'Temple',
 'Sun',
 'Secret',
 'Tower',
 'Kaolian',
 'Road',
 'Hero',
 'Kaol',
 'New',
 'Allies',
 'Carrion',
 'Caves',
 'Yellow',
 'Men',
 'Durance',
 'Pity',
 'Plenty',
 '"Follow',
 'Rope!"',
 'Magnet',
 'Switch',
 'Tide',
 'Battle',
 'Rewards',
 'New',
 'Ruler',
 'RIVER',
 'ISS',
 'shadows',
 'forest',
 'flanks',
 'crimson',
 'plain',
 'side',
 'Lost',
 'Sea',
 'Korus',
 'Valley',
 'Dor,',
 'beneath',
 'hurtling',
 'moons',
 'Mars,',
 'speeding',
 'meteoric',
 'way',
 'close',
 'bosom',
 'dying',
 'planet,',
 'crept',
 'stealthily',
 'along',
 'trail',
 'shadowy',
 'form',
 'hugged',
 'darker',
 'places',
 'persistency',
 'proclaimed',
 'sinister',
 'nature',
 'errand.',
 'six',
 'long',
 'Martian',
 'months',
 'haunted',
 'vicinity',
 'hateful',
 'Temple',
 'Sun,',
 'within',
 'whose',
 'slow-revolving',
 'shaft,',
 'far',
 'beneath',
 'surface',
 'Mars,',
 'princess',
 'lay',
 'entom

In [35]:
def freq_bigram_top50(words_list: list, stop_words: list):
    """
    Return the 50 most frequent bigrams
    """
    # return filtered text
    words_list = filter_stopwords(words_list, stop_words)
    # get list of bigrams
    bigrams_list = nltk.bigrams(words_list)
    # get freq dist
    fd = FreqDist(bigrams_list)
    # get list of bigram pairs
    fd_bigrams = list(fd.keys())

    return fd_bigrams[:50]

freq_bigram_top50(words_list, stop_words)

[('Warlord', 'Mars'),
 ('Mars', 'Edgar'),
 ('Edgar', 'Rice'),
 ('Rice', 'Burroughs'),
 ('Burroughs', 'CONTENTS'),
 ('CONTENTS', 'River'),
 ('River', 'Iss'),
 ('Iss', 'Mountains'),
 ('Mountains', 'Temple'),
 ('Temple', 'Sun'),
 ('Sun', 'Secret'),
 ('Secret', 'Tower'),
 ('Tower', 'Kaolian'),
 ('Kaolian', 'Road'),
 ('Road', 'Hero'),
 ('Hero', 'Kaol'),
 ('Kaol', 'New'),
 ('New', 'Allies'),
 ('Allies', 'Carrion'),
 ('Carrion', 'Caves'),
 ('Caves', 'Yellow'),
 ('Yellow', 'Men'),
 ('Men', 'Durance'),
 ('Durance', 'Pity'),
 ('Pity', 'Plenty'),
 ('Plenty', '"Follow'),
 ('"Follow', 'Rope!"'),
 ('Rope!"', 'Magnet'),
 ('Magnet', 'Switch'),
 ('Switch', 'Tide'),
 ('Tide', 'Battle'),
 ('Battle', 'Rewards'),
 ('Rewards', 'New'),
 ('New', 'Ruler'),
 ('Ruler', 'RIVER'),
 ('RIVER', 'ISS'),
 ('ISS', 'shadows'),
 ('shadows', 'forest'),
 ('forest', 'flanks'),
 ('flanks', 'crimson'),
 ('crimson', 'plain'),
 ('plain', 'side'),
 ('side', 'Lost'),
 ('Lost', 'Sea'),
 ('Sea', 'Korus'),
 ('Korus', 'Valley'),
 ('Va

In [36]:
def random_word_n_freq(n: int, words_list: list, stop_words: Optional[list] = None):
    """
    Return a randomly selected word from the 'n' most likely words.
    """
    # remove stopwords if users assign a list of stopwords
    if stop_words:
        words_list = filter_stopwords(words_list, stop_words)

    # get frequency distribution
    fd = FreqDist(words_list)
    # get list of the n most likely words
    fd_top_n = list(fd.keys())[:n]
    # get a randomly selected word
    rand_word = random.choice(fd_top_n)

    return rand_word

random_word_n_freq(1, words_list, stop_words)

'Warlord'

In [38]:
def generate_model(cfd, word, num=15):
    seq_words = ''
    for i in range(num):
        seq_words += word + " "
        word = cfd[word].max()

    return seq_words
#  create bigrams list
bigrams = nltk.bigrams(words_list)
#  create conditional drequency distribution
cfd = nltk.ConditionalFreqDist(bigrams)
#  define a list of start words, which are randomly selected from the 100 most freq words.
num_start_words = 10
start_words_list = [random_word_n_freq(100, words_list, stop_words) for _ in range(num_start_words)]

# generate random texts 
for start in start_words_list:
    print(generate_model(cfd, start))

Road A moment later I had been a moment later I had been a moment 
side of the great switch that I had been a moment later I had been 
Rewards The yellow men of the great switch that I had been a moment later 
bosom of the great switch that I had been a moment later I had been 
dead sea bottoms of the great switch that I had been a moment later I 
along the great switch that I had been a moment later I had been a 
Mountains The yellow men of the great switch that I had been a moment later 
Martian year. As I had been a moment later I had been a moment later 
Durance The yellow men of the great switch that I had been a moment later 
Martian year. As I had been a moment later I had been a moment later 
