# Chlengmo Demo

## Character-Level N-Gram Model

In [1]:
# bloody dependencies
# NOTE: The package itself needs no dependencies.
#       All of these imports are used in the notebook,
#       mostly for downloading & manipulating data.
import nltk

nltk.download("brown")
nltk.download("gutenberg")
from nltk.corpus import brown, gutenberg
from nltk.lm.models import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline
import pandas as pd
import time

# chlengmo
from chlengmo import Chlengmo

[nltk_data] Downloading package brown to /Users/victor/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/victor/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


## Sample Usage: Moby Dick

In [2]:
# retrieve corpus from NLTK
filename = "melville-moby_dick.txt"
text = gutenberg.raw(filename)
start = "Call me Ishmael"
start_idx = text.index(start)
text = text[start_idx:]
print("\nReal text >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
print(text[:999])

# create and fit model
n = 15
model = Chlengmo(n=n).fit(text)

# generate fake text
length = 981
prompt = "Call me "
seed = 42
fake_text = model.generate(length=length, prompt=prompt, seed=seed)
print("\nFake text >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
print(fake_text)


Real text >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Call me Ishmael.  Some years ago--never mind how long
precisely--having little or no money in my purse, and nothing
particular to interest me on shore, I thought I would sail about a
little and see the watery part of the world.  It is a way I have of
driving off the spleen and regulating the circulation.  Whenever I
find myself growing grim about the mouth; whenever it is a damp,
drizzly November in my soul; whenever I find myself involuntarily
pausing before coffin warehouses, and bringing up the rear of every
funeral I meet; and especially whenever my hypos get such an upper
hand of me, that it requires a strong moral principle to prevent me
from deliberately stepping into the street, and methodically knocking
people's hats off--then, I account it high time to get to sea as soon
as I can.  This is my substitute for pistol and ball.  With a
philosophical flourish Cato throws himself upon his sword; I quietly
tak

## Sample Usage: Shakespeare

In [3]:
# retrieve corpus from NLTK
filename = "shakespeare-hamlet.txt"
text = gutenberg.raw(filename)
print("\nReal text >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
print(text[:999])

# create and fit model
n = 15
model = Chlengmo(n=n).fit(text)

# generate fake text
length = 999
seed = 42
fake_text = model.generate(length=length, seed=seed)
print("\nFake text >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
print(fake_text)


Real text >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
[The Tragedie of Hamlet by William Shakespeare 1599]


Actus Primus. Scoena Prima.

Enter Barnardo and Francisco two Centinels.

  Barnardo. Who's there?
  Fran. Nay answer me: Stand & vnfold
your selfe

   Bar. Long liue the King

   Fran. Barnardo?
  Bar. He

   Fran. You come most carefully vpon your houre

   Bar. 'Tis now strook twelue, get thee to bed Francisco

   Fran. For this releefe much thankes: 'Tis bitter cold,
And I am sicke at heart

   Barn. Haue you had quiet Guard?
  Fran. Not a Mouse stirring

   Barn. Well, goodnight. If you do meet Horatio and
Marcellus, the Riuals of my Watch, bid them make hast.
Enter Horatio and Marcellus.

  Fran. I thinke I heare them. Stand: who's there?
  Hor. Friends to this ground

   Mar. And Leige-men to the Dane

   Fran. Giue you good night

   Mar. O farwel honest Soldier, who hath relieu'd you?
  Fra. Barnardo ha's my place: giue you goodnight.

Exit Fran.

  

## Sample Usage: King James Bible

In [4]:
# retrieve corpus from NLTK
filename = "bible-kjv.txt"
text = gutenberg.raw(filename)
start = "1:1 In the beginning "
start_idx = text.index(start)
text = text[start_idx:]
print("\nReal text >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
print(text[:999])

# create and fit model
n = 15
model = Chlengmo(n=n).fit(text)

# generate fake text
length = 999
prompt = "1:1 In the beginning "
seed = 42
fake_text = model.generate(length=length, prompt=prompt, seed=seed)
print("\nFake text >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
print(fake_text)


Real text >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
1:1 In the beginning God created the heaven and the earth.

1:2 And the earth was without form, and void; and darkness was upon
the face of the deep. And the Spirit of God moved upon the face of the
waters.

1:3 And God said, Let there be light: and there was light.

1:4 And God saw the light, that it was good: and God divided the light
from the darkness.

1:5 And God called the light Day, and the darkness he called Night.
And the evening and the morning were the first day.

1:6 And God said, Let there be a firmament in the midst of the waters,
and let it divide the waters from the waters.

1:7 And God made the firmament, and divided the waters which were
under the firmament from the waters which were above the firmament:
and it was so.

1:8 And God called the firmament Heaven. And the evening and the
morning were the second day.

1:9 And God said, Let the waters under the heaven be gathered together
unto one pla

## Sample Usage: Trump Tweets

In [5]:
# download data
filename = "https://raw.githubusercontent.com/ecdedios/into-heart-of-darkness/master/trump_20200530.csv"
df = pd.read_csv(filename)
text = "\n\n".join(df["text"].values)
print("\nReal tweets >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
print(text[:999])

# create and fit model
n = 15
model = Chlengmo(n=n).fit(text)

# generate fake tweets
length = 999
prompt = "Hillary Clinton is "
seed = 42
fake_text = model.generate(length=length, prompt=prompt, seed=seed)
print("\nFake tweets >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
print(fake_text)


Real tweets >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
RT @ScottAdamsSays: Malaria drug and zinc the missing link https://t.co/FC9CpuH0Yr via @theconwom

RT @YoungDems4Trump: In Democrat cities you can get arrested for opening a business but not for looting one. #MinneapolisRiot #Coronavirus

RT @YoungDems4Trump: So sad. This poor business owner lost everything due to the  #MinneapolisRiothttps://t.co/mK0nHFNthS

Time for a change! #2020 https://t.co/AECy2GBfys

RT @TallahForTrump: Trump spoke at my church in Detroit and it opened my eyes. Never again will I be a slave to the Democrats!👊 Let us rise…

RT @TheRightMelissa: In an ironic twist of fate CNN HQ is being attacked by the very riots they promoted as noble &amp; just. Oops

RT @Jim_Jordan: Right on! We don’t have to pay organizations to lie to us. They’ll probably do it for free.

RT @Scavino45: “Texas AG Ken Paxton: Trump is right and Twitter ‘fact check’ is wrong – mail-in ballot fraud is a real problem” h

## Sample Usage: Prime Numbers

In [6]:
# sieve of erotasthenes
maxprime = 1000000
isprime = [True] * maxprime
isprime[0] = False
isprime[1] = False
for factor in range(2, 11):
    multiple = 2 * factor
    while multiple < maxprime:
        isprime[multiple] = False
        multiple += factor
primes = [number for number, flag in enumerate(isprime) if flag]
text = " ".join([str(prime) for prime in primes])
print("\nReal primes >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
print(text[:999])

# create and fit model
n = 4
model = Chlengmo(n=n).fit(text)

# generate fake primes
length = 999
prompt = "2 3 5 7 11 13 "
seed = 42
fake_text = model.generate(length=length, prompt=prompt, seed=seed)
print("\nFake primes >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
print(fake_text)


Real primes >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
2 3 5 7 11 13 17 19 23 29 31 37 41 43 47 53 59 61 67 71 73 79 83 89 97 101 103 107 109 113 121 127 131 137 139 143 149 151 157 163 167 169 173 179 181 187 191 193 197 199 209 211 221 223 227 229 233 239 241 247 251 253 257 263 269 271 277 281 283 289 293 299 307 311 313 317 319 323 331 337 341 347 349 353 359 361 367 373 377 379 383 389 391 397 401 403 407 409 419 421 431 433 437 439 443 449 451 457 461 463 467 473 479 481 487 491 493 499 503 509 517 521 523 527 529 533 541 547 551 557 559 563 569 571 577 583 587 589 593 599 601 607 611 613 617 619 629 631 641 643 647 649 653 659 661 667 671 673 677 683 689 691 697 701 703 709 713 719 727 731 733 737 739 743 751 757 761 767 769 773 779 781 787 793 797 799 803 809 811 817 821 823 827 829 839 841 851 853 857 859 863 869 871 877 881 883 887 893 899 901 907 911 913 919 923 929 937 941 943 947 949 953 961 967 971 977 979 983 989 991 997 1003 1007 1009 1013 1019 1021

## Sample Usage: News Stories

In [7]:
# retrieve corpus from NLTK
words = brown.words(categories="news")
text = " ".join(words)
print("\nReal news >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
print(text[:999])

# create and fit model
n = 15
model = Chlengmo(n=n).fit(text)

# generate fake text
length = 999
prompt = "The Fulton County Grand Jury said Friday "
seed = 42
fake_text = model.generate(length=length, prompt=prompt, seed=seed)
print("\nFake news >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
print(fake_text)


Real news >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place . The jury further said in term-end presentments that the City Executive Committee , which had over-all charge of the election , `` deserves the praise and thanks of the City of Atlanta '' for the manner in which the election was conducted . The September-October term jury had been charged by Fulton Superior Court Judge Durwood Pye to investigate reports of possible `` irregularities '' in the hard-fought primary which was won by Mayor-nominate Ivan Allen Jr. . `` Only a relative handful of such reports was received '' , the jury said , `` considering the widespread interest in the election , the number of voters and the size of this city '' . The jury said it did find that many of Georgia's registration and election laws `` are outmoded or inadequate and oft

## Speed Test!

In [8]:
# retrieve corpus from NLTK
filename = "melville-moby_dick.txt"
text = gutenberg.raw(filename)
start = "Call me Ishmael"
start_idx = text.index(start)
text = text[start_idx:]

# fit chlengmo model
n = 3
chlengmo = Chlengmo(n=n)
time_start = time.process_time()
chlengmo.fit(text)
time_end = time.process_time()
time_elapsed_chlengmo = time_end - time_start
print(f"Fit Chlengmo model in {time_elapsed_chlengmo:.2f}s")

# fit nltk model
# REF: https://www.nltk.org/api/nltk.lm.html
train, vocab = padded_everygram_pipeline(n, [list(text)])
mle = MLE(n)
time_start = time.process_time()
mle.fit(train, vocab)
time_end = time.process_time()
time_elapsed_nltk = time_end - time_start
print(f"Fit NLTK model in {time_elapsed_nltk:.2f}s")

# chlengmo is >10x faster!
time_factor = time_elapsed_nltk / time_elapsed_chlengmo
print(f"Fit Chlengmo model {time_factor:.0f}x faster than NLTK model.")

# generate fake text from chlengmo model
length = 9999
prompt = "Call me "
time_start = time.process_time()
fake_text_chlengmo = chlengmo.generate(length=length, prompt=prompt)
time_end = time.process_time()
time_elapsed_chlengmo = time_end - time_start
print(f"Generated text from Chlengmo model in {time_elapsed_chlengmo:.2f}s")

# generate fake text from nltk model
time_start = time.process_time()
fake_text_nltk = mle.generate(num_words=length, text_seed=list(prompt))
time_end = time.process_time()
time_elapsed_nltk = time_end - time_start
fake_text_nltk = "".join(fake_text_nltk)
print(f"Generated text from NLTK model in {time_elapsed_nltk:.2f}s")

# chlengmo is >10x faster!
time_factor = time_elapsed_nltk / time_elapsed_chlengmo
print(f"Generated text from Chlengmo model {time_factor:.0f}x faster than NLTK model.")

Fit Chlengmo model in 0.29s
Fit NLTK model in 8.23s
Fit Chlengmo model 28x faster than NLTK model.
Generated text from Chlengmo model in 0.03s
Generated text from NLTK model in 1.00s
Generated text from Chlengmo model 37x faster than NLTK model.
