# Lesson Notebook
## NLP: A Closer Look at Markov Chains and Entropy 

Written by Shannon Binghan

### Notebook Description
This notebook contains code to build a Markov prediction machine with order 1.  It was used as a demonstration during the lesson.  

## Set up environment.

In [1]:
# Import packages and modules.
import numpy  as np
import pandas as pd
import string

import warnings
warnings.filterwarnings('ignore')

# Increase number of columns that can be viewed in notebook.
pd.set_option('display.max_columns', 500)

np.random.seed(42)

## Load data.

In [2]:
# Load text data.
with open('./data/greeneggs.txt', 'r') as myfile:
    data=myfile.read().replace('\n', '')
    
# Take a look.
data

'I AM SAM. I AM SAM. SAM I AM. THAT SAM-I-AM! THAT SAM-I-AM! I DO NOT LIKE THAT SAM-I-AM! DO WOULD YOU LIKE GREEN EGGS AND HAM? I DO NOT LIKE THEM, SAM-I-AM. I DO NOT LIKE GREEN EGGS AND HAM. WOULD YOU LIKE THEM HERE OR THERE? I WOULD NOT LIKE THEM HERE OR THERE. I WOULD NOT LIKE THEM ANYWHERE. I DO NOT LIKE GREEN EGGS AND HAM. I DO NOT LIKE THEM, SAM-I-AM. WOULD YOU LIKE THEM IN A HOUSE?WOULD YOU LIKE THEN WITH A MOUSE? I DO NOT LIKE THEM IN A HOUSE. I DO NOT LIKE THEM WITH A MOUSE. I DO NOT LIKE THEM HERE OR THERE. I DO NOT LIKE THEM ANYWHERE. I DO NOT LIKE GREEN EGGS AND HAM. I DO NOT LIKE THEM, SAM-I-AM. WOULD YOU EAT THEM IN A BOX? WOULD YOU EAT THEM WITH A FOX? NOT IN A BOX. NOT WITH A FOX. NOT IN A HOUSE. NOT WITH A MOUSE. I WOULD NOT EAT THEM HERE OR THERE. I WOULD NOT EAT THEM ANYWHERE. I WOULD NOT EAT GREEN EGGS AND HAM. I DO NOT LIKE THEM, SAM-I-AM. WOULD YOU? COULD YOU? IN A CAR? EAT THEM! EAT THEM! HERE THEY ARE. I WOULD NOT, COULD NOT, IN A CAR. YOU MAY LIKE THEM. YOU WIL

## Clean data.

In [3]:
# Set alphabet to include space and all lower case letters.
alphabet = list(" " + string.ascii_lowercase)

# Take a look.
alphabet[0:5]

[' ', 'a', 'b', 'c', 'd']

In [4]:
# Strip out all characters that are not letters or spaces.
stripped_data = ''

# Loop through the data.
for c in data:
    if c.lower() in alphabet:
        stripped_data += c.lower()
        
# Take a look.
stripped_data

'i am sam i am sam sam i am that samiam that samiam i do not like that samiam do would you like green eggs and ham i do not like them samiam i do not like green eggs and ham would you like them here or there i would not like them here or there i would not like them anywhere i do not like green eggs and ham i do not like them samiam would you like them in a housewould you like then with a mouse i do not like them in a house i do not like them with a mouse i do not like them here or there i do not like them anywhere i do not like green eggs and ham i do not like them samiam would you eat them in a box would you eat them with a fox not in a box not with a fox not in a house not with a mouse i would not eat them here or there i would not eat them anywhere i would not eat green eggs and ham i do not like them samiam would you could you in a car eat them eat them here they are i would not could not in a car you may like them you will see you may like them in a tree i would not could not in a

In [5]:
len(stripped_data)

3177

## Calculate the bi-gram probabilities.

In [6]:
# Create a dataframe to contain bi-grams counts (where (row, col) is incremented for each character pair (i, j)) .
df = pd.DataFrame(0, index=alphabet, columns=alphabet)

# Take a look.
df.head()

Unnamed: 0,Unnamed: 1,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
a,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
b,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
c,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
d,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [7]:
# Update the bi-gram counts.

# Loop through the data.
for i in range(0,len(stripped_data) - 1):
    df.loc[stripped_data[i], stripped_data[i+1]] += 1
                     
# Take a look.
df.head()

Unnamed: 0,Unnamed: 1,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z
,0,96,14,21,44,34,7,17,31,111,0,0,48,15,82,14,0,0,4,33,106,0,0,62,0,34,0
a,56,0,0,0,0,0,0,0,0,12,0,0,0,46,36,0,0,0,16,0,33,0,0,0,0,9,0
b,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,0
c,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0
d,69,7,0,0,0,0,0,0,0,0,0,0,0,0,0,37,0,0,0,0,0,0,0,0,0,0,0


In [8]:
# Convert counts to probabilities, where P(i, j) = count(i, j) / total(i)

# Loop through the df rows (i).
for i in alphabet:
    counts = df.loc[i].tolist()
    total = sum(counts)
    
    # Loop through the columns by index.
    for n in range(0, len(alphabet)):
        
        # Get column name.
        j = alphabet[n]

        # Update the entry.
        if total > 0:
            df.loc[i, j] = (counts[n] / total)
        else:
            df.loc[i, j] = 0
            
# Take a look.
df.head()

Unnamed: 0,Unnamed: 1,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z
,0.0,0.124191,0.018111,0.027167,0.056921,0.043984,0.009056,0.021992,0.040103,0.143596,0.0,0.0,0.062096,0.019405,0.10608,0.018111,0.0,0.0,0.005175,0.042691,0.137128,0.0,0.0,0.080207,0.0,0.043984,0.0
a,0.269231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057692,0.0,0.0,0.0,0.221154,0.173077,0.0,0.0,0.0,0.076923,0.0,0.158654,0.0,0.0,0.0,0.0,0.043269,0.0
b,0.0,0.0,0.0,0.0,0.0,0.285714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.714286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
c,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
d,0.610619,0.061947,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.327434,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Build words.

In [9]:
# Identify characters that did not appear in the data.
df[df.sum(axis=1) == 0]

Unnamed: 0,Unnamed: 1,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z
j,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
p,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
q,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
v,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
z,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Save non-appearing character(s) in a list.
missing = list(df[df.sum(axis=1) == 0].index)

# Select first character randomly.
seed = False
while seed == False:
    i = str.join('', (np.random.choice(alphabet, 1)))
    
    if i not in missing:
        seed = True

# Set length of message.
msg_len = 6000

# Initialize message.
msg = ''

# Build message.
for _ in range(0, msg_len):
    
    msg += i
    i = str.join('', (np.random.choice(alphabet, 1, p=df.loc[i].tolist())))

# Print message. 
print(msg)

foarea ia n so you am ay andothea wie e ould dot n t itheeerem nyot cam gggregserea ilikee you e s i wike nonoulli am in ami n i cothe cand e se yox am ot m am t yould re dou at lili li dot ywin fousathem ndox heay am inothem nyouldanoothe yothereeet ha ld tr dam gou saikegggra i whean cousa he t you in bou ike d in wi a wothem m am ait are il wor wikegsoulili them d d an ndot liat itheeem egsa ou ike gggse yoth anonou wox sanyousea in ld there sam e i ikemem e i ldox h wore caithemit i hand e ea gou beregs wi indothem a be remikem e e dou e lem tr the therethee sami lindou an d ikegou sathou t i m in he ink i t nyou the canou t i s me ld iand sanot wi ldo d li t t anywit youliken yodothe lld ikee o be ili thery ith theme emi nothem ike l eeam r winox se doth ld nd ldo noulit eaill a sa wo not a th e dot egou th inot wi wik ain d lld thai ike ildot sam li h t winyouli d n nothetherk i anou wouse t at t t yo mot ikem dam woulind ike be sand t t it ar t wousat i ldous nd ikegr a y ainyou