# Lesson Notebook
## NLP: A Closer Look at Markov Chains and Entropy 

Written by Shannon Binghan

### Notebook Description
This notebook contains code to build a first-order (order 1) Markov prediction machine: each character is predicted from the single character that precedes it.

## Set up environment.

In [None]:
# Import packages and modules.
import numpy  as np
import pandas as pd
import string

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by pandas/numpy —
# confirm that suppressing everything is intended.
import warnings
warnings.filterwarnings('ignore')

# Increase number of columns that can be viewed in notebook.
pd.set_option('display.max_columns', 500)

# Seed NumPy's global random generator so the random draws made later through
# np.random are repeatable from one run to the next.
np.random.seed(42)

## Load data.

In [None]:
# Load text data.
# Line breaks are replaced with a single space rather than removed, so the last
# word of one line and the first word of the next stay separate. Deleting the
# newline would fuse them and produce character pairs that never occur in the
# text, which would distort the bi-gram counts computed later.
with open('./data/greeneggs.txt', 'r') as myfile:
    data = myfile.read().replace('\n', ' ')

# Take a look.
data

## Clean data.

In [None]:
# TODO: Replace ending punctuation (this cell was left unfinished).

In [None]:
# Symbol set used throughout: a space followed by the 26 lower-case ASCII letters.
alphabet = [" "] + list(string.ascii_lowercase)

# Take a look.
alphabet[:5]

In [None]:
# Lower-case every character and keep only those found in the alphabet
# (letters and spaces); every other character is dropped.
stripped_data = ''.join(
    lowered for lowered in map(str.lower, data) if lowered in alphabet
)

# Take a look.
stripped_data

In [None]:
# Number of characters kept after cleaning (length of the stripped text).
len(stripped_data)

## Calculate the bi-gram probabilities.

In [None]:
# Square table of bi-gram counts, all starting at zero. Rows are the first
# character of a pair and columns the second, so entry (row, col) is the one
# incremented for each character pair (row, col).
n_symbols = len(alphabet)
df = pd.DataFrame(
    np.zeros((n_symbols, n_symbols), dtype='int64'),
    index=alphabet,
    columns=alphabet,
)

# Take a look.
df.head()

In [None]:
# Update the bi-gram counts.

# Walk over every pair of adjacent characters and bump the matching cell
# (row = first character, column = the character that follows it).
for first_char, second_char in zip(stripped_data, stripped_data[1:]):
    df.loc[first_char, second_char] += 1

# Take a look.
df.head()

In [None]:
# Convert counts to probabilities, where P(i, j) = count(i, j) / total(i)
#
# The whole table is normalised in one vectorised step and `df` is rebound to
# the resulting float table. Writing fractions cell by cell into the integer
# count table relies on pandas silently upcasting the columns, which is
# deprecated (pandas >= 2.1) and easy to miss while warnings are suppressed.

# Total number of bi-grams that start with each row character.
row_totals = df.sum(axis=1)

# Rows with no observations keep probability 0 everywhere: their zero counts
# are divided by 1 instead of 0, which avoids NaN entries.
df = df.div(row_totals.where(row_totals > 0, 1), axis=0)

# Take a look.
df.head()

## Build words.

In [None]:
# Show the rows that sum to zero: these belong to characters that never occur
# as the first character of a bi-gram in the data.
row_mass = df.sum(axis=1)
df.loc[row_mass == 0]

In [None]:
# Save non-appearing character(s) in a list. Their transition row is all
# zeros: no bi-gram in the data starts with them, so there is no distribution
# to draw a following character from.
missing = list(df[df.sum(axis=1) == 0].index)

# Characters the chain can move on from.
usable = [ch for ch in alphabet if ch not in missing]
if not usable:
    # Without this guard the seed search below would never terminate.
    raise ValueError('No bi-grams were counted, so there is nothing to generate from.')

# Select first character randomly: draw uniformly from the alphabet and
# redraw until the character has outgoing transitions.
current = str(np.random.choice(alphabet, 1)[0])
while current in missing:
    current = str(np.random.choice(alphabet, 1)[0])

# Set length of message.
msg_len = 6000

# Build message. Characters are collected in a list and joined once at the end.
chars = []
for _ in range(msg_len):

    chars.append(current)

    if current in missing:
        # Dead end: the character can be reached but never left (it only
        # occurs as the final character of the data), so its row is all zeros
        # and np.random.choice would reject it as a probability vector.
        # Restart from a random usable character instead of crashing.
        current = str(np.random.choice(usable, 1)[0])
    else:
        # Draw the next character from the current character's transition row.
        current = str(np.random.choice(alphabet, 1, p=df.loc[current].tolist())[0])

msg = ''.join(chars)

# Print message.
print(msg)