In [1]:
import os
import re
import numpy as np
from IPython.display import HTML
from jupyterthemes import jtplot
jtplot.style()

In [131]:
def parse_observations(text):
    """Encode raw text as lists of integer token ids.

    The text is split on '\n' and each line is classified by how many
    whitespace-separated fields it has:

    * 0 fields -- blank line, ignored.
    * 1 field  -- boundary line: the sequence collected so far (if non-empty)
      is closed and a new one begins. The line itself is not tokenized.
    * 2+ fields -- content line: apostrophes are rewritten to '^', the line is
      tokenized into runs of word characters/'^' or single punctuation marks
      from ``. , ! ? ; -``, and each token is lower-cased (so "aren't" is
      stored as "aren^t").

    Args:
        text: The raw text to encode.

    Returns:
        A tuple ``(obs, obs_map)``: ``obs`` is a list of sequences, each a list
        of integer ids; ``obs_map`` maps every distinct token to its id, with
        ids assigned from 0 in order of first appearance.
    """
    tokenizer = re.compile(r"[\w'^]+|[.,!?;-]")

    sequences = []
    obs_map = {}
    current = []

    for raw_line in text.split('\n'):
        field_count = len(raw_line.split())

        if field_count == 0:
            # Blank or whitespace-only line: nothing to encode.
            continue

        if field_count == 1:
            # Boundary line: close the running sequence (if any) and start anew.
            if current:
                sequences.append(current)
            current = []
            continue

        for token in tokenizer.findall(re.sub("'", '^', raw_line)):
            # A token seen for the first time gets the current vocabulary size
            # as its id; a known token just returns the id it already has.
            current.append(obs_map.setdefault(token.lower(), len(obs_map)))

    # The final sequence has no boundary line after it, so flush it here.
    if current:
        sequences.append(current)

    return sequences, obs_map

In [127]:
# Small hand-made sample: two numbered "poems" covering contractions, a
# leading apostrophe, double hyphens, a blank line and repeated punctuation.
text = (
    "   1\n"
    " Sharon is talking to Sabera.\n"
    " George is not here yet and we aren't going to start. 'Tis a fun day--hey!\n"
    "\n"
    " Hello, world!\n"
    "      2   \n"
    "This is a happy-dance. I love you!!!"
)

In [128]:
# Encode the sample text: obs receives the integer sequences and obs_map the
# token -> id table, which is then displayed.
# NOTE(review): the saved output echoes the two separator lines ("1" and "2"),
# but the parse_observations defined in this notebook contains no print call --
# presumably the output is left over from an earlier definition; re-run to refresh.
obs, obs_map = parse_observations(text)
obs_map

   1
      2   


{'sharon': 0,
 'is': 1,
 'talking': 2,
 'to': 3,
 'sabera': 4,
 '.': 5,
 'george': 6,
 'not': 7,
 'here': 8,
 'yet': 9,
 'and': 10,
 'we': 11,
 'aren^t': 12,
 'going': 13,
 'start': 14,
 '^tis': 15,
 'a': 16,
 'fun': 17,
 'day': 18,
 '-': 19,
 'hey': 20,
 '!': 21,
 'hello': 22,
 ',': 23,
 'world': 24,
 'this': 25,
 'happy': 26,
 'dance': 27,
 'i': 28,
 'love': 29,
 'you': 30}

In [125]:
# Display the encoded sequences (one inner list per sequence of token ids).
obs

[[0,
  1,
  2,
  3,
  4,
  5,
  6,
  1,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  3,
  14,
  5,
  15,
  16,
  17,
  18,
  19,
  19,
  20,
  21,
  22,
  23,
  24,
  21],
 [25, 1, 16, 26, 19, 27, 5, 28, 29, 30, 21, 21, 21]]

In [None]:
# Scratch experiment (cell has no execution count): split on runs of whitespace
# and/or hyphens. Other punctuation is not a delimiter here, so it stays
# attached to the neighbouring word.
line = re.split(r'[\s-]+', str(text))

In [None]:
# Inspect the result of the split in the previous cell.
line

In [None]:
# Scratch experiment (cell has no execution count): this pattern matches runs of
# whitespace/apostrophes or a single punctuation mark, so no word characters
# are ever returned.
# NOTE(review): `\s` looks like it may have been meant as `\w` -- confirm.
re.findall(r"[\s']+|[.,!?;-]", text)

In [None]:
# Scratch experiment (cell has no execution count): the token pattern is used as
# the *delimiter* here, so re.split returns the text between tokens (whitespace
# and empty strings) rather than the tokens themselves.
line = re.split(r"[\w']+|[.,!?;-]", text)

In [None]:
# Inspect the result of the split in the previous cell.
line

In [38]:
# Experiment: a plain split on '.' drops the periods themselves and leaves
# newlines and every other punctuation mark embedded in the pieces.
text = (
    "Sharon is talking to Sabera.\n"
    " George is not here yet and we aren't going to start."
    " 'Tis a fun day--hey!\n"
    "\n"
    " Hello, world! This is a happy-dance. I love you!!!"
)
line = text.split('.')
line

['Sharon is talking to Sabera',
 "\n George is not here yet and we aren't going to start",
 " 'Tis a fun day--hey!\n\n Hello, world! This is a happy-dance",
 ' I love you!!!']

In [75]:
# Experiment: rewrite apostrophes to '^', then pull out tokens -- either a run
# of word characters/'^' or one punctuation mark -- keeping the original case.
text = (
    "Sharon is talking to Sabera.\n"
    " George is not here yet and we aren't going to start."
    " 'Tis a fun day--hey!\n"
    "\n"
    " Hello, world! This is a happy-dance. I love you!!!"
)
line = re.findall(r"[\w^]+|[.,!?;-]", re.sub("'", '^', text))
line

['Sharon',
 'is',
 'talking',
 'to',
 'Sabera',
 '.',
 'George',
 'is',
 'not',
 'here',
 'yet',
 'and',
 'we',
 'aren^t',
 'going',
 'to',
 'start',
 '.',
 '^Tis',
 'a',
 'fun',
 'day',
 '-',
 '-',
 'hey',
 '!',
 'Hello',
 ',',
 'world',
 '!',
 'This',
 'is',
 'a',
 'happy',
 '-',
 'dance',
 '.',
 'I',
 'love',
 'you',
 '!',
 '!',
 '!']

In [132]:
# Load the full corpus from data/shakespeare.txt under the current working
# directory. The context manager closes the file handle as soon as the read
# finishes instead of leaving it open until garbage collection.
with open(os.path.join(os.getcwd(), 'data/shakespeare.txt')) as corpus_file:
    text = corpus_file.read()

In [133]:
# Encode the loaded corpus: obs is the list of integer-coded sequences and
# obs_map the token -> id vocabulary built in first-seen order.
obs, obs_map = parse_observations(text)

In [134]:
# Display the vocabulary built from the corpus.
# NOTE(review): this renders the entire dict; consider previewing a slice
# (and len(obs_map)) to keep the saved notebook output small.
obs_map

{'from': 0,
 'fairest': 1,
 'creatures': 2,
 'we': 3,
 'desire': 4,
 'increase': 5,
 ',': 6,
 'that': 7,
 'thereby': 8,
 'beauty^s': 9,
 'rose': 10,
 'might': 11,
 'never': 12,
 'die': 13,
 'but': 14,
 'as': 15,
 'the': 16,
 'riper': 17,
 'should': 18,
 'by': 19,
 'time': 20,
 'decease': 21,
 'his': 22,
 'tender': 23,
 'heir': 24,
 'bear': 25,
 'memory': 26,
 'thou': 27,
 'contracted': 28,
 'to': 29,
 'thine': 30,
 'own': 31,
 'bright': 32,
 'eyes': 33,
 'feed^st': 34,
 'thy': 35,
 'light^s': 36,
 'flame': 37,
 'with': 38,
 'self': 39,
 '-': 40,
 'substantial': 41,
 'fuel': 42,
 'making': 43,
 'a': 44,
 'famine': 45,
 'where': 46,
 'abundance': 47,
 'lies': 48,
 'foe': 49,
 'sweet': 50,
 'too': 51,
 'cruel': 52,
 'art': 53,
 'now': 54,
 'world^s': 55,
 'fresh': 56,
 'ornament': 57,
 'and': 58,
 'only': 59,
 'herald': 60,
 'gaudy': 61,
 'spring': 62,
 'within': 63,
 'bud': 64,
 'buriest': 65,
 'content': 66,
 'churl': 67,
 'mak^st': 68,
 'waste': 69,
 'in': 70,
 'niggarding': 71,
 'pity

In [137]:
# Id of the last token in the first encoded sequence.
obs[0][-1]

83