In [74]:
import os
import re
import numpy as np
from IPython.display import HTML
from jupyterthemes import jtplot
jtplot.style()
import urllib3

In [72]:
def parse_observations_sonnets(text, obs=None, obs_map=None):
    """Encode header-delimited sonnets as sequences of integer token ids.

    Blank lines are ignored. A line made of exactly one whitespace-delimited
    token (for example a sonnet number) is treated as a header: it closes the
    sequence being built and is not encoded itself. Every other line is split
    into word tokens and the punctuation marks ``. , ! ? ;``. Apostrophes are
    rewritten to ``^`` first, so contractions and hyphenated words each stay
    a single token. Tokens are lower-cased before lookup.

    Args:
        text: Raw corpus text with newline-separated lines.
        obs: Optional list of already-encoded sequences. New sequences are
            appended to it in place. A fresh list is created when omitted.
        obs_map: Optional token -> id dict. Unseen tokens are added in place,
            numbered from ``len(obs_map)`` upward. A fresh dict is created
            when omitted.

    Returns:
        The ``(obs, obs_map)`` pair after encoding ``text``.
    """
    # Create the containers per call: mutable default arguments are evaluated
    # once and shared, which made repeated calls accumulate earlier results.
    if obs is None:
        obs = []
    if obs_map is None:
        obs_map = {}

    lines = [line for line in text.split('\n') if line.split()]

    obs_counter = len(obs_map)
    obs_elem = []

    for line in lines:
        if len(line.split()) == 1:
            # Header line: flush the sequence collected so far and start anew.
            if obs_elem:
                obs.append(obs_elem)
            obs_elem = []
            continue

        # Apostrophes become '^' so they survive as part of a word token.
        tokens = re.findall(r"[\w'^-]+|[.,!?;]", re.sub("'", '^', line))

        for word in tokens:
            word = word.lower()
            if word not in obs_map:
                # Add unique words to the observations map.
                obs_map[word] = obs_counter
                obs_counter += 1

            # Add the encoded word.
            obs_elem.append(obs_map[word])

    # Add the final encoded sequence (no header follows it).
    if obs_elem:
        obs.append(obs_elem)

    return obs, obs_map

def parse_observations_stanza(text, obs=None, obs_map=None, lines_per_stanza=14):
    """Encode a poem as integer sequences of fixed-size groups of lines.

    Blank lines are ignored. The remaining lines are grouped in order, with
    ``lines_per_stanza`` lines per sequence (the last group may be shorter).
    Each line is split into word tokens and the punctuation marks
    ``. , ! ? ; -``. Unlike the sonnet parser, a hyphen is its own token here.
    Apostrophes are rewritten to ``^`` first and tokens are lower-cased.

    Args:
        text: Raw corpus text with newline-separated lines.
        obs: Optional list of already-encoded sequences. New sequences are
            appended to it in place. A fresh list is created when omitted.
        obs_map: Optional token -> id dict. Unseen tokens are added in place,
            numbered from ``len(obs_map)`` upward. A fresh dict is created
            when omitted.
        lines_per_stanza: Number of non-blank lines per emitted sequence.
            Defaults to 14, the value that was previously hard-coded.

    Returns:
        The ``(obs, obs_map)`` pair after encoding ``text``.

    Raises:
        ValueError: If ``lines_per_stanza`` is smaller than 1.
    """
    if lines_per_stanza < 1:
        raise ValueError("lines_per_stanza must be a positive integer")

    # Create the containers per call: mutable default arguments are evaluated
    # once and shared, which made repeated calls accumulate earlier results.
    if obs is None:
        obs = []
    if obs_map is None:
        obs_map = {}

    lines = [line for line in text.split('\n') if line.split()]

    obs_counter = len(obs_map)
    obs_elem = []

    for ln, line in enumerate(lines):
        if ln % lines_per_stanza == 0:
            # Start of a new group: flush the previous one (nothing at ln == 0).
            if obs_elem:
                obs.append(obs_elem)
            obs_elem = []

        # Apostrophes become '^' so they survive as part of a word token.
        tokens = re.findall(r"[\w'^]+|[.,!?;-]", re.sub("'", '^', line))

        for word in tokens:
            word = word.lower()
            if word not in obs_map:
                # Add unique words to the observations map.
                obs_map[word] = obs_counter
                obs_counter += 1

            # Add the encoded word.
            obs_elem.append(obs_map[word])

    # Add the final (possibly short) encoded sequence.
    if obs_elem:
        obs.append(obs_elem)

    return obs, obs_map

def parse_observations_poem(text, obs=None, obs_map=None):
    """Encode an entire text as a single sequence of integer token ids.

    Blank lines are ignored and all remaining lines are concatenated into one
    sequence. Each line is split into word tokens and the punctuation marks
    ``. , ! ? ; -`` (a hyphen is its own token). Apostrophes are rewritten to
    ``^`` first and tokens are lower-cased.

    Args:
        text: Raw corpus text with newline-separated lines.
        obs: Optional list of already-encoded sequences. The new sequence is
            appended to it in place. A fresh list is created when omitted.
        obs_map: Optional token -> id dict. Unseen tokens are added in place,
            numbered from ``len(obs_map)`` upward. A fresh dict is created
            when omitted.

    Returns:
        The ``(obs, obs_map)`` pair. Nothing is appended to ``obs`` when
        ``text`` contains no tokens.
    """
    # Create the containers per call: mutable default arguments are evaluated
    # once and shared, which made repeated calls accumulate earlier results.
    if obs is None:
        obs = []
    if obs_map is None:
        obs_map = {}

    lines = [line for line in text.split('\n') if line.split()]

    obs_counter = len(obs_map)
    obs_elem = []

    for line in lines:
        # Apostrophes become '^' so they survive as part of a word token.
        tokens = re.findall(r"[\w'^]+|[.,!?;-]", re.sub("'", '^', line))

        for word in tokens:
            word = word.lower()
            if word not in obs_map:
                # Add unique words to the observations map.
                obs_map[word] = obs_counter
                obs_counter += 1

            # Add the encoded word.
            obs_elem.append(obs_map[word])

    # Add the encoded sequence.
    if obs_elem:
        obs.append(obs_elem)

    return obs, obs_map

def parse_observations_lyrics(text, obs=None, obs_map=None):
    """Encode song lyrics as integer sequences, one per blank-line-separated block.

    A dot that follows a single capital letter not preceded by a word
    character is removed first, so acronyms such as ``U.S.A.`` become one
    word. The text is then split into lines on CRLF, LF or CR. An empty or
    whitespace-only line closes the sequence being built. Every other line is
    split into word tokens (hyphens stay inside a word) and the punctuation
    marks ``. , ! ? ;``. Apostrophes are rewritten to ``^`` first and tokens
    are lower-cased. Brackets are not tokens, so only the words inside an
    annotation like ``[Kanye]`` are kept.

    Args:
        text: Raw lyrics text.
        obs: Optional list of already-encoded sequences. New sequences are
            appended to it in place. A fresh list is created when omitted.
        obs_map: Optional token -> id dict. Unseen tokens are added in place,
            numbered from ``len(obs_map)`` upward. A fresh dict is created
            when omitted.

    Returns:
        The ``(obs, obs_map)`` pair after encoding ``text``.
    """
    # Create the containers per call: mutable default arguments are evaluated
    # once and shared, which made repeated calls accumulate earlier results.
    if obs is None:
        obs = []
    if obs_map is None:
        obs_map = {}

    text = re.sub(r'(?<!\w)([A-Z])\.', r'\1', text) # remove dots in acronyms
    # Accept any newline convention. Splitting on CRLF alone turned LF-only
    # input into one giant line, so blank-line separators were never seen.
    lines = re.split(r'\r\n|\r|\n', text)

    obs_counter = len(obs_map)
    obs_elem = []

    for line in lines:
        if not line.split():
            # Blank line: flush the sequence collected so far and start anew.
            if obs_elem:
                obs.append(obs_elem)
            obs_elem = []
            continue

        # Apostrophes become '^' so they survive as part of a word token.
        tokens = re.findall(r"[\w'^-]+|[.,!?;]", re.sub("'", '^', line))

        for word in tokens:
            word = word.lower()
            if word not in obs_map:
                # Add unique words to the observations map.
                obs_map[word] = obs_counter
                obs_counter += 1

            # Add the encoded word.
            obs_elem.append(obs_map[word])

    # Add the final encoded sequence (no blank line follows it).
    if obs_elem:
        obs.append(obs_elem)

    return obs, obs_map

def parse_observations(sonnets, sonnets2="", poem3="", poem4="", lyrics=""):
    """Encode up to five corpora into one shared observation list and vocabulary.

    The texts are processed in argument order, each by its matching parser:
    the two sonnet collections by the sonnet parser, the two poems by the
    stanza parser, and the lyrics by the lyrics parser. Token ids are shared
    across all of them.

    Returns:
        ``(obs, obs_map)``: the list of encoded sequences and the
        token -> id dict.
    """
    pipeline = (
        (parse_observations_sonnets, sonnets),
        (parse_observations_sonnets, sonnets2),
        (parse_observations_stanza, poem3),
        (parse_observations_stanza, poem4),
        (parse_observations_lyrics, lyrics),
    )

    # Start from empty containers and thread them through every parser.
    encoded, vocabulary = [], {}
    for parser, corpus in pipeline:
        encoded, vocabulary = parser(corpus, encoded, vocabulary)
    return encoded, vocabulary

def print_words(obs, obs_map):
    """Print the token for each id in ``obs``, one per line.

    Args:
        obs: Iterable of integer token ids (a single encoded sequence).
        obs_map: Token -> id dict; it is inverted here to decode the ids.
    """
    id_to_token = {token_id: token for token, token_id in obs_map.items()}
    for token_id in obs:
        print(id_to_token[token_id])

In [24]:
# Toy corpus for sanity-checking the tokenizer: lone numbers on their own line
# ("1", "2"), a contraction, a leading-apostrophe word, hyphenated and
# double-hyphenated words, repeated "!" and a bracketed name.
text = "   1\n Sharon is talking to Sabera.\n George is not here yet and we aren't going to start. 'Tis a fun day--hey!\n\n Hello, world!\n      2   \nThis is a happy-dance. I love you!!! [Kanye]"

In [25]:
# Encode the toy text with the combined parser. Only `sonnets` is supplied;
# the other four sources keep their empty-string defaults.
obs, obs_map = parse_observations(text)
# Display the resulting token -> id map.
obs_map

{'sharon': 0,
 'is': 1,
 'talking': 2,
 'to': 3,
 'sabera': 4,
 '.': 5,
 'george': 6,
 'not': 7,
 'here': 8,
 'yet': 9,
 'and': 10,
 'we': 11,
 'aren^t': 12,
 'going': 13,
 'start': 14,
 '^tis': 15,
 'a': 16,
 'fun': 17,
 'day--hey': 18,
 '!': 19,
 'hello': 20,
 ',': 21,
 'world': 22,
 'this': 23,
 'happy-dance': 24,
 'i': 25,
 'love': 26,
 'you': 27,
 'kanye': 28}

In [9]:
# Display the encoded sequences for the toy text.
# NOTE(review): the saved output below holds four sequences (the same two,
# repeated) and no id 28 for 'kanye', so it appears to predate the current
# definitions (execution count 9). Re-run to refresh it.
obs

[[0,
  1,
  2,
  3,
  4,
  5,
  6,
  1,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  3,
  14,
  5,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  19],
 [23, 1, 16, 24, 5, 25, 26, 27, 19, 19, 19],
 [0,
  1,
  2,
  3,
  4,
  5,
  6,
  1,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  3,
  14,
  5,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  19],
 [23, 1, 16, 24, 5, 25, 26, 27, 19, 19, 19]]

In [79]:
# Encode the Shakespeare sonnets on their own and report how many sequences result.
# Use a context manager so the file handle is closed deterministically.
with open(os.path.join(os.getcwd(), 'data/shakespeare.txt')) as fh:
    text = fh.read()
obs, obs_map = parse_observations_sonnets(text, [], {})
print(len(obs))

154


In [80]:
# Encode each remaining corpus independently (fresh list/dict per call) and
# print its sequence count. Files are read through context managers so the
# handles are closed deterministically.
with open(os.path.join(os.getcwd(), 'data/spenser.txt')) as fh:
    sonnets2 = fh.read()
obs2, obs_map2 = parse_observations_sonnets(sonnets2, [], {})
print(len(obs2))
with open(os.path.join(os.getcwd(), 'data/poem3.txt')) as fh:
    poem3 = fh.read()
obs3, obs_map3 = parse_observations_stanza(poem3, [], {})
print(len(obs3))
with open(os.path.join(os.getcwd(), 'data/poem4.txt')) as fh:
    poem4 = fh.read()
obs4, obs_map4 = parse_observations_stanza(poem4, [], {})
print(len(obs4))

target_url = "https://pastebin.com/raw/9S5u08EU" # Kanye lyrics
http = urllib3.PoolManager()
response = http.request('GET', target_url)
# Fail loudly instead of tokenizing an error page as if it were lyrics.
if response.status != 200:
    raise RuntimeError("lyrics download failed with HTTP status %d" % response.status)
lyrics = response.data.decode('utf-8')
obs5, obs_map5 = parse_observations_lyrics(lyrics, [], {})
print(len(obs5))

# Total across all five sources. `obs` must still hold the Shakespeare
# encoding produced by the previous cell.
print(len(obs) + len(obs2) + len(obs3) + len(obs4) + len(obs5))

89
24
133
363
763




In [76]:
# Load every corpus, then encode them together so they share one vocabulary.
# Files are read through context managers so the handles are closed deterministically.
with open(os.path.join(os.getcwd(), 'data/shakespeare.txt')) as fh:
    sonnets = fh.read()
with open(os.path.join(os.getcwd(), 'data/spenser.txt')) as fh:
    sonnets2 = fh.read()
with open(os.path.join(os.getcwd(), 'data/poem3.txt')) as fh:
    poem3 = fh.read()
with open(os.path.join(os.getcwd(), 'data/poem4.txt')) as fh:
    poem4 = fh.read()

target_url = "https://pastebin.com/raw/9S5u08EU" # Kanye lyrics
http = urllib3.PoolManager()
response = http.request('GET', target_url)
# Fail loudly instead of tokenizing an error page as if it were lyrics.
if response.status != 200:
    raise RuntimeError("lyrics download failed with HTTP status %d" % response.status)
lyrics = response.data.decode('utf-8')

obs_final, obs_map_final = parse_observations(sonnets, sonnets2, poem3, poem4, lyrics)



In [77]:
# Decode the first sequence of the combined dataset back to tokens, one per line.
print_words(obs_final[0], obs_map_final)

from
fairest
creatures
we
desire
increase
,
that
thereby
beauty^s
rose
might
never
die
,
but
as
the
riper
should
by
time
decease
,
his
tender
heir
might
bear
his
memory
but
thou
contracted
to
thine
own
bright
eyes
,
feed^st
thy
light^s
flame
with
self-substantial
fuel
,
making
a
famine
where
abundance
lies
,
thy
self
thy
foe
,
to
thy
sweet
self
too
cruel
thou
that
art
now
the
world^s
fresh
ornament
,
and
only
herald
to
the
gaudy
spring
,
within
thine
own
bud
buriest
thy
content
,
and
tender
churl
mak^st
waste
in
niggarding
pity
the
world
,
or
else
this
glutton
be
,
to
eat
the
world^s
due
,
by
the
grave
and
thee
.


In [78]:
# Number of encoded sequences in the combined dataset.
len(obs_final)

763

