## Notebook for an ML/AI playground

### Unpickle, load, and clean data 

In [20]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical

In [2]:
# Load the pickled quotes table from a relative path.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling,
# so only point this at a file from a trusted source.
df = pd.read_pickle('data/quotes.pkl')

In [3]:
# Drop the 'Author' and 'Tags' columns; they are not used further down.
df = df.drop(columns=['Author', 'Tags'])


In [4]:
# Cast every column to str, then lower-case each column's text.
df = df.astype(str).apply(lambda column: column.str.lower())

In [5]:
# Pull the 'Quote' column out as a plain Python list of strings.
quotes = df['Quote'].tolist()

In [6]:
# Glue all quotes into one long string, separated by single spaces.
concatted_quotes = str.join(" ", quotes)

### Clean data

In [7]:
# Distinct characters of the raw joined text, in sorted order.
chars = sorted(set(concatted_quotes))

In [8]:
# Whitelist of characters kept in the corpus, grouped by kind:
# whitespace, punctuation, lowercase ASCII letters, and three Latin-1 characters.
# NOTE(review): '\xbb', '\xbf', '\xef' are also the bytes of a UTF-8 BOM read as
# Latin-1 — confirm they are wanted as real characters and not decoding residue.
c = (
    ['\n', '\r', ' ']
    + list('!"\'()*,-.:;?[]_')
    + list('abcdefghijklmnopqrstuvwxyz')
    + ['\xbb', '\xbf', '\xef']
)


In [9]:
# Keep only whitelisted characters. Membership is tested against a set built
# once, instead of scanning the whitelist list for every character of the corpus.
allowed_chars = set(c)
updated_quotes = [ch for ch in concatted_quotes if ch in allowed_chars]

In [10]:
# Peek at the first 20 characters of the joined (unfiltered) text.
concatted_quotes[0:20]

"don't cry because it"

In [11]:
# Distinct characters that survived the whitelist filter, in sorted order.
chars = sorted(set(updated_quotes))

In [12]:
# Collapse the filtered characters back into a single string.
updated_quotes = ''.join(updated_quotes)
# Show only a short preview: echoing the whole corpus (millions of characters)
# would bloat the saved notebook and bury the cells that follow.
updated_quotes[:200]



### Tokenize the data

In [16]:

# Corpus length and vocabulary size; both are reused by the cells below.
n_chars, n_vocab = len(updated_quotes), len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)


Total Characters:  6596118
Total Vocab:  45


In [17]:
# Sorting the unique characters gives each one a stable integer id.
chars = sorted(set(updated_quotes))
char_to_int = {ch: idx for idx, ch in enumerate(chars)}
# Display the mapping as the cell output.
char_to_int

{' ': 0,
 '!': 1,
 '"': 2,
 "'": 3,
 '(': 4,
 ')': 5,
 '*': 6,
 ',': 7,
 '-': 8,
 '.': 9,
 ':': 10,
 ';': 11,
 '?': 12,
 '[': 13,
 ']': 14,
 '_': 15,
 'a': 16,
 'b': 17,
 'c': 18,
 'd': 19,
 'e': 20,
 'f': 21,
 'g': 22,
 'h': 23,
 'i': 24,
 'j': 25,
 'k': 26,
 'l': 27,
 'm': 28,
 'n': 29,
 'o': 30,
 'p': 31,
 'q': 32,
 'r': 33,
 's': 34,
 't': 35,
 'u': 36,
 'v': 37,
 'w': 38,
 'x': 39,
 'y': 40,
 'z': 41,
 '»': 42,
 '¿': 43,
 'ï': 44}

In [18]:

# Build (input window, next character) training pairs by sliding a fixed-size
# window over the corpus one character at a time.
seq_length = 100
# Encode the corpus once up front. Looking characters up inside the loop
# re-encodes each one up to seq_length times (once per window containing it).
encoded_text = [char_to_int[ch] for ch in updated_quotes]
dataX = []  # each entry: seq_length integer codes
dataY = []  # each entry: the integer code that follows the window
for start in range(n_chars - seq_length):
    stop = start + seq_length
    dataX.append(encoded_text[start:stop])
    dataY.append(encoded_text[stop])
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

Total Patterns:  6596018


In [21]:
# Reshape X to [samples, time steps, features], the layout the LSTM layer is
# given via input_shape below.
X = np.reshape(dataX, (n_patterns, seq_length, 1))
# Scale the integer character codes by the vocabulary size.
X = X / float(n_vocab)
# One-hot encode the targets. num_classes is pinned to the vocabulary size so
# the width of y never depends on which characters happen to occur as targets
# (by default it is inferred from the largest value seen).
y = to_categorical(dataY, num_classes=n_vocab)

In [23]:
# Inspect the first ten one-hot encoded targets.
y[0:10]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 

### Create the model

In [24]:
# Character-level model: one 256-unit LSTM over the input window, dropout,
# then a softmax over the one-hot target classes.
# NOTE(review): the NotImplementedError recorded under this cell is commonly
# reported when TensorFlow 2.4/2.5 is paired with NumPy >= 1.20 — confirm the
# installed versions before changing the model code.
timesteps, n_features = X.shape[1], X.shape[2]
model = Sequential([
    LSTM(256, input_shape=(timesteps, n_features)),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

NotImplementedError: Cannot convert a symbolic Tensor (lstm/strided_slice:0) to a numpy array. This error may indicate that you're trying to pass a Tensor to a NumPy call, which is not supported