In [1]:
"""
The first sections in this chapter will explain the LSTM architecture, a lighter-weight version
called the gated recurrent unit (GRU), the key ideas behind bidirectional RNNs and a brief
explanation of how RNN layers are stacked together to form deep RNNs
"""
"""lSTMs resemble standard
recurrent neural networks but here each ordinary recurrent node is replaced by a memory
cell. Each memory cell contains an internal state, i.e., a node with a self-connected re-
current edge of fixed weight"""


'lSTMs resemble standard\nrecurrent neural networks but here each ordinary recurrent node is replaced by a memory\ncell. Each memory cell contains an internal state, i.e., a node with a self-connected re-\ncurrent edge of fixed weight'

In [2]:
# The term “long short-term memory” comes from the following intuition. Simple recurrent
# neural networks have long-term memory in the form of weights. The weights change slowly
# during training, encoding general knowledge about the data. They also have short-term
# memory in the form of ephemeral activations

LSTM      Implementation from Scratch

"Next word prediction"

In [1]:
import pandas as pd 
import numpy as  np

In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import spacy
nlp = spacy.load("en_core_web_md")

2025-03-02 02:06:52.124316: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-02 02:06:52.355191: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-02 02:06:52.550765: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740861412.712550   20070 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740861412.757076   20070 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-02 02:06:53.105576: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

In [5]:
df = pd.read_csv('~/Documents/medium_data.csv')
df.head()

Unnamed: 0,id,url,title,subtitle,image,claps,responses,reading_time,publication,date
0,1,https://towardsdatascience.com/a-beginners-gui...,A Beginner’s Guide to Word Embedding with Gens...,,1.png,850,8,8,Towards Data Science,2019-05-30
1,2,https://towardsdatascience.com/hands-on-graph-...,Hands-on Graph Neural Networks with PyTorch & ...,,2.png,1100,11,9,Towards Data Science,2019-05-30
2,3,https://towardsdatascience.com/how-to-use-ggpl...,How to Use ggplot2 in Python,A Grammar of Graphics for Python,3.png,767,1,5,Towards Data Science,2019-05-30
3,4,https://towardsdatascience.com/databricks-how-...,Databricks: How to Save Files in CSV on Your L...,When I work on Python projects dealing…,4.jpeg,354,0,4,Towards Data Science,2019-05-30
4,5,https://towardsdatascience.com/a-step-by-step-...,A Step-by-Step Implementation of Gradient Desc...,One example of building neural…,5.jpeg,211,3,4,Towards Data Science,2019-05-30


In [8]:
df["title"]
df['title'] = df['title'].apply(lambda x: x.replace(u'\xa0',u' '))
df['title'] = df['title'].apply(lambda x: x.replace('\u200a',' '))

In [9]:
titles_list = df['title'].tolist()

In [10]:
docs = list(nlp.pipe(titles_list))

In [11]:
docs[0]

A Beginner’s Guide to Word Embedding with Gensim Word2Vec Model

In [15]:
df['title_tokens'] = df['title'].apply(lambda text: [token.text for token in nlp(text)]if isinstance(text, str) else [])

In [38]:
unique_tokens = set()
for doc in docs:
    for token in doc:
        # Discard punctuation, symbols, or whitespace
        if not token.is_punct and not token.is_space and not token.is_currency and not token.like_num:
            unique_tokens.add(token.text.lower())

In [39]:
sorted_tokens = sorted(unique_tokens)


In [41]:
oov_token="<oov>"
vocab_index = {oov_token: 1}
for i, token in enumerate(sorted_tokens, start=2):
    vocab_index[token] = i

In [44]:
total_words = len(vocab_index) + 1

In [45]:
total_words

8186

In [46]:
print("Word: ID")
print("---------------")
print("<oov>:", vocab_index.get("<oov>"))
print("strong:", vocab_index.get("strong"))
print("and:", vocab_index.get("and"))
print("consumption:", vocab_index.get("consumption"))

Word: ID
---------------
<oov>: 1
strong: 6844
and: 331
consumption: 1521


In [28]:
import pandas as pd
import spacy

def build_spacy_vocab(texts, oov_token="<oov>", model="en_core_web_md"):
    """
    Build a vocabulary (word-to-ID mapping) from a list of text strings using spaCy.

    Parameters:
      texts (list of str): A list of text strings.
      oov_token (str): Token name to reserve for out-of-vocabulary words.
      model (str): Name of the spaCy model to load (e.g., 'en_core_web_md').

    Returns:
      dict: A dictionary mapping each unique token to an integer ID.
    """
    # Load spaCy model
    nlp = spacy.load(model)

    # Process texts in batches using spaCy's pipe for efficiency
    docs = list(nlp.pipe(texts))

    # Collect unique tokens
    unique_tokens = set()
    for doc in docs:
        for token in doc:
            # Discard punctuation, symbols, or whitespace
            if not token.is_punct and not token.is_space and not token.is_currency and not token.like_num:
                unique_tokens.add(token.text.lower())

    # Convert set to a sorted list for reproducibility
    sorted_tokens = sorted(unique_tokens)

    # Start indexing from 2 so that 1 is reserved for OOV
    vocab_index = {oov_token: 1}
    for i, token in enumerate(sorted_tokens, start=2):
        vocab_index[token] = i

    return vocab_index

# ---------------------------------------------------------------------
# Main script using DataFrame from medium_data.csv:
# ---------------------------------------------------------------------

# 1. Load the DataFrame
df = pd.read_csv('~/Documents/medium_data.csv')

# 2. Clean/normalize the "title" column if necessary
df['title'] = df['title'].fillna('')  # Replace NaN with empty strings
df['title'] = df['title'].apply(lambda x: x.replace(u'\xa0', ' '))
df['title'] = df['title'].apply(lambda x: x.replace(u'\u200a', ' '))

# 3. Get all titles as a list of strings
titles_list = df['title'].tolist()

# 4. Build the spaCy vocabulary from the titles
vocab_index = build_spacy_vocab(titles_list, oov_token="<oov>", model="en_core_web_md")

# 5. Print total number of words (excluding the <oov> token)
total_words = len(vocab_index) +1
print("Total number of words:", total_words)

# 6. Print word IDs for selected tokens
print("Word: ID")
print("---------------")
print("<oov>:", vocab_index.get("<oov>"))
print("strong:", vocab_index.get("strong"))
print("and:", vocab_index.get("and"))
print("consumption:", vocab_index.get("consumption"))

# Created/Modified files during execution:
# (None)

Total number of words: 8186
Word: ID
---------------
<oov>: 1
strong: 6844
and: 331
consumption: 1521


'm: 2
're: 3
's: 4
+: 5
-#1: 6
-chan: 7
-determining: 8
-emr: 9
-introducing: 10
.com: 11
.crypto: 12
.net: 13
.rodata: 14
.rwdata-: 15
//error//: 16
/strong: 17
1)</strong: 18
10x: 19
11k: 20
12k: 21
13.5h: 22
15min: 23
1959</strong: 24
1980s: 25
1983–2019: 26
19b: 27
1</strong: 28
1b: 29
2)</strong: 30
2000s: 31
2010s: 32
2018–2027: 33
2019</strong: 34
2019–2026</strong: 35
2020s: 36
20s: 37
2d: 38
2fa: 39
2–3: 40
3(introduction: 41
30x: 42
3d: 43
401(k)s: 44
401k: 45
40x: 46
4chan: 47
5.7x: 48
5b: 49
60s: 50
8pt: 51
9–5: 52
<: 53
</strong: 54
<oov>: 1
=: 55
>: 56
@london: 57
`: 58
a: 59
a.i.: 60
aapl: 61
abandoned: 62
abhishek: 63
abilities: 64
ability: 65
able: 66
abnormal: 67
abortion</strong: 68
about: 69
above: 70
abraham: 71
abroad: 72
absolute: 73
absolutism: 74
abstract: 75
abundance: 76
abuse: 77
abusive: 78
academia: 79
academic: 80
accelerate: 81
accelerated: 82
accelerator: 83
accelerators</strong: 84
accents: 85
accept: 86
acceptance: 87
accepting: 88
access: 89
accessib

In [13]:
import pandas as pd
import spacy

def build_spacy_vocab(texts, oov_token="<oov>", model="en_core_web_md"):
    """
    Build a vocabulary (word-to-ID mapping) from a list of text strings using spaCy.

    Parameters:
      texts (list of str): A list of text strings.
      oov_token (str): Token name to reserve for out-of-vocabulary words.
      model (str): Name of the spaCy model to load (e.g., 'en_core_web_md').

    Returns:
      dict: A dictionary mapping each unique token to an integer ID.
      nlp: spaCy language pipeline object (to reuse later).
    """
    # Load spaCy model
    nlp = spacy.load(model)

    # Process texts in batches using spaCy's pipe for efficiency
    docs = list(nlp.pipe(texts))

    # Collect unique tokens
    unique_tokens = set()
    for doc in docs:
        for token in doc:
            # Discard punctuation, symbols, or whitespace
            if not token.is_punct and not token.is_space and not token.is_currency and not token.like_num:
                unique_tokens.add(token.text.lower())

    # Convert set to a sorted list for reproducibility
    sorted_tokens = sorted(unique_tokens)

    # Start indexing from 2 so that 1 is reserved for OOV
    vocab_index = {oov_token: 1}
    for i, token in enumerate(sorted_tokens, start=2):
        vocab_index[token] = i

    return vocab_index, nlp

def text_to_sentences_indices(text, nlp, vocab_index, oov_token="<oov>"):
    """
    Given a single text string, split it into sentences using spaCy
    and convert each token to its corresponding integer ID.

    Returns a list of lists of integer IDs (one for each sentence).
    """
    if not isinstance(text, str) or not text.strip():
        return []

    doc = nlp(text)
    all_sentences_indices = []

    for sent in doc.sents:
        sent_indices = []
        for token in sent:
            # Exclude punctuation, spaces, etc.
            if not token.is_punct and not token.is_space and not token.is_currency and not token.like_num:
                lower_word = token.text.lower()
                token_id = vocab_index.get(lower_word, vocab_index[oov_token])
                sent_indices.append(token_id)
        if sent_indices:
            all_sentences_indices.append(sent_indices)

    return all_sentences_indices

# ---------------------------------------------------------------------
# Main script with DataFrame from medium_data.csv
# ---------------------------------------------------------------------

# 1. Load DataFrame
df = pd.read_csv('~/Documents/medium_data.csv')

# 2. Clean/normalize the "title" column if necessary
df['title'] = df['title'].fillna('')  # Replace NaN with empty strings
df['title'] = df['title'].apply(lambda x: x.replace(u'\xa0', ' '))
df['title'] = df['title'].apply(lambda x: x.replace(u'\u200a', ' '))

# 3. Get all titles as a list of strings
titles_list = df['title'].tolist()

# 4. Build the spaCy vocabulary (vocab_index) and get the spaCy model pipeline
vocab_index, nlp = build_spacy_vocab(titles_list, oov_token="<oov>", model="en_core_web_md")

# 5. Create a new column that splits each title into sentences
#    and maps each token to its vocabulary index
df["title_sentences_word_indices"] = df["title"].apply(
    lambda text: text_to_sentences_indices(text, nlp, vocab_index, oov_token="<oov>")
)



In [15]:
titles_sentences_list=(df["title_sentences_word_indices"].tolist())
input_sequence = []

In [16]:
for row_sentences in df["title_sentences_word_indices"]:
    for sentence_indices in row_sentences:
        for i in range(1, len(sentence_indices)):
            partial_seq = sentence_indices[:i+1]
            input_sequence.append(partial_seq)


In [17]:
print("Number of partial sequences:", len(input_sequence))

Number of partial sequences: 46685


In [18]:
print("First 5 partial sequences:", input_sequence[:5])

First 5 partial sequences: [[59, 699], [59, 699, 8163], [59, 699, 8163, 3157], [59, 699, 8163, 3157, 7403], [59, 699, 8163, 3157, 7403, 8040]]


In [19]:
input_sequence

[[59, 699],
 [59, 699, 8163],
 [59, 699, 8163, 3157],
 [59, 699, 8163, 3157, 7403],
 [59, 699, 8163, 3157, 7403, 8040],
 [59, 699, 8163, 3157, 7403, 8040, 2333],
 [59, 699, 8163, 3157, 7403, 8040, 2333, 8022],
 [59, 699, 8163, 3157, 7403, 8040, 2333, 8022, 2990],
 [59, 699, 8163, 3157, 7403, 8040, 2333, 8022, 2990, 8041],
 [59, 699, 8163, 3157, 7403, 8040, 2333, 8022, 2990, 8041, 4559],
 [3198, 4944],
 [3198, 4944, 3105],
 [3198, 4944, 3105, 4771],
 [3198, 4944, 3105, 4771, 4767],
 [3198, 4944, 3105, 4771, 4767, 8022],
 [3198, 4944, 3105, 4771, 4767, 8022, 5679],
 [3198, 4944, 3105, 4771, 4767, 8022, 5679, 5679],
 [3198, 4944, 3105, 4771, 4767, 8022, 5679, 5679, 2995],
 [3382, 7403],
 [3382, 7403, 7738],
 [3382, 7403, 7738, 3010],
 [3382, 7403, 7738, 3010, 3537],
 [3382, 7403, 7738, 3010, 3537, 5677],
 [1785, 3382],
 [1785, 3382, 7403],
 [1785, 3382, 7403, 6205],
 [1785, 3382, 7403, 6205, 2727],
 [1785, 3382, 7403, 6205, 2727, 3537],
 [1785, 3382, 7403, 6205, 2727, 3537, 1700],
 [1785,

In [20]:
max_len=max([len(x) for x in input_sequence])

In [21]:
max_len

28

In [23]:
padded_input_sequences=pad_sequences(input_sequence,maxlen=max_len,padding='pre')
padded_input_sequences

array([[   0,    0,    0, ...,    0,   59,  699],
       [   0,    0,    0, ...,   59,  699, 8163],
       [   0,    0,    0, ...,  699, 8163, 3157],
       ...,
       [   0,    0,    0, ...,   59, 3070,  980],
       [   0,    0,    0, ..., 3070,  980,  814],
       [   0,    0,    0, ...,  980,  814, 5430]], dtype=int32)

In [25]:
x=padded_input_sequences[:,:-1]

In [26]:
y=padded_input_sequences[:,-1]

In [27]:
print(x.shape)
print(y.shape)

(46685, 27)
(46685,)


In [35]:
from tensorflow.keras.utils import to_categorical
y = to_categorical(y,num_classes=total_words)

MemoryError: Unable to allocate 22.8 TiB for an array with shape (382163410, 8186) and data type float64

Multiclass classification 

In [48]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

model = Sequential()
model.add(Embedding(input_dim=total_words,
                    output_dim=100,
                    input_length=max_len,
                    input_shape=(max_len,)))  # <-- specify input_shape
model.add(LSTM(500, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))
model.add(LSTM(150))
model.add(Dense(total_words, activation='softmax'))

model.summary()




  super().__init__(**kwargs)


In [49]:
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

When you see 0 trainable parameters in the model summary, it usually means that Keras has not fully "built" your model yet, and so it doesn’t know the shapes of the weights. By default, Keras will attempt to infer layer shapes when you call model.summary(), but if it can’t, the displayed parameter counts remain zero.

Not specifying the input shape on the first layer in a way that Keras can infer it.
Not building or calling the model on a sample input before calling model.summary().
Ensure the very first layer (the Embedding layer) has a clear input_shape. For a sequence model, if you know your sequences have length max_len, add input_shape=(max_len,) instead of just input_length=max_len. For example:
You might see input_length in many Keras code examples, but if Keras can’t infer shapes internally, directly providing input_shape=(max_len,) in the first layer clarifies it.

In [50]:
from tensorflow.keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint("next_words.keras", monitor='val_loss', verbose=1, save_best_only=True)

In [None]:
model.fit(x,y,epochs=20,validation_split=0.2,batch_size=64, callbacks=[checkpoint])

Epoch 1/20
[1m584/584[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 280ms/step - accuracy: 0.0367 - loss: 7.2148
Epoch 1: val_loss improved from inf to 7.31602, saving model to next_words.keras
[1m584/584[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m172s[0m 295ms/step - accuracy: 0.0367 - loss: 7.2148 - val_accuracy: 0.0397 - val_loss: 7.3160
Epoch 2/20
[1m584/584[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 280ms/step - accuracy: 0.0452 - loss: 6.9627
Epoch 2: val_loss improved from 7.31602 to 7.09186, saving model to next_words.keras
[1m584/584[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m172s[0m 295ms/step - accuracy: 0.0452 - loss: 6.9626 - val_accuracy: 0.0742 - val_loss: 7.0919
Epoch 3/20
[1m584/584[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 312ms/step - accuracy: 0.0865 - loss: 6.5564
Epoch 3: val_loss improved from 7.09186 to 6.96329, saving model to next_words.keras
[1m584/584[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 326ms