In [1]:
!pip install --quiet tensorflow pandas

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import re


In [2]:
# Load at most 100,000 rows of the quotes dataset.
df = pd.read_csv('/content/quotes.csv', nrows=100000)

# Auto-detect the quote column: the first object-dtype column whose average
# string length exceeds 20 characters. `next(..., None)` makes "nothing
# matched" explicit instead of leaving `quote_col` unbound (or stale from an
# earlier run of this cell).
quote_col = next(
    (
        col
        for col in df.columns
        if df[col].dtype == 'object' and df[col].str.len().mean() > 20
    ),
    None,
)
if quote_col is None:
    raise ValueError(
        "Could not auto-detect the quote column: no object-dtype column has "
        f"an average length above 20 characters (columns: {list(df.columns)})"
    )

# Drop missing entries before casting: astype(str) would otherwise turn NaN
# into the literal text "nan", which would then be tokenized as a word.
quotes = df[quote_col].dropna().astype(str)

def clean_text(text):
    """Lower-case ``text`` and strip everything except ASCII letters and whitespace.

    Digits, punctuation (including apostrophes) and any other character are
    deleted outright, not replaced by a space.

    Args:
        text: The string to normalise.

    Returns:
        The lower-cased string with all non-letter, non-whitespace characters removed.
    """
    lowered = text.lower()
    non_letters = re.compile(r'[^a-zA-Z\s]')
    return non_letters.sub('', lowered)

# Normalise every quote once, up front, so the tokenizer only sees cleaned text.
cleaned_quotes = list(map(clean_text, quotes))


In [3]:
# Limit vocab size; the tokenizer is fitted on the cleaned corpus.
vocab_size = 5000
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(cleaned_quotes)

sequence_limit_per_quote = 10  # longest prefix (in tokens) taken from any one quote
total_sequence_limit = 100000  # hard cap on the total number of training sequences


def _prefix_ngrams(tokens, longest):
    """Return the prefixes of ``tokens`` that are 2..min(len(tokens), longest) tokens long."""
    return [tokens[:end] for end in range(2, min(len(tokens), longest) + 1)]


# Build the training sequences: every quote contributes its leading n-grams
# until the global cap is reached.
input_sequences = []
for quote in cleaned_quotes:
    tokens = tokenizer.texts_to_sequences([quote])[0]
    # Only take as many prefixes as still fit under the global cap.
    room = total_sequence_limit - len(input_sequences)
    input_sequences.extend(_prefix_ngrams(tokens, sequence_limit_per_quote)[:room])
    if len(input_sequences) >= total_sequence_limit:
        break

# Fail early with a readable message instead of letting max() choke on an
# empty collection when no n-gram could be built.
if len(input_sequences) == 0:
    raise ValueError(
        "No training sequences were generated; check that the quotes column "
        "is non-empty and that quotes contain at least two tokens."
    )

# Left-pad every sequence to the length of the longest one so that the last
# position always holds the word to predict.
max_len = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_len, padding='pre')

# Split into X and y: all but the last token are the input, the last token is
# the target. pad_sequences already returns a 2-D NumPy array, so no extra
# np.array() conversion is needed before slicing.
X, y = input_sequences[:, :-1], input_sequences[:, -1]

# One-hot encode the targets for categorical cross-entropy.
# NOTE(review): this materialises a dense (len(y), vocab_size) matrix, i.e. up
# to total_sequence_limit x vocab_size entries; integer targets with a sparse
# categorical loss would avoid it, but the compile call must change with it.
y = to_categorical(y, num_classes=vocab_size)



In [6]:
# Next-word model: token embedding -> LSTM -> softmax over the vocabulary.
model = Sequential([
    # Declare the input length with an explicit Input layer; recent Keras
    # versions warn when `input_shape` is passed to the first layer instead.
    tf.keras.Input(shape=(X.shape[1],)),
    Embedding(input_dim=vocab_size, output_dim=64),
    LSTM(100),
    Dense(vocab_size, activation='softmax'),
])
# Targets are one-hot vectors of width vocab_size, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()



  super().__init__(**kwargs)


In [10]:
# Training hyper-parameters gathered in one place.
fit_options = dict(epochs=100, batch_size=128, verbose=1)

# NOTE(review): fit() resumes from the model's current weights, so re-running
# this cell continues training; re-run the model-definition cell first for a
# fresh start.
with tf.device('/GPU:0'):  # request placement on the first GPU
    history = model.fit(X, y, **fit_options)


Epoch 1/100
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - accuracy: 0.6216 - loss: 1.6948
Epoch 2/100
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 7ms/step - accuracy: 0.6216 - loss: 1.6963
Epoch 3/100
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 7ms/step - accuracy: 0.6244 - loss: 1.6820
Epoch 4/100
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 7ms/step - accuracy: 0.6258 - loss: 1.6816
Epoch 5/100
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 7ms/step - accuracy: 0.6243 - loss: 1.6868
Epoch 6/100
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 7ms/step - accuracy: 0.6233 - loss: 1.6886
Epoch 7/100
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.6287 - loss: 1.6651
Epoch 8/100
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - accuracy: 0.6296 - loss: 1.6668
Epoch 9/100
[1m782/782[0m

In [15]:
# Invert the tokenizer's word -> id mapping so predicted ids can be turned
# back into words.
_word_to_id = tokenizer.word_index
index_word = dict(zip(_word_to_id.values(), _word_to_id.keys()))

def predict_next_words(model, tokenizer, seed_text, next_words=15):
    """Greedily extend ``seed_text`` with up to ``next_words`` predicted words.

    At every step the text generated so far is normalised with ``clean_text``
    (the same cleaning applied to the training corpus), tokenized, left-padded
    or truncated to the model's input length, and the highest-probability word
    is appended.

    Args:
        model: Trained Keras model mapping a padded token-id sequence to a
            probability distribution over token ids.
        tokenizer: The fitted Keras ``Tokenizer`` used to build the training
            data; its ``index_word`` and ``oov_token`` are used for decoding.
        seed_text: Starting phrase. It is returned unchanged as the prefix of
            the result; only a cleaned copy is fed to the tokenizer.
        next_words: Maximum number of words to append.

    Returns:
        ``seed_text`` followed by the generated words, separated by single
        spaces. Generation stops early when the model predicts the OOV token
        or an id with no associated word (such as the padding id 0).
    """
    # Resolve everything from the arguments rather than from notebook globals,
    # so the function stays correct for whichever model/tokenizer is passed in.
    index_to_word = tokenizer.index_word
    input_len = model.input_shape[1]

    generated = seed_text
    for _ in range(next_words):
        # Clean the prompt exactly like the training text so that, for
        # example, "don't" maps to the trained token "dont" and not to OOV.
        token_ids = tokenizer.texts_to_sequences([clean_text(generated)])[0]
        padded = pad_sequences([token_ids], maxlen=input_len, padding='pre')
        probs = model.predict(padded, verbose=0)[0]
        next_id = int(np.argmax(probs))

        next_word = index_to_word.get(next_id)
        # An id without a word (e.g. padding) would only append a stray space
        # on every remaining iteration, so treat it like OOV and stop.
        if not next_word or next_word == tokenizer.oov_token:
            break
        generated += " " + next_word
    return generated


In [16]:
# Ask for a seed phrase, then show the model's continuation of it.
user_input = input("Enter a starting phrase: ")
generated_text = predict_next_words(model, tokenizer, user_input)
print("Generated Text:", generated_text)


Enter a starting phrase: life
Generated Text: life is not a matter of holding good cards but dont make living apart and better
