In [1]:
# import some useful packages for making an auto regression ML model
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense


# Read the sentence corpus, discard the leading column (it only holds the row
# index), and lowercase every remaining value after casting it to str.
df_sentences = (
    pd.read_csv('cv-unique-no-end-punct-sentences.csv')
    .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
    .apply(lambda column: column.astype(str).str.lower())
)

# Preview the first rows of the cleaned frame
print(df_sentences.head())

                                            sentence
0  he was accorded a state funeral and was buried...
1  in american english whilst is considered to be...
2  once again she is seen performing on a compute...
3     hippety hopper returns in mckimsons pop im pop
4  today their programs are available on the inte...


In [4]:
# Flatten the frame into a 1-D array of sentences and keep only the first
# 1000 of them for training.
sentences = df_sentences.values.flatten()[:1000]

# Fit the word -> integer vocabulary; the extra slot accounts for index 0,
# which pad_sequences uses as the padding value below.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
total_words = 1 + len(tokenizer.word_index)

# For every encoded sentence, emit each prefix of length >= 2: the last token
# of a prefix is the label, everything before it is the context.
input_sequences = [
    encoded[:end]
    for encoded in tokenizer.texts_to_sequences(sentences)
    for end in range(2, len(encoded) + 1)
]

# Left-pad every prefix to the length of the longest one
max_sequence_len = max(len(prefix) for prefix in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

# Predictors (X) are all but the last token; labels (y) are the last token,
# one-hot encoded by indexing rows of an identity matrix.
X = input_sequences[:, :-1]
y = np.eye(total_words)[input_sequences[:, -1]]

# Define the model: 64-dimensional word embeddings -> 100-unit LSTM ->
# softmax over the whole vocabulary (one class per word index).
model = Sequential()
# NOTE: the deprecated `input_length` argument is intentionally omitted.
# Keras 3 ignores it with a warning, and the sequence length is taken from
# the training data when the model is first fit.
model.add(Embedding(total_words, 64))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))

# Compile the model; y is one-hot encoded, hence categorical cross-entropy
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=20, verbose=1)

# Function to predict the next word
def predict_next_word(model, tokenizer, text, max_sequence_len):
    """Return the single most likely next word after ``text``.

    Args:
        model: Object whose ``predict(batch, verbose=0)`` returns one row of
            per-class scores for the single padded sequence passed in.
        tokenizer: Object providing ``texts_to_sequences`` and an
            ``index_word`` mapping from class index to word.
        text: Seed text to continue.
        max_sequence_len: Padded length of the training sequences; the seed is
            padded/truncated to ``max_sequence_len - 1`` tokens.

    Returns:
        The word mapped to the highest-scoring class, ignoring class 0.

    Raises:
        KeyError: If the winning index is missing from ``tokenizer.index_word``.
    """
    token_list = tokenizer.texts_to_sequences([text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    # Copy into a flat float array so the model's own output is never mutated.
    scores = np.array(model.predict(token_list, verbose=0), dtype=float).reshape(-1)
    # Index 0 is the padding id and has no entry in index_word; exclude it so
    # a padding-dominated prediction cannot raise KeyError.
    scores[0] = -np.inf
    return tokenizer.index_word[int(np.argmax(scores))]

# Quick smoke test: ask the trained model for the word following a short seed
seed_text = "The quick brown fox"
next_word = predict_next_word(
    model=model,
    tokenizer=tokenizer,
    text=seed_text,
    max_sequence_len=max_sequence_len,
)
print("Predicted next word:", next_word)



Epoch 1/20
[1m272/272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.0425 - loss: 7.4331
Epoch 2/20
[1m272/272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 22ms/step - accuracy: 0.0525 - loss: 6.6346
Epoch 3/20
[1m272/272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 22ms/step - accuracy: 0.0571 - loss: 6.4302
Epoch 4/20
[1m272/272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 20ms/step - accuracy: 0.0714 - loss: 6.2540
Epoch 5/20
[1m272/272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 20ms/step - accuracy: 0.0867 - loss: 6.0089
Epoch 6/20
[1m272/272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 21ms/step - accuracy: 0.0901 - loss: 5.8293
Epoch 7/20
[1m272/272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 21ms/step - accuracy: 0.1085 - loss: 5.6252
Epoch 8/20
[1m272/272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 21ms/step - accuracy: 0.1227 - loss: 5.4252
Epoch 9/20
[1m272/272[0m [32m

In [5]:
def predict_100_words(given_text, num_words=100):
    """Extend ``given_text`` by repeatedly appending the predicted next word.

    Uses the notebook-level ``model``, ``tokenizer`` and ``max_sequence_len``
    through ``predict_next_word``; each prediction sees all text generated so far.

    Args:
        given_text: Seed text to continue.
        num_words: How many words to append. Defaults to 100, which matches
            the function's original fixed behavior.

    Returns:
        The seed text followed by ``num_words`` space-separated predicted words.
    """
    generated = given_text
    for _ in range(num_words):
        generated += " " + predict_next_word(model, tokenizer, generated, max_sequence_len)
    return generated

# Generate a continuation of the seed phrase and keep it for display
test_text = predict_100_words(given_text="The quick brown fox")

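# So this model is just a bigram model. Next steps could be to also make a 3-gram model, a 4-gram model, etc.,
# and then combine them all together to make a more accurate model.
# Then, start with the 2-gram model to predict the 2nd word, the 3-gram model to predict the 3rd word, etc.
# NOTE(review): the training cell feeds the LSTM every sentence prefix (padded to a fixed length),
# not just the previous word, so "bigram" may understate what this model already conditions on.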

In [6]:
# Show the generated continuation stored in test_text
print(test_text)

The quick brown fox was all released on the snow and narrower a collision and hall on a collision on in the sand improvements watch on a national park of the sand mound buchan their campaigns in japan dakota on a race on in the sand mound poison their campaigns in a race railway team team of person on a car entered on a car user cluster their expansion user on a car entered on a street in a red shirt underneath a team and hall and alan roscoe on is a drug user on a pad in a red shirt and narrower on


In [17]:
# Get the list of all unique words by splitting every sentence on whitespace.
# explode() yields one row per word without first building a wide, None-padded
# frame (as split(expand=True) does). The explicit dropna() keeps missing
# values out of the result regardless of how stack() treats NA, so the sort
# below never has to compare str with None/NaN.
unique_words = (
    df_sentences['sentence']
    .str.split()
    .explode()
    .dropna()
    .unique()
)

# sort the unique words (in place)
unique_words.sort()