In [1]:
# Bible Verse Classification using LSTM with Early Stopping

# Import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

# Load dataset
# The CSV location is relative to the notebook's working directory; the
# cells below read the 'book' and 'text' columns of the resulting frame.
dataset_path = "data/kjvdata.csv"
df = pd.read_csv(dataset_path)

# Mapping book to author
# Keys are looked up verbatim against df['book'], so they must match the
# dataset's book names exactly. Each value is the label string used as the
# classification target; books with several attributed authors keep a single
# combined label.
author_list = {
    "Genesis": "Moses",
    "Exodus": "Moses",
    "Leviticus": "Moses",
    "Numbers": "Moses",
    "Deuteronomy": "Moses",
    "Joshua": "Joshua",
    "Judges": "Samuel, Nathan, Gad",
    "Ruth": "Samuel, Nathan, Gad",
    "1 Samuel (1 Kings)": "Samuel, Nathan, Gad",
    "2 Samuel (2 Kings)": "Samuel, Nathan, Gad",
    "1 Kings (3 Kings)": "Jeremiah",
    "2 Kings (4 Kings)": "Jeremiah",
    "1 Chronicles": "Ezra",
    "2 Chronicles": "Ezra",
    "Ezra": "Ezra",
    "Nehemiah": "Nehemiah, Ezra",
    "Esther": "Mordecai",
    "Job": "Job,Moses",
    "Psalms": "David,Asaph, Ezra, the sons of Korah, Heman, Ethan, Moses",
    "Proverbs": "Solomon ,Agur(30) and Lemuel(31)",
    "Ecclesiastes": "Solomon",
    "Song of Solomon (Canticles)": "Solomon",
    "Isaiah": "Isaiah",
    "Jeremiah": "Jeremiah",
    "Lamentations": "Jeremiah",
    "Ezekiel": "Ezekiel",
    "Daniel": "Daniel",
    "Hosea": "Hosea",
    "Joel": "Joel",
    "Amos": "Amos",
    "Obadiah": "Obadiah",
    "Jonah": "Jonah",
    "Micah": "Micah",
    "Nahum": "Nahum",
    "Habakkuk": "Habakkuk",
    "Zephaniah": "Zephaniah",
    "Haggai": "Haggai",
    # NOTE(review): this label carries a leading space; kept as-is so the
    # existing class names do not change -- confirm whether it is intended.
    "Zechariah": " Zechariah",
    "Malachi": "Malachi",
    "Matthew": "Matthew",
    "Mark": "John Mark",
    "Luke": "Luke",
    "John": "John, the Apostle",
    "Acts": "Luke",
    "Romans": "Paul",
    "1 Corinthians": "Paul",
    "2 Corinthians": "Paul",
    "Galatians": "Paul",
    "Ephesians": "Paul",
    "Philippians": "Paul",
    "Colossians": "Paul",
    "1 Thessalonians": "Paul",
    "2 Thessalonians": "Paul",
    "1 Timothy": "Paul",
    "2 Timothy": "Paul",
    "Titus": "Paul",
    "Philemon": "Paul",
    "Hebrews": "Paul, Luke, Barnabas, Apollos",
    "James": "James the brother of Jesus and Jude (not the Apostle, brother of John).",
    "1 Peter": "Peter",
    "2 Peter": "Peter",
    "1 John": "John, the Apostle",
    "2 John": "John, the Apostle",
    "3 John": "John, the Apostle",
    "Jude": "Jude, the brother of Jesus",
    "Revelation": "John, the Apostle"
}

df['author'] = df['book'].map(author_list)

# Series.map leaves NaN for every book that has no entry in author_list.
# A NaN label would either break label encoding or end up as a bogus extra
# class, so stop here with a message that names the offending books.
unmapped_books = df.loc[df['author'].isna(), 'book'].unique()
if len(unmapped_books) > 0:
    raise ValueError(
        "No author mapping for book(s): "
        + ", ".join(sorted(str(book) for book in unmapped_books))
    )

# Prepare features and labels
x_features = df['text']
y_labels = df['author']

# Encode labels
# Author names become integer class ids; encoder.classes_ keeps the
# id -> name lookup for later use.
encoder = LabelEncoder()
y_labels_enc = encoder.fit_transform(y_labels)

# Train-test split
# Split row positions *before* tokenizing so the vocabulary can be learned
# from training text only (no test-set leakage into preprocessing). With the
# same test_size and random_state this is expected to pick the same rows as
# splitting the padded arrays directly.
row_positions = np.arange(len(x_features))
train_rows, test_rows = train_test_split(row_positions, test_size=0.33, random_state=42)

# Tokenize text
# The vocabulary is fitted on the training rows only; words that appear
# solely in held-out text map to the OOV token instead of being dropped.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(x_features.iloc[train_rows])
sequences = tokenizer.texts_to_sequences(x_features)

# Padding sequences
# Every verse is padded/truncated to a fixed length so it can be batched.
max_sequence_length = 100
x_padded = pad_sequences(sequences, maxlen=max_sequence_length)

# Materialize the train/test arrays from the row positions chosen above
x_train, x_test = x_padded[train_rows], x_padded[test_rows]
y_train, y_test = y_labels_enc[train_rows], y_labels_enc[test_rows]

# Build LSTM model
# Seed the Python, NumPy and TensorFlow RNGs so weight initialization,
# dropout masks and batch shuffling are repeatable between runs (some GPU
# kernels may still introduce small nondeterminism).
set_random_seed(42)

# +1 because the tokenizer never assigns index 0 to a word
vocab_size = len(tokenizer.word_index) + 1
model = Sequential([
    # `input_length` is deprecated in Keras 3 and not needed: the sequence
    # length is inferred from the data on the first call.
    Embedding(input_dim=vocab_size, output_dim=128),
    # Only the final hidden state is passed on to the classifier head
    LSTM(128, return_sequences=False),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    # One output unit per encoded author class
    Dense(len(encoder.classes_), activation='softmax')
])

# Compile model
# Sparse loss because y_train holds integer class ids, not one-hot vectors
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Early stopping
# Stop once val_loss has not improved for 3 epochs and roll back to the
# weights of the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train model
# validation_split holds out a fraction of the training arrays for
# validation; the test split is not touched during training.
history = model.fit(
    x_train,
    y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# Evaluate model
# Score the held-out split; evaluate() returns the loss followed by the
# metrics given at compile time (accuracy here).
evaluation_scores = model.evaluate(x_test, y_test)
test_loss, test_accuracy = evaluation_scores
print(f"Test Accuracy: {test_accuracy}")


Epoch 1/20




521/521 ━━━━━━━━━━━━━━━━━━━━ 34s 64ms/step - accuracy: 0.1869 - loss: 3.0012 - val_accuracy: 0.3575 - val_loss: 2.2356
Epoch 2/20
521/521 ━━━━━━━━━━━━━━━━━━━━ 35s 67ms/step - accuracy: 0.4139 - loss: 2.0112 - val_accuracy: 0.4683 - val_loss: 1.7999
Epoch 3/20
521/521 ━━━━━━━━━━━━━━━━━━━━ 35s 66ms/step - accuracy: 0.5530 - loss: 1.4866 - val_accuracy: 0.4990 - val_loss: 1.7022
Epoch 4/20
521/521 ━━━━━━━━━━━━━━━━━━━━ 32s 61ms/step - accuracy: 0.6240 - loss: 1.2320 - val_accuracy: 0.5098 - val_loss: 1.6779
Epoch 5/20
521/521 ━━━━━━━━━━━━━━━━━━━━ 33s 63ms/step - accuracy: 0.6847 - loss: 1.0193 - val_accuracy: 0.5185 - val_loss: 1.7562
Epoch 6/20
521/521 ━━━━━━━━━━━━━━━━━━━━ 33s 63ms/step - accuracy: 0.7414 - loss: 0.8543 - val_accuracy: 0.5333 - val_loss: 1.8518
Epoch 7/20
[1m521/521[0m 