In [1]:
# Bible Verse Classification with Convolutional Neural Network (ConvNet)

# Import necessary libraries
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.utils import to_categorical

# Initialize stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)      # Remove numbers
    text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

# Load the dataset with multiple translations
columns = ['verse_id', 'book_id', 'chapter', 'verse', 'text']
translations = ['t_kjv.csv', 't_asv.csv', 't_bbe.csv', 't_dby.csv', 't_wbt.csv', 't_web.csv', 't_ylt.csv']
translation_names = ['KJV', 'ASV', 'BBE', 'DBY', 'WBT', 'WEB', 'YLT']

all_translations = pd.DataFrame()
for file, name in zip(translations, translation_names):
    file_path = f'bible_data/bible_databases-master/bible_databases-master/csv/{file}'
    df = pd.read_csv(file_path)
    df.columns = columns
    df['translation'] = name
    df['cleaned_text'] = df['text'].apply(preprocess_text)
    all_translations = pd.concat([all_translations, df], ignore_index=True)

# Merge with book metadata
key_english = pd.read_csv('bible_data/bible_databases-master/bible_databases-master/csv/key_english.csv')
key_english.columns = ['book_id', 'book_name', 'testament', 'genre']
all_translations = all_translations.merge(key_english, on='book_id', how='left')

# Extract features and labels
texts = all_translations['cleaned_text']
labels = all_translations['book_name']

# Tokenize text
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
x_data = pad_sequences(sequences, maxlen=100)

# Encode labels
label_encoder = LabelEncoder()
y_data = label_encoder.fit_transform(labels)

# Split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=42)

# Build the ConvNet model
num_classes = len(label_encoder.classes_)
model = Sequential([
    Embedding(input_dim=20000, output_dim=128, input_length=100),
    Conv1D(128, 5, activation='relu'),  # 128 filters, kernel size 5
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(x_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate(x_test, y_test)
print(f"Test Accuracy (ConvNet): {accuracy}")

# Save the model
model.save('bible_classifier_cnn.keras')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/skylercain/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/10




[1m4355/4355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 8ms/step - accuracy: 0.2346 - loss: 2.9413 - val_accuracy: 0.5152 - val_loss: 1.6880
Epoch 2/10
[1m4355/4355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 8ms/step - accuracy: 0.5124 - loss: 1.6983 - val_accuracy: 0.6007 - val_loss: 1.3783
Epoch 3/10
[1m4355/4355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 8ms/step - accuracy: 0.6118 - loss: 1.3040 - val_accuracy: 0.6459 - val_loss: 1.2429
Epoch 4/10
[1m4355/4355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 8ms/step - accuracy: 0.6787 - loss: 1.0545 - val_accuracy: 0.6742 - val_loss: 1.1740
Epoch 5/10
[1m4355/4355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 8ms/step - accuracy: 0.7251 - loss: 0.8836 - val_accuracy: 0.6916 - val_loss: 1.1894
Epoch 6/10
[1m4355/4355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 8ms/step - accuracy: 0.7598 - loss: 0.7674 - val_accuracy: 0.7053 - val_loss: 1.2056
Epoch 7/10
[1m4355/4



Test Accuracy (ConvNet): 0.7309847474098206
