In [14]:
import pandas as pd

# columns
columns = ['verse_id', 'book_id', 'chapter', 'verse', 'text']


# Example with KJV CSV file
kjv = pd.read_csv('bible_data/bible_databases-master/bible_databases-master/csv/t_kjv.csv')
kjv.columns = columns

key_english = pd.read_csv('bible_data/bible_databases-master/bible_databases-master/csv/key_english.csv')
key_english.columns = ['book_id', 'book_name', 'testament', 'genre']

kjv = kjv.merge(key_english, on='book_id', how='left', suffixes=(None, '_key'))

import re
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation and numbers
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

kjv['cleaned_text'] = kjv['text'].apply(preprocess_text)

# Extract features and labels
texts = kjv['cleaned_text']
labels = kjv['book_name']

# Encode labels into integers
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# Split into training and test sets
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(texts, labels_encoded, test_size=0.2, random_state=42)

from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer
vectorizer = CountVectorizer(max_features=5000)  # Limit to 5000 most common words
x_train_vectorized = vectorizer.fit_transform(x_train)
x_test_vectorized = vectorizer.transform(x_test)

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Train a Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(x_train_vectorized, y_train)

# Evaluate the model
y_pred = nb_model.predict(x_test_vectorized)
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Train a Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(x_train_vectorized, y_train)

# Evaluate the model
y_pred = nb_model.predict(x_test_vectorized)
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

from sklearn.linear_model import LogisticRegression

# Train a Logistic Regression model
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(x_train_vectorized, y_train)

# Evaluate the model
y_pred_logistic = logistic_model.predict(x_test_vectorized)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_logistic))
print(classification_report(y_test, y_pred_logistic, target_names=label_encoder.classes_))

import joblib

# Save the vectorizer and models
joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(nb_model, 'naive_bayes_model.pkl')
joblib.dump(logistic_model, 'logistic_regression_model.pkl')

   verse_id  book_id  chapter  verse  \
0   1001001        1        1      1   
1   1001002        1        1      2   
2   1001003        1        1      3   
3   1001004        1        1      4   
4   1001005        1        1      5   

                                                text book_name testament  \
0  In the beginning God created the heaven and th...   Genesis        OT   
1  And the earth was without form, and void; and ...   Genesis        OT   
2  And God said, Let there be light: and there wa...   Genesis        OT   
3  And God saw the light, that it was good: and G...   Genesis        OT   
4  And God called the light Day, and the darkness...   Genesis        OT   

   genre  
0      1  
1      1  
2      1  
3      1  
4      1  


Naive Bayes Accuracy: 0.4648770294164925
                 precision    recall  f1-score   support

   1 Chronicles       0.50      0.53      0.52       189
  1 Corinthians       0.35      0.31      0.33        84
         1 John       0.67      0.09      0.16        22
        1 Kings       0.52      0.37      0.43       169
        1 Peter       0.00      0.00      0.00        20
       1 Samuel       0.60      0.68      0.64       175
1 Thessalonians       0.00      0.00      0.00        10
      1 Timothy       0.50      0.04      0.07        25
   2 Chronicles       0.40      0.45      0.42       167
  2 Corinthians       0.58      0.23      0.33        47
         2 John       0.00      0.00      0.00         2
        2 Kings       0.43      0.45      0.44       140
        2 Peter       0.00      0.00      0.00        10
       2 Samuel       0.54      0.40      0.46       153
2 Thessalonians       0.00      0.00      0.00         6
      2 Timothy       0.00      0.00      0.00

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Logistic Regression Accuracy: 0.48963189197878154
                 precision    recall  f1-score   support

   1 Chronicles       0.44      0.54      0.49       189
  1 Corinthians       0.34      0.36      0.35        84
         1 John       0.20      0.09      0.12        22
        1 Kings       0.51      0.41      0.45       169
        1 Peter       0.00      0.00      0.00        20
       1 Samuel       0.71      0.65      0.68       175
1 Thessalonians       0.12      0.10      0.11        10
      1 Timothy       0.67      0.24      0.35        25
   2 Chronicles       0.42      0.40      0.41       167
  2 Corinthians       0.44      0.34      0.39        47
         2 John       0.00      0.00      0.00         2
        2 Kings       0.54      0.46      0.50       140
        2 Peter       0.50      0.10      0.17        10
       2 Samuel       0.53      0.41      0.46       153
2 Thessalonians       0.00      0.00      0.00         6
      2 Timothy       0.50      0.05 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


['logistic_regression_model.pkl']