In [83]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB


In [153]:
# Build the labelled verse table: one row per verse of the KJV text file, with a
# 0/1 `new_testament` flag derived from the verse's position in the file.

# Load the whole file and lower-case it.
# NOTE(review): open() falls back to the platform default encoding here; pass
# encoding=... explicitly once the encoding of data/kjv.txt is confirmed.
with open('data/kjv.txt', 'r') as file:
    kjv = file.read().lower()

# Preprocess: one entry per line, dropping the first 2 lines of the file
# (presumably a header -- verify against the data file).
kjv_verses = kjv.splitlines()[2:]

# Position (0-based, within kjv_verses) of the first verse labelled as New
# Testament: rows before it get 0, rows from it onwards get 1.
first_new_testament_verse = 23145
if len(kjv_verses) < first_new_testament_verse:
    # Fail loudly instead of producing a label vector of the wrong length.
    raise ValueError(
        f"expected at least {first_new_testament_verse} verses, "
        f"found {len(kjv_verses)}"
    )

new_testament_labels = np.zeros(len(kjv_verses), dtype=np.int8)
new_testament_labels[first_new_testament_verse:] = 1

# Create the DataFrame with text and labels.
verses_data = pd.DataFrame({'text': kjv_verses, 'new_testament': new_testament_labels})

# Remove the leading "Book ch:verse" reference: everything (lazily) up to the
# first "<digits>:<digits>" plus the single character that follows it.
# Done as one vectorised string operation rather than an iterrows() loop that
# writes back through iloc using the index label as a position.
pattern = r"^.*?\d+:\d+."
verses_data['text'] = verses_data['text'].str.replace(pattern, "", regex=True)



In [154]:
# Inspect the rows from the last 0-labelled verse onwards to check where the
# new_testament flag flips from 0 to 1.
verses_data.iloc[first_new_testament_verse - 1:]

Unnamed: 0,text,new_testament
23144,and he shall turn the heart of the fathers to ...,0
23145,"the book of the generation of jesus christ, th...",1
23146,abraham begat isaac; and isaac begat jacob; an...,1
23147,and judas begat phares and zara of thamar; and...,1
23148,and aram begat aminadab; and aminadab begat na...,1
...,...,...
31097,"and the spirit and the bride say, come. and le...",1
31098,for i testify unto every man that heareth the ...,1
31099,and if any man shall take away from the words ...,1
31100,"he which testifieth these things saith, surely...",1


In [155]:
# Sanity check: (number of rows, number of columns) of the labelled verse table.
verses_data.shape

(31102, 2)

In [159]:
# Hold out 33% of the verses for evaluation. The fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    verses_data["text"],
    verses_data["new_testament"],
    test_size=0.33,
    random_state=69420,
)


In [160]:
# Bag-of-words baseline: token counts (English stop words removed) fed to a
# multinomial Naive Bayes classifier. The vocabulary is fitted on the training
# split only; the test split is transformed with that same vocabulary.
count_vectorizer = CountVectorizer(stop_words="english")
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

# fit() returns the estimator itself, so construction and training are chained.
nb_classifier = MultinomialNB().fit(count_train, y_train)
pred = nb_classifier.predict(count_test)

# Accuracy first, then the confusion matrix (rows = true label, columns =
# predicted label, both in the order [0, 1]).
score = metrics.accuracy_score(y_true=y_test, y_pred=pred)
cm = metrics.confusion_matrix(y_true=y_test, y_pred=pred, labels=[0, 1])
print(score)
print(cm)

0.9217653936087296
[[7277  324]
 [ 479 2184]]


In [166]:
# (number of training verses, number of vocabulary terms) of the count matrix.
count_train.shape

(20838, 10723)

In [161]:
# TF-IDF variant of the same experiment: English stop words removed and terms
# appearing in more than 70% of the training verses ignored (max_df=0.7).
# As before, the vectorizer is fitted on the training split only.
tfidf_vectorizer = TfidfVectorizer(stop_words="english", max_df=0.7)
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

# NOTE: this rebinds nb_classifier, pred, score and cm from the count-based cell.
nb_classifier = MultinomialNB().fit(tfidf_train, y_train)
pred = nb_classifier.predict(tfidf_test)

# Accuracy first, then the confusion matrix (rows = true label, columns =
# predicted label, both in the order [0, 1]).
score = metrics.accuracy_score(y_true=y_test, y_pred=pred)
cm = metrics.confusion_matrix(y_true=y_test, y_pred=pred, labels=[0, 1])
print(score)
print(cm)


0.890003897116134
[[7572   29]
 [1100 1563]]


In [167]:
# (number of training verses, number of vocabulary terms) of the TF-IDF matrix.
tfidf_train.shape

(20838, 10723)