# Predicting Bible passages and authors using machine learning

Dataset: Bible KJV

In [23]:
import pandas as pd
import joblib

In [1]:
from sklearn.feature_extraction.text import CountVectorizer #convert text into numbers

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score #get accuracy of model

In [12]:
# Load the KJV verse table; the path is resolved relative to the working directory.
kjv_csv = "kjvdata.csv"
df = pd.read_csv(filepath_or_buffer=kjv_csv)

# Leave the frame as the cell's final expression so the notebook renders it.
df

Unnamed: 0.1,Unnamed: 0,id,book,chapter,verse,text
0,0,1001001,Genesis,1,1,In the beginning God created the heaven and th...
1,1,1001002,Genesis,1,2,"And the earth was without form, and void; and ..."
2,2,1001003,Genesis,1,3,"And God said, Let there be light: and there wa..."
3,3,1001004,Genesis,1,4,"And God saw the light, that it was good: and G..."
4,4,1001005,Genesis,1,5,"And God called the light Day, and the darkness..."
...,...,...,...,...,...,...
31098,31098,66022017,Revelation,22,17,"And the Spirit and the bride say, Come. And le..."
31099,31099,66022018,Revelation,22,18,For I testify unto every man that heareth the ...
31100,31100,66022019,Revelation,22,19,And if any man shall take away from the words ...
31101,31101,66022020,Revelation,22,20,"He which testifieth these things saith, Surely..."


In [9]:
# The list of authors
#
# Maps each book name to the author string that is later used as the
# classification target (one entry per book, 66 in total).
# The keys are looked up against the `book` column with Series.map, so they
# must match the CSV spelling exactly; a book missing from this dict becomes NaN.
# NOTE(review): confirm that the parenthesised keys such as
# "1 Samuel (1 Kings)" match the book names stored in the CSV.
author_list = {
    # Old Testament
    "Genesis": "Moses",
    "Exodus": "Moses",
    "Leviticus": "Moses",
    "Numbers": "Moses",
    "Deuteronomy": "Moses",
    "Joshua": "Joshua",
    "Judges": "Samuel, Nathan, Gad",
    "Ruth": "Samuel, Nathan, Gad",
    "1 Samuel (1 Kings)": "Samuel, Nathan, Gad",
    "2 Samuel (2 Kings)": "Samuel, Nathan, Gad",
    "1 Kings (3 Kings)": "Jeremiah",
    "2 Kings (4 Kings)": "Jeremiah",
    "1 Chronicles": "Ezra",
    "2 Chronicles": "Ezra",
    "Ezra": "Ezra",
    "Nehemiah": "Nehemiah, Ezra",
    "Esther": "Mordecai",
    "Job": "Job,Moses",
    "Psalms": "David,Asaph, Ezra, the sons of Korah, Heman, Ethan, Moses",
    "Proverbs": "Solomon ,Agur(30) and Lemuel(31)",
    "Ecclesiastes": "Solomon",
    "Song of Solomon (Canticles)": "Solomon",
    "Isaiah": "Isaiah",
    "Jeremiah": "Jeremiah",
    "Lamentations": "Jeremiah",
    "Ezekiel": "Ezekiel",
    "Daniel": "Daniel",
    "Hosea": "Hosea",
    "Joel": "Joel",
    "Amos": "Amos",
    "Obadiah": "Obadiah",
    "Jonah": "Jonah",
    "Micah": "Micah",
    "Nahum": "Nahum",
    "Habakkuk": "Habakkuk",
    "Zephaniah": "Zephaniah",
    "Haggai": "Haggai",
    # Fixed: the value used to be " Zechariah" (leading space), which made
    # this label inconsistent with the book name and the other authors.
    "Zechariah": "Zechariah",
    "Malachi": "Malachi",
    # New Testament
    "Matthew": "Matthew",
    "Mark": "John Mark",
    "Luke": "Luke",
    "John": "John, the Apostle",
    "Acts": "Luke",
    "Romans": "Paul",
    "1 Corinthians": "Paul",
    "2 Corinthians": "Paul",
    "Galatians": "Paul",
    "Ephesians": "Paul",
    "Philippians": "Paul",
    "Colossians": "Paul",
    "1 Thessalonians": "Paul",
    "2 Thessalonians": "Paul",
    "1 Timothy": "Paul",
    "2 Timothy": "Paul",
    "Titus": "Paul",
    "Philemon": "Paul",
    "Hebrews": "Paul, Luke, Barnabas, Apollos",
    "James": "James the brother of Jesus and Jude (not the Apostle, brother of John).",
    "1 Peter": "Peter",
    "2 Peter": "Peter",
    "1 John": "John, the Apostle",
    "2 John": "John, the Apostle",
    "3 John": "John, the Apostle",
    "Jude": "Jude, the brother of Jesus",
    "Revelation": "John, the Apostle",
}

In [13]:
# Look up each verse's book in author_list and store the result in a new
# 'authors' column; a book name that is not a key of the dict maps to NaN.
verse_authors = df['book'].map(author_list)
df['authors'] = verse_authors

In [18]:
# NOTE: plain assignment binds a second name to the same DataFrame object (no
# copy is made), so writes made through df_new are also visible through df.
df_new= df

In [21]:
# Look up the row with index label 23144 (Malachi 4:6 in the saved output).
# NOTE(review): the saved output already shows a `label` value, so this cell
# appears to have last been run after the labelling cell further down.
df_new.loc[23144]

Unnamed: 0                                                23144
id                                                     39004006
book                                                    Malachi
chapter                                                       4
verse                                                         6
text          And he shall turn the heart of the fathers to ...
authors                                                 Malachi
label                                                         0
Name: 23144, dtype: object

In [None]:
# Label the Old Testament and New Testament verses
# Row 23144 is the last Old Testament verse (Malachi 4:6)

In [20]:
# Tag every verse with its testament: 0 = old testament, 1 = new testament.
# .loc slices by index label and includes both endpoints, so row 23144 is
# the final row that receives label 0.
LAST_OT_ROW = 23144

df_new.loc[0:LAST_OT_ROW, 'label'] = 0       # old testament
df_new.loc[LAST_OT_ROW + 1:, 'label'] = 1    # new testament

In [22]:
# Create features
# The model input is the verse text; the prediction target is the mapped author.
Xfeatures, ylabel = df_new['text'], df_new['authors']

# scikit-learn's CountVectorizer converts a collection of text documents into a matrix of token counts

In [26]:
# Build the document-term count matrix from the verse text.
# The vocabulary is learned from every verse in Xfeatures (i.e. before any
# train/test split is made).
cv = CountVectorizer()
X = cv.fit_transform(raw_documents=Xfeatures)

In [27]:
# Save the fitted vectorizer so the same vocabulary can be reloaded later.
# (A .pkl file stores a Python object serialized to disk.)
#
# The `with` block closes the file even if joblib.dump raises, instead of
# relying on a separate cell to close it. The handle keeps its original name,
# so a later bible_author_vectorizer.close() is a harmless no-op on the
# already-closed file.
with open("bible_author_vectorizer.pkl", "wb") as bible_author_vectorizer:
    joblib.dump(cv, bible_author_vectorizer)

In [30]:
# Close the pickle file handle that was used to save the vectorizer.
bible_author_vectorizer.close()


In [29]:
 # Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
 x_train, x_test, y_train, y_test = train_test_split(
     X,
     ylabel,
     test_size=0.33,
     random_state=42,
 )

In [31]:
# Model building for naive bayes
# Train a multinomial Naive Bayes classifier on the training split.
# fit() returns the estimator it was called on, so nb and nb_model name the
# same fitted object.
nb = MultinomialNB()
nb_model = nb.fit(X=x_train, y=y_train)

In [None]:
# check the accuracy score