In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import random

Function to read the versions available.

In [2]:
def bible_version_information():
    key = pd.read_csv(r'bible/bible_version_key.csv')
    return display(key[["abbreviation", "version"]])

Function to read the version of Bible input by user

In [3]:
def read_bible(version="KJV"):
    if version == "darby":
        return pd.read_csv(r'bible/t_dby.csv', encoding='latin1')
    return pd.read_csv(r'bible/t_'+version+'.csv')

Function to display the versions available.

In [4]:
def display_bible(version):
    bible_data = join_keys(read_bible(version))
    return bible_data.head()

Joining tables to create one dataframe.

In [5]:
def join_keys(bible_data):
    key_english = pd.read_csv(r'bible/key_english.csv')
    key_genre_english = pd.read_csv(r'bible/key_genre_english.csv')
    bible = pd.merge(bible_data, key_english, on=['b'])
    bible = pd.merge(bible, key_genre_english, on=['g'])
    bible.columns = [
    "id", "book_number", "chapter_number", "verse_number", 
     "verse_text", "book_name", "testament", "genre_number", "genre_name"]
    return bible

Function to flatten a list.

In [6]:
def flatten_list(input_list):
    flat_list = []
    for sublist in input_list:
        for item in sublist:
            flat_list.append(item)
    return flat_list

Function to prepare the data

In [7]:
def prepare_data(bible):
    #genre_name_list = bible.groupby(["book_number"]).max()["genre_name"]
    book_list = []
    for book in bible["book_name"].unique():
        temp_data = bible[bible["book_name"]==book]
        temp_list = []
        for chapter in temp_data["chapter_number"].unique():
            temp_chapters = temp_data[temp_data["chapter_number"]==chapter]
            temp = list(temp_data[temp_data["chapter_number"]==chapter]["verse_text"])
            temp = " ".join(temp)
            temp_list.append(temp)
        book_list.append(temp_list)
    chapters_list = []
    for item in book_list:
        chapters_list.append(" ".join(item))
    prepared_data = pd.DataFrame(pd.Series(chapters_list), columns = ["text"])
    print("Prepared data:")
    display(prepared_data.head())
    return prepared_data

Function to preprocess the data

In [8]:
def preprocess_data(input_data):
    cv = CountVectorizer(max_df=0.95, min_df=10, stop_words='english')
    dtm = cv.fit_transform(input_data['text'])
    return dtm, cv

Function to ask the user to input the number of topics to divide the books of the Bible into.

In [9]:
def choose_components():
    print("Please choose the number of components. Enter 0 to choose default value.")
    components = int(input())
    if components == 0:
        components = 8
    return components

Function to perform Latent Dirichlet Allocation with the number of components from the previous step

In [10]:
def perform_lda(dtm):
    components = choose_components()
    print("Now performing LDA. This may take a couple of minutes. Please wait.")
    LDA = LatentDirichletAllocation(n_components=8,random_state=42)
    LDA.fit(dtm)
    return LDA

Printing Bible versions available.

In [11]:
bible_version_information()

Unnamed: 0,abbreviation,version
0,ASV,American Standard-ASV1901
1,BBE,Bible in Basic English
2,DARBY,Darby English Bible
3,KJV,King James Version
4,WBT,Webster's Bible
5,WEB,World English Bible
6,YLT,Young's Literal Translation


This is the main function. It calls rest of the functions in the notebook.

In [12]:
def lda_function():
    print("Please choose a version:")
    version = input()
    bible_data = join_keys(read_bible(version))
    input_data = prepare_data(bible_data)
    dtm, cv = preprocess_data(input_data)
    lda = perform_lda(dtm)
    return cv, lda, dtm, input_data

In [14]:
cv, LDA, dtm, input_data = lda_function()

Please choose a version:
bbe
Prepared data:


Unnamed: 0,text
0,At the first God made the heaven and the earth...
1,Now these are the names of the sons of Israel ...
2,And the voice of the Lord came to Moses out of...
3,And the Lord said to Moses in the waste land o...
4,These are the words which Moses said to all Is...


Please choose the number of components. Enter 0 to choose default value.
7
Now performing LDA. This may take a couple of minutes. Please wait.


## Showing Stored Words

In [18]:
len(cv.get_feature_names())

1287

In [19]:
for i in range(10):
    random_word_id = random.randint(0,len(cv.get_feature_names()))
    print(cv.get_feature_names()[random_word_id])

bear
nose
discovery
sorts
conscious
right
hearted
mark
lord
rolling


### Showing Top Words Per Topic

In [20]:
for i in range(10):
    random_word_id = random.randint(0,len(cv.get_feature_names()))
    print(cv.get_feature_names()[random_word_id])

mark
women
present
decision
window
covers
square
building
needed
bag


In [21]:
len(LDA.components_)

8

In [22]:
LDA.components_

array([[1.78080698e-01, 2.70724758e+00, 1.36376041e+02, ...,
        1.53866025e+00, 1.25005057e-01, 1.30069207e+00],
       [1.25056115e-01, 1.09717196e+01, 2.39639223e+01, ...,
        7.05323455e+00, 3.53932061e+00, 1.25024137e-01],
       [3.31691885e+02, 8.48454053e-01, 6.42680282e+01, ...,
        1.73487949e+01, 1.25120002e-01, 1.25021462e-01],
       ...,
       [1.34681159e-01, 1.25027966e-01, 6.49793870e+01, ...,
        2.91279387e+00, 9.84402011e+00, 1.49868806e+02],
       [1.25017976e-01, 1.25144299e-01, 7.77479420e+01, ...,
        1.25083779e-01, 1.25004406e-01, 1.25090497e-01],
       [7.02847351e+00, 8.97195493e+00, 7.72951487e+01, ...,
        1.87709277e+01, 1.19914514e+01, 2.29917192e+00]])

In [23]:
single_topic = LDA.components_[0]

In [24]:
# Returns the indices that would sort this array.
single_topic.argsort()

array([ 629,   65, 1160, ...,  543,  128,  915], dtype=int64)

In [25]:
top_word_indices = single_topic.argsort()[-20:]

In [26]:
for index in top_word_indices:
    print(cv.get_feature_names()[index])

answer
house
time
death
knowledge
saw
sent
father
saying
things
people
took
son
away
gave
man
went
jesus
came
said


In [27]:
# Top 10 words for this topic:
single_topic.argsort()[-10:]

array([ 775, 1128, 1008,   51,  416,  640, 1225,  543,  128,  915],
      dtype=int64)

These look like business articles perhaps... Let's confirm by using .transform() on our vectorized articles to attach a label number. But first, let's view all the 10 topics found.

In [28]:
for index,topic in enumerate(LDA.components_):
    print(f'The most common words for topic #{index}')
    print([cv.get_feature_names()[i] for i in topic.argsort()[-35:]])
    print('\n')

The most common words for topic #0
['jews', 'life', 'spirit', 'faith', 'number', 'evil', 'dead', 'got', 'food', 'day', 'did', 'place', 'heaven', 'words', 'lord', 'answer', 'house', 'time', 'death', 'knowledge', 'saw', 'sent', 'father', 'saying', 'things', 'people', 'took', 'son', 'away', 'gave', 'man', 'went', 'jesus', 'came', 'said']


The most common words for topic #1
['brothers', 'sent', 'house', 'good', 'isaac', 'away', 'food', 'egypt', 'children', 'birth', 'saw', 'pharaoh', 'death', 'blessing', 'brother', 'place', 'years', 'servant', 'abraham', 'living', 'man', 'earth', 'joseph', 'wife', 'sons', 'son', 'took', 'jacob', 'lord', 'land', 'came', 'went', 'father', 'gave', 'said']


The most common words for topic #2
['food', 'meal', 'work', 'house', 'away', 'blood', 'gave', 'meeting', 'came', 'egypt', 'orders', 'altar', 'took', 'son', 'tent', 'burned', 'sons', 'unclean', 'place', 'sin', 'went', 'day', 'priest', 'death', 'aaron', 'man', 'holy', 'people', 'said', 'israel', 'land', 'chi

### Attaching Discovered Topic Labels to Original Articles

In [29]:
topic_results = LDA.transform(dtm)

In [30]:
topic_results.shape

(66, 8)

### Combining with Original Data

In [31]:
input_data.head()

Unnamed: 0,text
0,At the first God made the heaven and the earth...
1,Now these are the names of the sons of Israel ...
2,And the voice of the Lord came to Moses out of...
3,And the Lord said to Moses in the waste land o...
4,These are the words which Moses said to all Is...


In [32]:
topic_results.argmax(axis=1)

array([1, 2, 2, 2, 5, 7, 7, 1, 7, 7, 7, 7, 7, 7, 7, 7, 4, 5, 5, 6, 6, 5,
       5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0,
       3, 6, 6, 3, 3, 3, 3, 3, 3, 6, 3, 6, 3, 3, 6, 6, 6, 6, 6, 6, 3, 4],
      dtype=int64)

In [33]:
len(topic_results.argmax(axis=1))

66

In [34]:
input_data.shape

(66, 1)

In [35]:
input_data['Topic'] = topic_results.argmax(axis=1)

The reult is stored in input_data. Here are the first few rows.

In [36]:
input_data.head(10)

Unnamed: 0,text,Topic
0,At the first God made the heaven and the earth...,1
1,Now these are the names of the sons of Israel ...,2
2,And the voice of the Lord came to Moses out of...,2
3,And the Lord said to Moses in the waste land o...,2
4,These are the words which Moses said to all Is...,5
5,"Now after the death of Moses, the servant of t...",7
6,"Now after the death of Joshua, the children of...",7
7,"Now there came a time, in the days of the judg...",1
8,"Now there was a certain man of Ramathaim, a Zu...",7
9,"Now after the death of Saul, when David, havin...",7
