In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import random

In [2]:
def bible_version_information():
    key = pd.read_csv(r'bible/bible_version_key.csv')
    return display(key[["abbreviation", "version"]])

def read_bible(version="KJV"):
    if version == "darby":
        return pd.read_csv(r'bible/t_dby.csv', encoding='latin1')
    return pd.read_csv(r'bible/t_'+version+'.csv')

def display_bible(version):
    bible_data = join_keys(read_bible(version))
    return bible_data.head()

def join_keys(bible_data):
    key_english = pd.read_csv(r'bible/key_english.csv')
    key_genre_english = pd.read_csv(r'bible/key_genre_english.csv')
    bible = pd.merge(bible_data, key_english, on=['b'])
    bible = pd.merge(bible, key_genre_english, on=['g'])
    bible.columns = [
    "id", "book_number", "chapter_number", "verse_number", 
     "verse_text", "book_name", "testament", "genre_number", "genre_name"]
    return bible

def flatten_list(input_list):
    flat_list = []
    for sublist in input_list:
        for item in sublist:
            flat_list.append(item)
    return flat_list

def prepare_data(bible):
    #genre_name_list = bible.groupby(["book_number"]).max()["genre_name"]
    book_list = []
    for book in bible["book_name"].unique():
        temp_data = bible[bible["book_name"]==book]
        temp_list = []
        for chapter in temp_data["chapter_number"].unique():
            temp_chapters = temp_data[temp_data["chapter_number"]==chapter]
            temp = list(temp_data[temp_data["chapter_number"]==chapter]["verse_text"])
            temp = " ".join(temp)
            temp_list.append(temp)
        book_list.append(temp_list)
    chapters_list = []
    for item in book_list:
        chapters_list.append(" ".join(item))
    prepared_data = pd.DataFrame(pd.Series(chapters_list), columns = ["text"])
    print("Prepared data:")
    display(prepared_data.head())
    return prepared_data

def preprocess_data(input_data):
    cv = CountVectorizer(max_df=0.95, min_df=10, stop_words='english')
    dtm = cv.fit_transform(input_data['text'])
    return dtm, cv

def choose_components():
    print("Please choose the number of components. Enter 0 to choose default value.")
    components = int(input())
    if components == 0:
        components = 8
    return components

def perform_lda(dtm):
    components = choose_components()
    print("Now performing LDA. This may take a couple of minutes. Please wait.")
    LDA = LatentDirichletAllocation(n_components=8,random_state=42)
    LDA.fit(dtm)
    return LDA
    

In [3]:
bible_version_information()

Unnamed: 0,abbreviation,version
0,ASV,American Standard-ASV1901
1,BBE,Bible in Basic English
2,DARBY,Darby English Bible
3,KJV,King James Version
4,WBT,Webster's Bible
5,WEB,World English Bible
6,YLT,Young's Literal Translation


In [4]:
def lda_function():
    print("Please choose a version:")
    version = input()
    bible_data = join_keys(read_bible(version))
    input_data = prepare_data(bible_data)
    dtm, cv = preprocess_data(input_data)
    lda = perform_lda(dtm)
    return cv, lda, dtm, input_data

In [5]:
cv, LDA, dtm, input_data = lda_function()

Please choose a version:
KJV
Prepared data:


Unnamed: 0,text
0,In the beginning God created the heaven and th...
1,Now these are the names of the children of Isr...
2,"And the LORD called unto Moses, and spake unto..."
3,And the LORD spake unto Moses in the wildernes...
4,These be the words which Moses spake unto all ...


Please choose the number of components. Enter 0 to choose default value.
0
Now performing LDA. This may take a couple of minutes. Please wait.


## Showing Stored Words

In [6]:
len(cv.get_feature_names())

1946

In [7]:
for i in range(10):
    random_word_id = random.randint(0,len(cv.get_feature_names()))
    print(cv.get_feature_names()[random_word_id])

famine
cedars
ways
summer
prophesied
sorrows
paths
present
furthermore
multiply


In [8]:
for i in range(10):
    random_word_id = random.randint(0,len(cv.get_feature_names()))
    print(cv.get_feature_names()[random_word_id])

nay
solemn
raised
glorified
continue
assyria
son
levi
appoint
humble


### Showing Top Words Per Topic

In [9]:
len(LDA.components_)

8

In [10]:
LDA.components_

array([[1.25193420e-01, 1.25027620e-01, 1.25041156e-01, ...,
        1.25032848e-01, 1.25031671e-01, 1.25014325e-01],
       [1.09114091e+01, 3.71304557e+00, 6.86367399e+00, ...,
        1.48284824e+00, 1.25158677e-01, 1.26535941e+02],
       [1.25027767e-01, 3.03752170e+00, 1.28794580e+00, ...,
        4.15272766e+00, 1.17305180e+01, 2.58679002e+01],
       ...,
       [7.97814537e+00, 3.87368065e+00, 1.83608483e+00, ...,
        4.64545118e+00, 1.25047600e-01, 1.25019765e-01],
       [1.25003074e-01, 4.87510425e+00, 1.91003562e+00, ...,
        1.25001617e-01, 1.25009217e-01, 1.25008473e-01],
       [1.25201709e-01, 1.25502361e-01, 1.25004600e-01, ...,
        1.25006500e-01, 1.25011925e-01, 1.25011078e-01]])

In [11]:
len(LDA.components_[0])

1946

In [12]:
single_topic = LDA.components_[0]

In [13]:
# Returns the indices that would sort this array.
single_topic.argsort()

array([ 997, 1172,  739, ..., 1708, 1933, 1416], dtype=int64)

In [14]:
# Word least representative of this topic
single_topic[single_topic.argsort()[0]]

0.12500060696406734

In [15]:
# Word most representative of this topic
single_topic[single_topic.argsort().max()]

0.12501432484445793

In [16]:
# Top 10 words for this topic:
single_topic.argsort()[-10:]

array([1438, 1694,  575, 1440,  226, 1029,  888, 1708, 1933, 1416],
      dtype=int64)

In [17]:
top_word_indices = single_topic.argsort()[-20:]

In [18]:
for index in top_word_indices:
    print(cv.get_feature_names()[index])

sent
behold
day
saw
answered
saith
things
thy
went
son
say
thee
father
saying
came
man
jesus
thou
ye
said


These look like business articles perhaps... Let's confirm by using .transform() on our vectorized articles to attach a label number. But first, let's view all the 10 topics found.

In [19]:
for index,topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([cv.get_feature_names()[i] for i in topic.argsort()[-35:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['world', 'kingdom', 'house', 'people', 'spake', 'cast', 'did', 'away', 'heaven', 'forth', 'took', 'know', 'lord', 'heard', 'called', 'sent', 'behold', 'day', 'saw', 'answered', 'saith', 'things', 'thy', 'went', 'son', 'say', 'thee', 'father', 'saying', 'came', 'man', 'jesus', 'thou', 'ye', 'said']


THE TOP 15 WORDS FOR TOPIC #1
['israel', 'mouth', 'glory', 'hear', 'children', 'away', 'wicked', 'zion', 'forth', 'enemies', 'great', 'fear', 'yea', 'strength', 'art', 'righteousness', 'mercy', 'day', 'make', 'man', 'thine', 'hand', 'praise', 'soul', 'heart', 'like', 'earth', 'ye', 'people', 'hast', 'let', 'thee', 'thou', 'thy', 'lord']


THE TOP 15 WORDS FOR TOPIC #2
['city', 'pass', 'hast', 'egypt', 'bring', 'hosts', 'word', 'babylon', 'make', 'forth', 'earth', 'jerusalem', 'say', 'came', 'hand', 'shalt', 'saying', 'thereof', 'judah', 'said', 'son', 'day', 'king', 'man', 'people', 'house', 'behold', 'israel', 'land', 'thy', 'ye', 'thou', 'saith', 'thee', 'lo

### Attaching Discovered Topic Labels to Original Articles

In [20]:
dtm

<66x1946 sparse matrix of type '<class 'numpy.int64'>'
	with 41260 stored elements in Compressed Sparse Row format>

In [21]:
dtm.shape

(66, 1946)

In [22]:
len(input_data)

66

In [23]:
topic_results = LDA.transform(dtm)

In [24]:
topic_results.shape

(66, 8)

In [25]:
topic_results[0]

array([2.05185159e-01, 1.00429356e-05, 1.93415541e-01, 1.00435749e-05,
       3.53636784e-01, 2.47722343e-01, 1.00441987e-05, 1.00416118e-05])

In [26]:
topic_results[0].round(2)

array([0.21, 0.  , 0.19, 0.  , 0.35, 0.25, 0.  , 0.  ])

In [27]:
topic_results[0].argmax()

4

This means that our model thinks that the first article belongs to topic #1.

### Combining with Original Data

In [28]:
input_data.head()

Unnamed: 0,text
0,In the beginning God created the heaven and th...
1,Now these are the names of the children of Isr...
2,"And the LORD called unto Moses, and spake unto..."
3,And the LORD spake unto Moses in the wildernes...
4,These be the words which Moses spake unto all ...


In [29]:
topic_results.argmax(axis=1)

array([4, 5, 3, 3, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 1, 6, 6, 1,
       1, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2, 0, 0, 0, 0, 7,
       7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7],
      dtype=int64)

In [30]:
len(topic_results.argmax(axis=1))

66

In [31]:
input_data.shape

(66, 1)

In [32]:
input_data['Topic'] = topic_results.argmax(axis=1)

In [37]:
input_data.head(10)

Unnamed: 0,text,Topic
1,In the beginning God created the heaven and th...,4
2,Now these are the names of the children of Isr...,5
3,"And the LORD called unto Moses, and spake unto...",3
4,And the LORD spake unto Moses in the wildernes...,3
5,These be the words which Moses spake unto all ...,5
6,Now after the death of Moses the servant of th...,4
7,"Now after the death of Joshua it came to pass,...",4
8,Now it came to pass in the days when the judge...,4
9,Now there was a certain man of Ramathaimzophim...,4
10,"Now it came to pass after the death of Saul, w...",4
