In [39]:
import pandas as pd
import nltk
import numpy as np
import logging
from gensim.models import word2vec

In [66]:
# Load the verse table from the CSV, using the numeric `id` column as the index.
# Column dtypes are pinned explicitly instead of being inferred by pandas.
# NOTE(review): b / c / v look like book / chapter / verse numbers and `t` holds
# the verse text (see the preview below) — confirm against the data source.
verse = pd.read_csv(
    './bible/t_asv.csv',
    index_col='id',
    dtype={
        'id': np.int64,
        'b': np.int32,
        'c': np.int32,
        'v': np.int32,
        't': object,
    },
)

# Preview the first few rows.
verse.head()


Unnamed: 0_level_0,b,c,v,t
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1001001,1,1,1,In the beginning God created the heavens and t...
1001002,1,1,2,And the earth was waste and void; and darkness...
1001003,1,1,3,"And God said, Let there be light: and there wa..."
1001004,1,1,4,"And God saw the light, that it was good: and G..."
1001005,1,1,5,"And God called the light Day, and the darkness..."


In [67]:
# Keep only the text column `t`, as a NumPy object array of verse strings.
# NOTE(review): this rebinds `verse` from the DataFrame to an array, so running
# this cell a second time without re-running the load cell will fail.
verse = verse["t"].to_numpy()
verse

array(['In the beginning God created the heavens and the earth.',
       'And the earth was waste and void; and darkness was upon the face of the deep: and the Spirit of God moved upon the face of the waters.',
       'And God said, Let there be light: and there was light.', ...,
       'and if any man shall take away from the words of the book of this prophecy, God shall take away his part from the tree of life, and out of the holy city, which are written in this book.',
       'He who testifieth these things saith, Yea: I come quickly. Amen: come, Lord Jesus.',
       'The grace of the Lord Jesus be with the saints. Amen.'],
      dtype=object)

In [68]:
# Split every verse into a list of word and punctuation tokens with NLTK.
verse_vec = list(map(nltk.word_tokenize, verse))

In [82]:
def in_list(item, L):
    """Return the index of the first element of ``L`` that contains ``item``.

    Args:
        item: Value to look for; membership is tested with ``item in element``.
        L: Iterable of containers (for example a list of token lists).

    Returns:
        The zero-based position of the first element containing ``item``,
        or -1 when no element contains it (including when ``L`` is empty).
    """
    # enumerate() yields the position directly. Looking it up again with
    # L.index() would rescan the list and match by equality, which can report
    # an earlier element that merely compares equal to the one found.
    for position, element in enumerate(L):
        if item in element:
            return position
    return -1
# Sanity check: position of the first tokenized verse that contains the token 'In'.
p = in_list('In', verse_vec)
p

0

In [83]:
# Lower-case every token so that words differing only in case are treated as the same word.
verse_vec = [list(map(str.lower, tokens)) for tokens in verse_vec]

In [126]:
# Inspect the token lists of the first two verses.
verse_vec[:2]

[['in',
  'the',
  'beginning',
  'god',
  'created',
  'the',
  'heavens',
  'and',
  'the',
  'earth',
  '.'],
 ['and',
  'the',
  'earth',
  'was',
  'waste',
  'and',
  'void',
  ';',
  'and',
  'darkness',
  'was',
  'upon',
  'the',
  'face',
  'of',
  'the',
  'deep',
  ':',
  'and',
  'the',
  'spirit',
  'of',
  'god',
  'moved',
  'upon',
  'the',
  'face',
  'of',
  'the',
  'waters',
  '.']]

In [114]:
#
# Train a word2vec model on the tokenized verses
#

# Emit gensim's INFO-level progress messages while the model trains.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# NOTE(review): 5000 dimensions exceeds the 3647-word vocabulary reported in the
# training log and is far above commonly used sizes — confirm this is intentional.
num_features = 5000    # Word vector dimensionality
min_word_count = 10    # Minimum word count
num_workers = 4        # Number of threads to run in parallel
context = 10           # Context window size
downsampling = 1e-3    # Downsample setting for frequent words

bible = verse_vec

model = word2vec.Word2Vec(
    bible,
    workers=num_workers,
    vector_size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

# Destructively scale every word vector to unit length. This is what the
# deprecated `model.init_sims(replace=True)` did under gensim 4.x; the model
# should not be trained further after this step.
model.wv.unit_normalize_all()
model.save("bible_word2vec.model")

2022-05-09 20:12:24,685 : INFO : collecting all words and their counts
2022-05-09 20:12:24,686 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-05-09 20:12:24,756 : INFO : PROGRESS: at sentence #10000, processed 324578 words, keeping 6659 word types
2022-05-09 20:12:24,810 : INFO : PROGRESS: at sentence #20000, processed 598525 words, keeping 10243 word types
2022-05-09 20:12:24,854 : INFO : PROGRESS: at sentence #30000, processed 886820 words, keeping 12616 word types
2022-05-09 20:12:24,863 : INFO : collected 12860 word types from a corpus of 919886 raw words and 31103 sentences
2022-05-09 20:12:24,865 : INFO : Creating a fresh vocabulary
2022-05-09 20:12:24,893 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=10 retains 3647 unique words (28.359253499222394%% of original 12860, drops 9213)', 'datetime': '2022-05-09T20:12:24.893363', 'gensim': '4.1.2', 'python': '3.9.7 (default, Sep 16 2021, 16:59:28) [MSC v.1916 64 bit (AMD64)]', 'platfor

2022-05-09 20:13:05,446 : INFO : EPOCH 4 - PROGRESS: at 54.20% examples, 42835 words/s, in_qsize 7, out_qsize 0
2022-05-09 20:13:06,535 : INFO : EPOCH 4 - PROGRESS: at 64.35% examples, 44484 words/s, in_qsize 7, out_qsize 0
2022-05-09 20:13:07,801 : INFO : EPOCH 4 - PROGRESS: at 72.66% examples, 44228 words/s, in_qsize 7, out_qsize 0
2022-05-09 20:13:08,880 : INFO : EPOCH 4 - PROGRESS: at 85.75% examples, 46039 words/s, in_qsize 7, out_qsize 0
2022-05-09 20:13:09,895 : INFO : EPOCH 4 - PROGRESS: at 92.85% examples, 45143 words/s, in_qsize 7, out_qsize 0
2022-05-09 20:13:10,347 : INFO : worker thread finished; awaiting finish of 3 more threads
2022-05-09 20:13:10,354 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-05-09 20:13:10,391 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-05-09 20:13:10,439 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-05-09 20:13:10,440 : INFO : EPOCH - 4 : training on 919886 raw words (5

In [115]:
# Ten words whose vectors are closest to "holy".
model.wv.most_similar(positive=["holy"], topn=10)

[('most', 0.6912201642990112),
 ('glory', 0.6666838526725769),
 ('established', 0.649179220199585),
 ('sacrifice', 0.6401773691177368),
 ('ever', 0.6295251846313477),
 ('hosts', 0.6253499984741211),
 ('everlasting', 0.6247296333312988),
 ('praise', 0.6096181869506836),
 ('statute', 0.6056853532791138),
 ('lacking', 0.5790976881980896)]

In [116]:
# Ten words whose vectors are closest to "women".
model.wv.most_similar(positive=["women"], topn=10)

[('herds', 0.7508175373077393),
 ('few', 0.742457389831543),
 ('provinces', 0.7401953339576721),
 ('persons', 0.7302664518356323),
 ('flocks', 0.7294304966926575),
 ('virgins', 0.7183959484100342),
 ('valor', 0.7167403697967529),
 ('men', 0.713136613368988),
 ('ethiopians', 0.6980056762695312),
 ('higher', 0.690645694732666)]

In [117]:
# Ten words whose vectors are closest to "light".
model.wv.most_similar(positive=["light"], topn=10)

[('darkness', 0.8008269667625427),
 ('fruit', 0.7168245315551758),
 ('moon', 0.7095617055892944),
 ('rain', 0.7095038890838623),
 ('paths', 0.6709146499633789),
 ('dew', 0.6701098680496216),
 ('shadow', 0.6497399806976318),
 ('just', 0.6442381739616394),
 ('heavens', 0.642227053642273),
 ('sown', 0.6301502585411072)]

In [118]:
# Ten words whose vectors are closest to "jesus".
model.wv.most_similar(positive=["jesus"], topn=10)

[('john', 0.7623591423034668),
 ('peter', 0.7215086221694946),
 ('elijah', 0.7126917839050293),
 ('paul', 0.6932657957077026),
 ('isaiah', 0.6793155670166016),
 ('lord', 0.678869903087616),
 ('samuel', 0.6737868785858154),
 ('prophet', 0.6701518297195435),
 ('answered', 0.6530582904815674),
 ('daniel', 0.6504835486412048)]

In [119]:
# Vector arithmetic: 'jesus' - 'man' + 'woman', then list the words closest to the result.
# The query words themselves are not excluded from the results.
express = model.wv['jesus'] - model.wv['man'] + model.wv['woman']
model.wv.similar_by_vector(express)

[('jesus', 0.7337614893913269),
 ('esther', 0.6782093048095703),
 ('peter', 0.6725742816925049),
 ('john', 0.6639114618301392),
 ('elijah', 0.6300449967384338),
 ('answered', 0.6278935670852661),
 ('angel', 0.6250754594802856),
 ('mordecai', 0.6212992668151855),
 ('well', 0.6045417785644531),
 ('queen', 0.5918430089950562)]

In [120]:
# Vector arithmetic: 'money' - 'evil' + 'good', then list the words closest to the result.
# The query words themselves are not excluded from the results.
express = model.wv['money'] - model.wv['evil'] + model.wv['good']
model.wv.similar_by_vector(express)

[('money', 0.8048887848854065),
 ('sojourner', 0.7585711479187012),
 ('fatherless', 0.706995964050293),
 ('bottle', 0.7062492966651917),
 ('friend', 0.7016578912734985),
 ('wine', 0.6925674676895142),
 ('raiment', 0.6878551840782166),
 ('estimation', 0.68038010597229),
 ('firstling', 0.6701173782348633),
 ('fair', 0.6689985990524292)]

In [121]:
# Vector arithmetic: 'king' - 'queen' + 'he', then list the words closest to the result.
# The query words themselves are not excluded from the results.
express = model.wv['king'] - model.wv['queen'] + model.wv['he']
model.wv.similar_by_vector(express)


[('he', 0.7677114605903625),
 ('him', 0.534227192401886),
 ('himself', 0.5123739838600159),
 ('king', 0.46371322870254517),
 ('death', 0.41993841528892517),
 ('man', 0.4152510464191437),
 ('pharaoh', 0.38624608516693115),
 ('hand', 0.37442007660865784),
 ('jehovah', 0.3670368492603302),
 ('his', 0.36421963572502136)]