In [1]:
import json
import tensorflow as tf

import numpy as np
from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import io
import re

In [3]:
def json_to_dict(file):
    diction= open(file)
    return json.load(diction)

def plain_to_dict(file):
    f= open(file, encoding="utf8")
    return json.load(f)

#Geology
geo1= json_to_dict('geologic_processes.json')
geo2= plain_to_dict('age_of_earth')
#Paleontolgy
paleo= plain_to_dict('paleontology.json')
#Cosmology
cosmo= plain_to_dict('cosmology')
#Ecology
eco= plain_to_dict('ecology_conservation')
#Faith and science
faith= plain_to_dict('faith_and_science')
#Intelligent desgin
design= plain_to_dict('inteligentDesign.json')
#Biology
bio= plain_to_dict('Biology.json')
#Other sciences

#Join Geo
geo= {"geology": list(geo1.values())[0]+list(geo2.values())[0]}

In [19]:
def clean_text(rgx_list, text):
    new_text = text.lower()
    for rgx_match in rgx_list:
        new_text = re.sub(rgx_match, '', new_text)   
    return new_text

In [68]:
all_arts= []
all_labels= []

for count,top in enumerate([geo,paleo,cosmo,eco,faith,design]):
    arts= []

    for art in list(top.values())[0]:#[:30]:
        arts.append(art['author'])
    labels= [count]*len(arts)
    
    all_arts.append(arts)
    all_labels.append(labels)
    #print(arts)
    #print(labels)


all_arts= [item for sublist in all_arts for item in sublist]
all_labels= [item for sublist in all_labels for item in sublist]

patterns= [#r'[\t\n\r\f\v\d]', r'(.+doi.+(Summary\. ?|\n)|.+DOI.+(Summary\. ?|\n))',
           #r'\[[\w\.;,\- ]+\]', r'\([\w\.;,\- ]+\)', r'“[\w+]”', r'[\w+]”', r'“[\w+]',
           #r'[F|f]ig[\.\w]+ [\w,\- ]+|see [F|f]ig[\.\w]+ [\w,\- ]+', r'[\"]',
           #r'[.,\/#!$%\^&\*;:{}=\-\[\]_`~()“”]', r'[\']',r'[\’]',
           #r'WHAT THIS ARTICLE IS ABOUT|ABSTRACT|INTRODUCTION|ACKNOWLEDGMENTS?|CONCLUSIONS?|Conclusions?|SUMMARY|DISCUSSION|Geoscience Research Institute',
           #r'\bthe\b',r'\bof\b',r'\bin\b',r'\bare\b',r'\bthe[m|y]\b',r'\bi(s|f|t)\b',r'\ba(n|ll|nd|t)?\b',r'\bto\b',r'\bwhich\b',r'\bby\b',
           #r'\bf?or\b',r'\bthose\b',r'\bfrom\b',r'\bto\b',r'\bbut\b',r'\bthese\b',r'>',r'\bwith\b',r'\xa0'
            r'[\w]+ ?[\w\.]? '
            ]

all_cleaned= []
for art in all_arts:
    #art= re.split('ENDNOTES|Endnotes|REFERENCES|References|Footnotes|FOR FURTHER STUDY|LITERATURE CITED', art)[0]
    #art= clean_text(patterns, art)
    all_cleaned.append(clean_text(patterns, art))

In [69]:
"""rt= 'Timothy G. Standish'
#new_text = re.sub(r'[\w]+ \w\. ', '', rt)
new_text = re.sub(r'[\w]+ ?[\w\.]? ', '', rt)
print(new_text)"""

"rt= 'Timothy G. Standish'\n#new_text = re.sub(r'[\\w]+ \\w\\. ', '', rt)\nnew_text = re.sub(r'[\\w]+ ?[\\w\\.]? ', '', rt)\nprint(new_text)"

In [70]:
#Sets
x_train, x_test, y_train, y_test= train_test_split(all_cleaned, all_labels, test_size=0.25)

training_labels= np.array(y_train)
testing_labels= np.array(y_test)

In [77]:
#Variables
#vocab_size= 642
vocab_size= 75
max_length= 32
trunc_type= "post"
padding_type= "post"
oov_tok= "<OOV>"

#Tokenize
tokenizer= Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(x_train)
word_index= tokenizer.word_index

#Train sentences
training_sequences= tokenizer.texts_to_sequences(x_train)
training_padded= pad_sequences(training_sequences, maxlen=max_length,
                               padding=padding_type, truncating=trunc_type)

#Test sentences
testing_sequences= tokenizer.texts_to_sequences(x_test)
testing_padded= pad_sequences(testing_sequences, maxlen=max_length, 
                              padding=padding_type, truncating=trunc_type)

In [78]:
embedding_dim= 16

#Keras model
model= tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 32, 16)            1200      
_________________________________________________________________
global_average_pooling1d_7 ( (None, 16)                0         
_________________________________________________________________
dense_14 (Dense)             (None, 24)                408       
_________________________________________________________________
dense_15 (Dense)             (None, 1)                 25        
Total params: 1,633
Trainable params: 1,633
Non-trainable params: 0
_________________________________________________________________


In [79]:
#Train
num_epochs= 30

history= model.fit(training_padded, training_labels, epochs=num_epochs,
                   validation_data=(testing_padded, testing_labels), verbose=2)

Train on 300 samples, validate on 101 samples
Epoch 1/30
300/300 - 1s - loss: 0.6079 - accuracy: 0.1533 - val_loss: 0.5083 - val_accuracy: 0.1485
Epoch 2/30
300/300 - 0s - loss: 0.4485 - accuracy: 0.1533 - val_loss: 0.3397 - val_accuracy: 0.1485
Epoch 3/30
300/300 - 0s - loss: 0.2690 - accuracy: 0.1533 - val_loss: 0.1329 - val_accuracy: 0.1485
Epoch 4/30
300/300 - 0s - loss: 0.0566 - accuracy: 0.1533 - val_loss: -1.0910e-01 - val_accuracy: 0.1485
Epoch 5/30
300/300 - 0s - loss: -1.9450e-01 - accuracy: 0.1533 - val_loss: -3.9325e-01 - val_accuracy: 0.1485
Epoch 6/30
300/300 - 0s - loss: -4.9019e-01 - accuracy: 0.1533 - val_loss: -7.3983e-01 - val_accuracy: 0.1485
Epoch 7/30
300/300 - 0s - loss: -8.5322e-01 - accuracy: 0.1533 - val_loss: -1.1544e+00 - val_accuracy: 0.1485
Epoch 8/30
300/300 - 0s - loss: -1.2852e+00 - accuracy: 0.1533 - val_loss: -1.6412e+00 - val_accuracy: 0.1485
Epoch 9/30
300/300 - 0s - loss: -1.7764e+00 - accuracy: 0.1533 - val_loss: -2.2088e+00 - val_accuracy: 0.1485

In [80]:
e= model.layers[0]
weights= e.get_weights()[0]
print(weights.shape)

(75, 16)


In [81]:
rev_vocab= dict([(value,key) for (key,value) in word_index.items()])
print(rev_vocab)

{1: '<OOV>', 2: 'roth', 3: 'brown', 4: 'gibson', 5: 'standish', 6: 'nalin', 7: 'ching', 8: 'clausen', 9: 'brand', 10: 'tkachuck', 11: 'esperante', 12: 'giem', 13: 'hasel', 14: 'jr', 15: 'shea', 16: 'davidson', 17: 'bergman', 18: 'chadwick', 19: 'zuill', 20: 'baldwin', 21: 'coffin', 22: 'suzuki', 23: 'johns', 24: 'wolfe', 25: 'guliuzza', 26: 'javor', 27: 'schafer', 28: 'doukhan', 29: 'rodríguez', 30: 'fraser', 31: 'neufeld', 32: 'kootsey', 33: 'sciarabba', 34: 'phillips', 35: 'mclain', 36: 'naledi', 37: 'frair', 38: 'wheeler', 39: 'kissinger', 40: 'biaggi', 41: 'smith', 42: 'hart', 43: 'younker', 44: 'clark', 45: 'ford', 46: 'miller', 47: 'graham', 48: 'kennedy', 49: 'aagaard', 50: 'camp', 51: 'steger', 52: 'gregor', 53: 'burdick', 54: 'wise', 55: 'snelling', 56: 'klingbeil', 57: 'lugeneal', 58: 'mccluskey', 59: 'price', 60: 'kotulla', 61: 'silva', 62: 'hayes', 63: 'dwyer', 64: 'marsh', 65: 'wood', 66: 'brandstater', 67: 'duran', 68: 'carter', 69: 'ritland', 70: 'boyle', 71: 'schoepflin

In [83]:
#TSV
out_v= io.open('vecs-author.tsv', 'w', encoding='utf-8')
out_m= io.open('meta-author.tsv', 'w', encoding='utf-8')

for word_num in range(1, vocab_size):
    word= rev_vocab[word_num]
    embeddings= weights[word_num]
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")

out_v.close()
out_m.close()