In [1]:
import json
import tensorflow as tf

import numpy as np
from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import io
import re

In [2]:
def json_to_dict(file):
    diction= open(file)
    return json.load(diction)

def plain_to_dict(file):
    f= open(file, encoding="utf8")
    return json.load(f)

#Geology
geo1= json_to_dict('geologic_processes.json')
geo2= plain_to_dict('age_of_earth')
#Paleontolgy
paleo= plain_to_dict('paleontology.json')
#Cosmology
cosmo= plain_to_dict('cosmology')
#Ecology
eco= plain_to_dict('ecology_conservation')
#Faith and science
faith= plain_to_dict('faith_and_science')
#Intelligent desgin
design= plain_to_dict('inteligentDesign.json')
#Biology
bio= json_to_dict('Biology.json')
#Other sciences

#Join Geo
geo= {"geology": list(geo1.values())[0]+list(geo2.values())[0]}

In [177]:
def clean_text(rgx_list, text):
    new_text = text.lower()
    for rgx_match in rgx_list:
        new_text = re.sub(rgx_match, '', new_text)   
    return new_text

all_arts= []
all_labels= []

for count,top in enumerate([geo,paleo,cosmo,eco,faith,design]):
    arts= []

    for art in list(top.values())[0][:30]:
        arts.append(art['doc'])
    labels= [count]*len(arts)
    
    all_arts.append(arts)
    all_labels.append(labels)
    #print(arts)
    #print(labels)


all_arts= [item for sublist in all_arts for item in sublist]
all_labels= [item for sublist in all_labels for item in sublist]

patterns= [r'[\t\n\r\f\v\d]', r'(.+doi.+(Summary\. ?|\n)|.+DOI.+(Summary\. ?|\n))',
           r'\[[\w\.;,\- ]+\]', r'\([\w\.;,\- ]+\)', r'“[\w+]”', r'[\w+]”', r'“[\w+]',
           r'[F|f]ig[\.\w]+ [\w,\- ]+|see [F|f]ig[\.\w]+ [\w,\- ]+', r'[\"]',
           r'[.,\/#!$%\^&\*;:{}=\-\[\]_`~()“”]', r'[\']',r'[\’]',
           r'WHAT THIS ARTICLE IS ABOUT|ABSTRACT|INTRODUCTION|ACKNOWLEDGMENTS?|CONCLUSIONS?|Conclusions?|SUMMARY|DISCUSSION|Geoscience Research Institute',
           r'\bthe\b',r'\bof\b',r'\bin\b',r'\bare\b',r'\bthe[m|y]\b',r'\bi(s|f|t)\b',r'\ba(n|ll|nd|t)?\b',r'\bto\b',r'\bwhich\b',r'\bby\b',
           r'\bf?or\b',r'\bthose\b',r'\bfrom\b',r'\bto\b',r'\bbut\b',r'\bthese\b',r'>',r'\bwith\b',r'\xa0']

all_cleaned= []
for art in all_arts:
    art= re.split('ENDNOTES|Endnotes|REFERENCES|References|Footnotes|FOR FURTHER STUDY|LITERATURE CITED', art)[0]
    #art= clean_text(patterns, art)
    all_cleaned.append(clean_text(patterns, art))

In [184]:
re2= 'Dinosaaaur fossils at are them \"widespread \xa0all in they and Mesozoic a ladf inimaginable'
new_text= re.sub(r'[\"]', '', re2)
print(re2)
print(new_text)
#print(all_cleaned[0])

Dinosaaaur fossils at are them "widespread  all in they and Mesozoic a ladf inimaginable
Dinosaaaur fossils at are them widespread  all in they and Mesozoic a ladf inimaginable


In [185]:
#Sets
x_train, x_test, y_train, y_test= train_test_split(all_cleaned, all_labels, test_size=0.25)

training_labels= np.array(y_train)
testing_labels= np.array(y_test)

In [186]:
#Variables
vocab_size= 10000
max_length= 32
trunc_type= "post"
padding_type= "post"
oov_tok= "<OOV>"

#Tokenize
tokenizer= Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(x_train)
word_index= tokenizer.word_index

#Train sentences
training_sequences= tokenizer.texts_to_sequences(x_train)
training_padded= pad_sequences(training_sequences, maxlen=max_length,
                               padding=padding_type, truncating=trunc_type)

#Test sentences
testing_sequences= tokenizer.texts_to_sequences(x_test)
testing_padded= pad_sequences(testing_sequences, maxlen=max_length, 
                              padding=padding_type, truncating=trunc_type)

In [187]:
embedding_dim= 16

#Keras model
model= tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 32, 16)            160000    
_________________________________________________________________
global_average_pooling1d_1 ( (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 24)                408       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 25        
Total params: 160,433
Trainable params: 160,433
Non-trainable params: 0
_________________________________________________________________


In [188]:
#Train
num_epochs= 30

history= model.fit(training_padded, training_labels, epochs=num_epochs,
                   validation_data=(testing_padded, testing_labels), verbose=2)

Train on 102 samples, validate on 35 samples
Epoch 1/30
102/102 - 1s - loss: 0.6926 - accuracy: 0.2353 - val_loss: 0.6649 - val_accuracy: 0.1429
Epoch 2/30
102/102 - 0s - loss: 0.6607 - accuracy: 0.2451 - val_loss: 0.6326 - val_accuracy: 0.1429
Epoch 3/30
102/102 - 0s - loss: 0.6339 - accuracy: 0.2451 - val_loss: 0.5997 - val_accuracy: 0.1429
Epoch 4/30
102/102 - 0s - loss: 0.6037 - accuracy: 0.2451 - val_loss: 0.5650 - val_accuracy: 0.1429
Epoch 5/30
102/102 - 0s - loss: 0.5724 - accuracy: 0.2451 - val_loss: 0.5278 - val_accuracy: 0.1429
Epoch 6/30
102/102 - 0s - loss: 0.5387 - accuracy: 0.2451 - val_loss: 0.4883 - val_accuracy: 0.1429
Epoch 7/30
102/102 - 0s - loss: 0.5014 - accuracy: 0.2451 - val_loss: 0.4462 - val_accuracy: 0.1429
Epoch 8/30
102/102 - 0s - loss: 0.4638 - accuracy: 0.2451 - val_loss: 0.4006 - val_accuracy: 0.1429
Epoch 9/30
102/102 - 0s - loss: 0.4203 - accuracy: 0.2451 - val_loss: 0.3524 - val_accuracy: 0.1429
Epoch 10/30
102/102 - 0s - loss: 0.3754 - accuracy: 0.2

In [189]:
e= model.layers[0]
weights= e.get_weights()[0]
print(weights.shape)

(10000, 16)


In [190]:
rev_vocab= dict([(value,key) for (key,value) in word_index.items()])
print(rev_vocab)



In [191]:
#TSV
out_v= io.open('vecs.tsv', 'w', encoding='utf-8')
out_m= io.open('meta.tsv', 'w', encoding='utf-8')

for word_num in range(1, vocab_size):
    word= rev_vocab[word_num]
    embeddings= weights[word_num]
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")

out_v.close()
out_m.close()