In [1]:
import json
import tensorflow as tf

import numpy as np
from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import io
import re

In [2]:
def json_to_dict(file):
    diction= open(file)
    return json.load(diction)

def plain_to_dict(file):
    f= open(file, encoding="utf8")
    return json.load(f)

def clean_text(rgx_list, text):
    new_text = text.lower()
    for rgx_match in rgx_list:
        new_text = re.sub(rgx_match, '', new_text)   
    return new_text

In [3]:
#Geology
geo1= json_to_dict('geologic_processes.json')
geo2= plain_to_dict('age_of_earth')
#Paleontolgy
paleo= plain_to_dict('paleontology.json')
#Cosmology
cosmo= plain_to_dict('cosmology')
#Ecology
eco= plain_to_dict('ecology_conservation')
#Faith and science
faith= plain_to_dict('faith_and_science')
#Intelligent desgin
design= plain_to_dict('inteligentDesign.json')
#Biology
bio= plain_to_dict('Biology.json')
#Other sciences

#Join Geo
geo= {"geology": list(geo1.values())[0]+list(geo2.values())[0]}

#all_data= plain_to_dict('allDataset.json')

In [4]:
all_data= {
            'geology': geo,
            'paleontology': paleo,
            'cosmology': cosmo,
            'ecology': eco,
            'faith': faith,
            'design': design,
            'biology': bio
          }
#print(all_data)
#import json
with open('all_data.json', 'w') as fp:
    json.dump(all_data, fp)

In [45]:
all_arts= []
all_labels= []

for count,top in enumerate([geo,paleo,cosmo,eco,faith,design,bio]):
#for count,top in enumerate([all_data]):
    arts= []

    for art in list(top.values())[0][:30]:
        arts.append(art['doc'])
    labels= [count]*len(arts)
    
    all_arts.append(arts)
    all_labels.append(labels)
    #print(arts)
    #print(labels)


all_arts= [item for sublist in all_arts for item in sublist]
all_labels= [item for sublist in all_labels for item in sublist]

patterns= [r'[\t\n\r\f\v\d]', r'(.+doi.+(Summary\. ?|\n)|.+DOI.+(Summary\. ?|\n))',
           r'\[[\w\.;,\- ]+\]', r'\([\w\.;,\- ]+\)'#, r'“[\w+]”', r'[\w+]”', r'“[\w+]',
           r'[F|f]ig[\.\w]+ [\w,\- ]+|see [F|f]ig[\.\w]+ [\w,\- ]+',# r'[\"]',
           r'[.,\/#!$%\^&\*;:{}=\-\[\]_`~()“”…—\"\"‘’\'\'>–]',# r'[\']',r'[\’]',
           r'WHAT THIS ARTICLE IS ABOUT|ABSTRACT|INTRODUCTION|ACKNOWLEDGMENTS?|CONCLUSIONS?|Conclusions?|SUMMARY|DISCUSSION|Geoscience Research Institute',
           #r'\bthe\b',r'\bof\b',r'\bin\b',r'\bare\b',r'\bthe[m|y]\b',r'\bi(s|f|t)\b',r'\ba(n|ll|nd|t)?\b',r'\bto\b',r'\bwhich\b',r'\bby\b',
           #r'\bf?or\b',r'\bthose\b',r'\bfrom\b',r'\bto\b',r'\bbut\b',r'\bmuch\b',r'\bthese\b',r'\bthere\b',r'\bwith\b',r'\xa0',
           r'\bthese\b',r'\bthere\b',r'\bwhich\b',r'\xa0',
           r'\b\w{,5}\b',r'\b\w{18,}\b']

all_cleaned= []
for art in all_arts:
    art= re.split('ENDNOTES|Endnotes|REFERENCES|References|Footnotes|FOR FURTHER STUDY|LITERATURE CITED', art)[0]
    #art= clean_text(patterns, art)
    all_cleaned.append(clean_text(patterns, art))

In [46]:
re2= 'Dinosaaaur fossils at are –- them \"widespre\'ad \xa0all in ‘penn>sylvan-i’aeuringer layer—were'
new_text= re.sub(r'[.,\/#!$%\^&\*;:{}=\-\[\]_`~()“”…—\"\"‘’\'\'>–]', '', re2)
#new_text2= re.sub(r'\b\w{20,}\b', '', new_text)
print(re2)
print(new_text)
#print(new_text2)
#print(all_cleaned[0])

Dinosaaaur fossils at are –- them "widespre'ad  all in ‘penn>sylvan-i’aeuringer layer—were
Dinosaaaur fossils at are  them widespread  all in pennsylvaniaeuringer layerwere


In [47]:
#Sets
x_train, x_test, y_train, y_test= train_test_split(all_cleaned, all_labels, test_size=0.25)

training_labels= np.array(y_train)
testing_labels= np.array(y_test)

In [48]:
#Variables
vocab_size= 10000
max_length= 32
trunc_type= "post"
padding_type= "post"
oov_tok= "<OOV>"

#Tokenize
tokenizer= Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(x_train)
word_index= tokenizer.word_index

#Train sentences
training_sequences= tokenizer.texts_to_sequences(x_train)
training_padded= pad_sequences(training_sequences, maxlen=max_length,
                               padding=padding_type, truncating=trunc_type)

#Test sentences
testing_sequences= tokenizer.texts_to_sequences(x_test)
testing_padded= pad_sequences(testing_sequences, maxlen=max_length, 
                              padding=padding_type, truncating=trunc_type)

In [49]:
embedding_dim= 16

#Keras model
model= tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 32, 16)            160000    
_________________________________________________________________
global_average_pooling1d_1 ( (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 24)                408       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 25        
Total params: 160,433
Trainable params: 160,433
Non-trainable params: 0
_________________________________________________________________


In [50]:
#Train
num_epochs= 30

history= model.fit(training_padded, training_labels, epochs=num_epochs,
                   validation_data=(testing_padded, testing_labels), verbose=2)

Train on 125 samples, validate on 42 samples
Epoch 1/30
125/125 - 1s - loss: 0.6701 - accuracy: 0.1760 - val_loss: 0.6385 - val_accuracy: 0.1667
Epoch 2/30
125/125 - 0s - loss: 0.6177 - accuracy: 0.1840 - val_loss: 0.5862 - val_accuracy: 0.1667
Epoch 3/30
125/125 - 0s - loss: 0.5572 - accuracy: 0.1840 - val_loss: 0.5291 - val_accuracy: 0.1667
Epoch 4/30
125/125 - 0s - loss: 0.4926 - accuracy: 0.1840 - val_loss: 0.4672 - val_accuracy: 0.1667
Epoch 5/30
125/125 - 0s - loss: 0.4213 - accuracy: 0.1840 - val_loss: 0.4015 - val_accuracy: 0.1667
Epoch 6/30
125/125 - 0s - loss: 0.3442 - accuracy: 0.1840 - val_loss: 0.3315 - val_accuracy: 0.1667
Epoch 7/30
125/125 - 0s - loss: 0.2656 - accuracy: 0.1840 - val_loss: 0.2564 - val_accuracy: 0.1667
Epoch 8/30
125/125 - 0s - loss: 0.1756 - accuracy: 0.1840 - val_loss: 0.1763 - val_accuracy: 0.1667
Epoch 9/30
125/125 - 0s - loss: 0.0782 - accuracy: 0.1840 - val_loss: 0.0906 - val_accuracy: 0.1667
Epoch 10/30
125/125 - 0s - loss: -2.1786e-02 - accuracy

In [51]:
e= model.layers[0]
weights= e.get_weights()[0]
print(weights.shape)

(10000, 16)


In [52]:
rev_vocab= dict([(value,key) for (key,value) in word_index.items()])
print(rev_vocab)






In [54]:
#TSV
out_v= io.open('vecs.tsv', 'w', encoding='utf-8')
out_m= io.open('meta.tsv', 'w', encoding='utf-8')

for word_num in range(1, vocab_size):
    word= rev_vocab[word_num]
    embeddings= weights[word_num]
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")

out_v.close()
out_m.close()