In [1]:
# Blake Myers and Nicolas Stencel, Part 2 of the final project for CSCE 5210 
# Creating a simple neural network to extract info about our text

In [2]:
import stanza_nlp as nlp
import csv
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from collections import defaultdict, Counter
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import load_model

In [3]:
def tsv2mat(fname) :
  with open(fname) as f:
     wss = csv.reader(f, delimiter='\t')
     return list(wss)

class Data :
  def __init__(self,fname='texts/english') :
    wss = tsv2mat("out/"+fname+".tsv")
    self.sents=tsv2mat("out/"+fname+"_sents.tsv")
    occs=defaultdict(set)
    sids=set()
    for f,r,t,id in wss:
      id=int(id)
      occs[(f,r,t)].add(id)
      sids.add(id)
    self.occs=occs

    X,Y=list(zip(*list(occs.items())))
    X = np.array(X)
    y0 = np.array(sorted(map(lambda x:[x],sids)))

    enc_X = OneHotEncoder(handle_unknown='ignore')
    enc_y = OneHotEncoder(handle_unknown='ignore')
    enc_X.fit(X)
    enc_y.fit(y0)
    hot_X = enc_X.transform(X).toarray()
    self.enc_X = enc_X
    self.enc_y = enc_y
    self.X=X
    ms=[]
    for ys in Y :
      m = np.array([[0]],dtype=np.float32)
      for v in ys :
        m0=enc_y.transform(np.array([[v]])).toarray()
        m = np.logical_or(m,m0)
        m=np.array(np.logical_or(m,m0),dtype=np.float32)
      ms.append(m[0])
    hot_y=np.array(ms)

    self.hot_X=hot_X
    self.hot_y =hot_y

    print('\nFINAL DTATA SHAPES','X',hot_X.shape,'y',hot_y.shape,'\n')

class Query(Data) :
  def __init__(self,fname='texts/english'):
    super().__init__(fname=fname)
    self.nlp_engine=nlp.NLP()

  def ask(self,text=None):
    if not text: text = input("Query:")
    else: print("Query:",text)

    self.nlp_engine.from_text(text)
    sids=[]
    for f,r,t,_ in self.nlp_engine.facts() :
      sids.extend(self.occs.get((f,r,t),[]))
    self.show_answers(sids)

  def show_answers(self, sids, k=3):
    c = Counter(sids)
    print('\nHIT COUNTS:',c,"\n")
    best = c.most_common(k)
    for sid, _ in best:
      id, sent = self.sents[sid]
      print(id, ':', sent)
    print("")

class Inferencer(Query) :
  def __init__(self,fname='texts/english'):
    super().__init__(fname=fname)
    self.model = load_model(fname+"_model")

  def query(self,text=None):
    if not text: text = input("Query:")
    else: print("Query:", text)
    self.nlp_engine.from_text(text)
    X=[]
    for f, r, t, _ in self.nlp_engine.facts():
      X.append([f,r,t])
    X = np.array(X)
    hot_X = self.enc_X.transform(X).toarray()
    y=self.model.predict(hot_X)
    m=self.enc_y.inverse_transform(y)
    sids=m.flatten().tolist()
    self.show_answers(sids)

class Trainer(Data) :
  def __init__(self,fname='texts/english'):
    super().__init__(fname=fname)
    model = keras.Sequential()
    model.add(layers.Dense(128, input_dim=self.hot_X.shape[1], activation='relu'))
    model.add(layers.Dense(self.hot_y.shape[1], activation='sigmoid'))
    model.summary()
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    history = model.fit(self.hot_X, self.hot_y, epochs=100, batch_size=16)

    model.save(fname+"_model")

    loss, accuracy = model.evaluate(self.hot_X, self.hot_y)
    print('Accuracy:', round(100 * accuracy, 2), ', % Loss:', round(100 * loss, 2), '%')


In [4]:
def qtest():
    q=Query()
    q.ask(text="What do I fear above all things?")
    q.ask(text="What celebrated my arrival?")
    q.ask(text="At what time of day was I born?")
    q.ask(text="What has no further use for me and is running out?")
    q.ask(text="Which doctor had a nursing home I was born in?")
    q.ask(text="Who prophesied me?")
    q.ask(text="What type of home was I born in?")
    q.ask(text="Who broke his toe?")
    q.ask(text="What city was I born in?")
    q.ask(text="What is crumbling and overused?")
    
def dtest():
    d=Data()
    print("X",d.hot_X.shape)
    print(d.hot_X)
    print("y",d.hot_y.shape)
    print(d.hot_y)

def dtests():
    dtest('out/texts/english.tsv')

def ntest() : 
    t=Trainer()
    i=Inferencer()
    print("\n\n")
    print("ALGORITHMICALLY DERIVED ANSWERS:\n")
    i.ask(text="What do I fear above all things?")
    i.ask(text="What celebrated my arrival?")
    i.ask(text="At what time of day was I born?")
    i.ask(text="What has no further use for me and is running out?")
    i.ask(text="Which doctor had a nursing home I was born in?")
    i.ask(text="Who prophesied me?")
    i.ask(text="What type of home was I born in?")
    i.ask(text="Who broke his toe?")
    i.ask(text="What city was I born in?")
    i.ask(text="What is crumbling and overused?")
    
    print("\n")
    print("NEURAL NET'S ANSWERS:\n")
    i.query(text="What do I fear above all things?")
    i.query(text="What celebrated my arrival?")
    i.query(text="At what time of day was I born?")
    i.query(text="What has no further use for me and is running out?")
    i.query(text="Which doctor had a nursing home I was born in?")
    i.query(text="Who prophesied me?")
    i.query(text="What type of home was I born in?")
    i.query(text="Who broke his toe?")
    i.query(text="What city was I born in?")
    i.query(text="What is crumbling and overused?")
    
if __name__=="__main__" :
  ntest()


FINAL DTATA SHAPES X (423, 415) y (423, 25) 

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               53248     
_________________________________________________________________
dense_1 (Dense)              (None, 25)                3225      
Total params: 56,473
Trainable params: 56,473
Non-trainable params: 0
_________________________________________________________________
Train on 423 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100


Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: texts/english_model/assets


Accuracy: 98.82 , % Loss: 3.18 %

FINAL DTATA SHAPES X (423, 415) y (423, 25) 




ALGORITHMICALLY DERIVED ANSWERS:

Query: What do I fear above all things?

HIT COUNTS: Counter({24: 4}) 

24 : I admit it: above all things, I fear absurdity.

Query: What celebrated my arrival?

HIT COUNTS: Counter({14: 2}) 

14 : Soothsayers had prophesied me, newspapers celebrated my arrival, politicos ratified my authenticity.

Query: At what time of day was I born?

HIT COUNTS: Counter({0: 2, 2: 2}) 

0 : I was born in the city of Bombay … once upon a time.
2 : I was born in Doctor Narlikar’s Nursing Home on August 15th, 1947.

Query: What has no further use for me and is running out?

HIT COUNTS: Counter({18: 7}) 

18 : Now, however, time (having no further use for me) is running out.

Query: Which doctor had a nursing home I was born in?

HIT COUNTS: Counter({0: 2, 2: 2}) 

0 : I was born in the city of Bombay … once upon a time.
2 : I was born in Doctor Narlikar’s Nursing Home on August 15th, 194