In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras import preprocessing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
import json
import random
import os

# Mount Google Drive (Colab only) so that the training data, the word index and
# the saved models under /content/gdrive can be read and written.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


### Functions

In [None]:
############ Get Data ##########
# TODO: review encoding
# NOTE: f.read() reads as chars, f.readlines() makes a list separated by newlines
def get_raw_data(file_path):
    """Read a UTF-8 text file and return its contents as a list of lines.

    Each element keeps its line ending (if it has one), exactly as
    ``readlines()`` produces it.
    """
    with open(file_path, mode="r", encoding="utf-8") as source:
        return source.readlines()

In [None]:
def create_index(texts, filename):
    """Build a word -> integer index from a corpus and save it as JSON.

    Args:
        texts: The corpus, either as one string or as an iterable of strings
            (for example the list of lines returned by ``get_raw_data``).
        filename: Path of the JSON file the index is written to.

    The index is the ``word_index`` of a ``Tokenizer`` fitted on the
    whitespace-separated words of the corpus. Nothing is returned; the number
    of unique words is printed.
    """
    # A list of lines has no .split(), so collapse it into one string first.
    if not isinstance(texts, str):
        texts = " ".join(texts)
    words = texts.split()

    tokenizer = Tokenizer(num_words=1000000)
    tokenizer.fit_on_texts(words)
    word_index = tokenizer.word_index
    print(f"Found {len(word_index)} unique words.")

    with open(filename, "w") as f:
        json.dump(word_index, f, indent=4)

In [None]:
def get_index(filename):
    """Load a previously saved JSON word index and return the parsed object."""
    with open(filename, "r") as handle:
        return json.load(handle)

In [None]:
def create_sents(text):
    """Split a corpus into sentences, one sentence per line.

    Args:
        text: Either a single string, which is split on newline characters,
            or an iterable of line strings (such as the list produced by
            ``readlines()``), from which the trailing newline of every
            element is removed.

    Returns:
        A list of strings, one per line. For string input a trailing newline
        yields a final empty string, as ``str.split`` does.
    """
    if isinstance(text, str):
        return text.split("\n")
    # Already split into lines: only the line terminators need to go.
    return [line.rstrip("\n") for line in text]

In [None]:
def padding_data(sentences, index, maxlen=280):
    """Convert string sentences into fixed-length sequences of word ids.

    Args:
        sentences: Iterable of sentence strings.
        index: Mapping from word to integer id (the saved word index).
        maxlen: Length every sequence is padded or truncated to.

    Returns:
        A list with one 1-D integer array per sentence. Shorter sentences are
        padded at the end ("post"); the standard behaviour pads at the front.

    All sentences are padded in a single ``pad_sequences`` call, so if
    ``maxlen`` is passed as ``None`` every row gets the length of the longest
    sentence.
    """
    encoded = []
    for sentence in sentences:
        ids = []
        for word in text_to_word_sequence(sentence):
            try:
                ids.append(index[word])
            except KeyError:
                # Out-of-vocabulary word: the model still needs a number for
                # it, so it is mapped to 0. Only a missing key is handled
                # here; any other error is allowed to surface.
                ids.append(0)
        encoded.append(ids)

    if not encoded:
        return []

    padded = preprocessing.sequence.pad_sequences(encoded, maxlen=maxlen, padding="post")
    return list(padded)

In [None]:
def reverse_index(word_index):
    """Invert a word -> id mapping into an id -> word mapping."""
    ids = word_index.values()
    words = word_index.keys()
    return dict(zip(ids, words))

def reconst_text(text, reverse_word_index):
    """Turn a sequence of word ids back into a space-separated string.

    Ids that are missing from ``reverse_word_index`` (such as the 0 that
    ``padding_data`` substitutes for unknown words) are rendered as "?".
    """
    words = (reverse_word_index.get(token_id, "?") for token_id in text)
    return " ".join(words)

In [None]:
################# Labelling the Data ###############

def label_data(sentences, label):
    """Pair every sentence with the same label.

    Returns a list of ``(sentence, label)`` tuples in the original order.
    """
    return [(item, label) for item in sentences]

In [None]:
########## Create Training Data ###########
def create_training(total_chunks, cutoff):
    """Shuffle labelled samples and split them into training and testing sets.

    Args:
        total_chunks: Sequence of ``(data, label)`` pairs.
        cutoff: Fraction between 0 and 1 of the samples that goes into the
            training set; the remainder becomes the testing set.

    Returns:
        ``(training_data, training_labels, testing_data, testing_labels)``,
        each converted to a NumPy array.

    The input sequence itself is left untouched; a copy is shuffled.
    """
    # Shuffle a copy so the caller's list keeps its order.
    shuffled = list(total_chunks)
    random.shuffle(shuffled)

    # Number of training samples. Rounding avoids float artefacts in
    # len * cutoff, and the clamp keeps the split inside the list.
    train_count = int(round(len(shuffled) * cutoff))
    train_count = max(0, min(len(shuffled), train_count))

    train_part = shuffled[:train_count]
    test_part = shuffled[train_count:]

    training_data = np.array([entry[0] for entry in train_part])
    training_labels = np.array([entry[1] for entry in train_part])
    testing_data = np.array([entry[0] for entry in test_part])
    testing_labels = np.array([entry[1] for entry in test_part])

    return training_data, training_labels, testing_data, testing_labels

In [None]:
################# Creating Model ####################
def create_model(vocab_size=1000000, embedding_dim=25):
    """Build and compile the binary text classifier.

    Args:
        vocab_size: Size of the embedding table, i.e. how many distinct word
            ids the Embedding layer can look up. The default matches the
            ``num_words`` limit used in ``create_index``.
        embedding_dim: Length of the vector learned for each word id.

    Returns:
        A compiled ``keras.Sequential`` model. Its summary is printed as a
        side effect.
    """
    model = keras.Sequential()  # plain stack of layers, applied in order

    # Maps every word id of the padded input to a trainable vector.
    model.add(keras.layers.Embedding(vocab_size, embedding_dim))
    # Averages the word vectors over the sequence axis, giving one
    # fixed-size vector per sentence.
    model.add(keras.layers.GlobalAveragePooling1D())
    # Two small fully connected layers; the first argument is the number of
    # units, the activation is the non-linearity applied to the output.
    model.add(keras.layers.Dense(16, activation="relu"))
    model.add(keras.layers.Dense(16, activation="tanh"))
    # Single sigmoid unit: a value between 0 and 1 for the binary label.
    model.add(keras.layers.Dense(1, activation="sigmoid"))

    model.summary()
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

    return model

In [None]:
################# Train Model #######################
def train_model(model, tt_data, val_size=.3, epochs=1, batch_size=16):
    """Fit ``model`` on the training split and evaluate it on the test split.

    Args:
        model: Object exposing ``fit`` and ``evaluate``.
        tt_data: Indexable holding, in order, training data, training labels,
            testing data and testing labels.
        val_size: Fraction of the training samples, taken from the front,
            that is held out as validation data during fitting.
        epochs: Number of passes over the training data.
        batch_size: Number of samples per gradient update.

    Returns:
        The same ``model`` object after training.
    """
    features, labels = tt_data[0], tt_data[1]
    holdout_features, holdout_labels = tt_data[2], tt_data[3]

    # The first n_val samples validate; everything after them trains.
    n_val = int(len(features) * val_size)

    model.fit(
        features[n_val:],
        labels[n_val:],
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(features[:n_val], labels[:n_val]),
        verbose=1,
        shuffle=True,
    )
    model.evaluate(holdout_features, holdout_labels)

    return model

### Main

To improve model: 

*   Set tm_30_2 epochs to 20 to prevent overfitting
*   Compare the data rejected by tm30 and tm302

In [None]:
# Fix the random seeds so that the shuffle in create_training and the model's
# weight initialisation repeat on every "Restart & Run All".
SEED = 42
random.seed(SEED)
tf.random.set_seed(SEED)

gib_data_path = "/content/gdrive/MyDrive/Colab Notebooks/training/gib_master"
gold_data_path = "/content/gdrive/MyDrive/Colab Notebooks/training/gold_master"

# get_raw_data returns each file as a list of lines (newlines kept).
gib_data = get_raw_data(gib_data_path)
gold_data = get_raw_data(gold_data_path)

# One-off step: uncomment to (re)build the word index from the gold corpus.
# The lines are joined into one string, which create_index can split into words.
#create_index("".join(gold_data), "/content/gdrive/MyDrive/Colab Notebooks/word_index")

word_index = get_index("/content/gdrive/MyDrive/Colab Notebooks/word_index")
reverse_word_index = reverse_index(word_index)

# Re-join the lines into the original file text so create_sents can split it
# on newlines; passing the list itself would fail on .split().
gold_sents = create_sents("".join(gold_data))
gib_sents = create_sents("".join(gib_data))

gold_padded = padding_data(gold_sents, word_index, maxlen=280)
gib_padded = padding_data(gib_sents, word_index, maxlen=280)

# Sanity check: first encoded sentence of each corpus.
print(gold_padded[0])
print(gib_padded[0])

# Label convention: gold text = 0, gib text = 1.
gold_labeled = label_data(gold_padded, 0)
gib_labeled = label_data(gib_padded, 1)

print(gold_labeled[0])
print(gib_labeled[0])

all_data = gold_labeled + gib_labeled
tt_data = create_training(all_data, cutoff=.9)

model = create_model()

model = train_model(model, tt_data=tt_data, epochs=20, batch_size=16)

model.save("/content/gdrive/MyDrive/Colab Notebooks/models/token_model_20")

[ 1087  7558     1   246   338 10717     1   628     2     1   686    19
 10718   230     6  1088     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0   