## Imports

In [2]:
import tensorflow as tf
import json
import models
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

In [3]:
EXAMINE = 21
SEED = 22
np.random.seed(SEED)

## Load and Preprocess Training Data

In [33]:
def get_gender_as_num(gender):
    if gender == "male":
        return 0
    else:
        return 1

In [34]:
def get_age_group(age): # HIGH NOTE: changing each of the scalars to a vector. This is probably not a good idea
    if age < 18:
        # 13 - 17
        return [1, 0, 0]
    elif age < 28:
        # 23 - 27
        return [0, 1, 0]
    elif age < 49:
        # 33 - 48
        return [0, 0, 1]
    else:
        return [0, 0, 0]

In [35]:
blog_posts_data_dir = "data/blogs/json-data/"
train_file_name = "train.json"
test_file_name = "test.json"

# Load data
with open(blog_posts_data_dir + train_file_name) as r:
    training_set = json.load(r)
raw_posts = [instance["post"] for instance in training_set]

In [36]:
print(raw_posts[EXAMINE])

The kids had a great time at the pool yesterday afternoon splashing around and diving for pennies.  While we were there, a reporter came around from the local paper and was snapping pictures of the kids and taking names.  He took special interest in the fact it was my son's birthday and took some extra pictures of him and his brother.  So, I guess I need to check out this weekend's edition of the paper to see if the kids pictures show up.  The children would get a real kick out of it to see their pictures in the newspaper.    My son also got to take treats to his church group last night and they all sang "Happy Birthday" to him - he loved the attention!  The leaders at that group are amazing!  The staff members that are men are wonderful role models for both the children.  Until we meet again......


In [37]:
median_words_per_sample = np.median([len(instance["post"]) for instance in training_set])

# Map each word to a unique int value
MAX_WORD_COUNT = 20000
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words = MAX_WORD_COUNT)
posts = [instance["post"] for instance in training_set]
tokenizer.fit_on_texts(posts)
word_index = dict(list(tokenizer.word_index.items())[:20000])
sequences = tokenizer.texts_to_sequences(posts)
median_words_per_tokenized_sample = np.median([len(post) for post in sequences])
data = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen = int(median_words_per_tokenized_sample),
                                                     padding = "post")
for i, instance in enumerate(training_set):
    instance["post"] = data[i]
    instance["gender"] = get_gender_as_num(instance["gender"])
    instance["age"] = get_age_group(int(instance["age"]))


In [38]:
print(training_set[EXAMINE]["post"])
print(training_set[EXAMINE]["age"])

[  255   679  1211     7     1   323     9    14    10 10313   533     4
   255    55  1119   604     6    79     4    58   608    18     2   211
     2   140     3   434    32    19 15890  3749     6     1   671     3
    85    40     1   397   604   262    33     1   605    67    44     5
   311  1251    32     6     9     3    85    91   604     7     1  2458
    10   983   129    77     3   139  5998     3    58   545   480   102
   125     4    36    26  2704   225   533     3    79    27   708     1
   860     1  2608    24     8   480    30   653     1  1744  1148     8
    30   499    30   741  1638  5324    13   282     1   605   227    23
   554   130]
[0, 0, 1]


In [44]:
print(list(word_index.items())[ : 100])

[('the', 1), ('i', 2), ('to', 3), ('and', 4), ('a', 5), ('of', 6), ('in', 7), ('that', 8), ('it', 9), ('my', 10), ('is', 11), ('you', 12), ('for', 13), ('was', 14), ('on', 15), ('me', 16), ('but', 17), ('so', 18), ('this', 19), ('with', 20), ('have', 21), ('be', 22), ('we', 23), ('at', 24), ('not', 25), ('all', 26), ('he', 27), ('as', 28), ('like', 29), ('are', 30), ('just', 31), ('out', 32), ('up', 33), ('about', 34), ("i'm", 35), ('they', 36), ('what', 37), ('or', 38), ('one', 39), ('if', 40), ('from', 41), ('do', 42), ('had', 43), ('get', 44), ('when', 45), ('urllink', 46), ('will', 47), ('there', 48), ('her', 49), ('she', 50), ('time', 51), ('know', 52), ('now', 53), ('can', 54), ('some', 55), ('then', 56), ('by', 57), ('his', 58), ("it's", 59), ('really', 60), ('no', 61), ('an', 62), ('your', 63), ('go', 64), ('more', 65), ('am', 66), ('would', 67), ('think', 68), ("don't", 69), ('well', 70), ('who', 71), ('people', 72), ('good', 73), ('been', 74), ('has', 75), ('how', 76), ('got'

## Find Key Metrics

In [45]:
samples_count = len(training_set)

categories_count = len(training_set[0]["age"])

samples_per_class = {0 : 0, 1 : 0, 2 : 0}
for instance in training_set:
    for i, a in enumerate(instance["age"]):
        if a == 1:
            samples_per_class[i] += 1
            break
 

In [69]:
print("Number of Samples:", samples_count)
print("Number of Categories:", categories_count)
print("Samples per Class:", samples_per_class)
print("Median Words per Sample:", median_words_per_sample)
print("Median Words per Tokenized Sample:", median_words_per_tokenized_sample)
print("Samples to Words Per Sample Ratio:", samples_count / median_words_per_tokenized_sample)

Number of Samples: 526812
Number of Categories: 3
Samples per Class: {0: 177940, 1: 250672, 2: 98200}
Median Words per Sample: 621.0
Median Words per Tokenized Sample: 110.0
Samples to Words Per Sample Ratio: 4789.2


In [47]:
# plt.hist(list(length_distribution.keys()))
# plt.xlabel("Length of a Sample")
# plt.ylabel("Number of samples")
# plt.show()

## Import Pretrained Embeddings

In [48]:
EMBEDDING_DIM = 50

glove_path = "data/embeddings/glove.6B/"
glove_dict = {}
with open(glove_path + "glove.6B.50d.txt") as f:
    for line in f:
        line_values = line.split(" ")
        word = line_values[0]
        embedding_coefficients = np.asarray(line_values[1 : ], dtype = "float32")
        glove_dict[word] = embedding_coefficients

glove_weights = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    glove_vector = glove_dict.get(word)
    if glove_vector is not None:
        glove_weights[i] = glove_vector

In [53]:
print(len(glove_weights))

20001


## Define the Model

[An Introduction to Different Types of Convolutions](https://towardsdatascience.com/types-of-convolutions-in-deep-learning-717013397f4d)

In [54]:
# Define the model
# Input, Embedding, Conv, Pool, Conv, Pool, Flatten, Dense, Dense
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(len(word_index) + 1, EMBEDDING_DIM, weights = [glove_weights],
                                    input_length = median_words_per_tokenized_sample, trainable = True))
model.add(tf.keras.layers.SeparableConv1D(50, 5, activation = "relu"))
model.add(tf.keras.layers.MaxPooling1D())
model.add(tf.keras.layers.SeparableConv1D(100, 3, activation = "relu"))
model.add(tf.keras.layers.MaxPooling1D())
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(24, activation = "sigmoid"))
model.add(tf.keras.layers.Dense(3, activation = "softmax"))


## Train the Model

In [65]:
posts_train = np.array([instance["post"] for instance in training_set])
ages_train = np.array([instance["age"] for instance in training_set])

In [66]:
model.compile(optimizer = "rmsprop", loss = "categorical_crossentropy", metrics = ["acc"])
model.summary()
history = model.fit(posts_train, ages_train, epochs = 10, batch_size = 500, validation_split = 0.2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 110, 50)           1000050   
_________________________________________________________________
separable_conv1d_4 (Separabl (None, 106, 50)           2800      
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 53, 50)            0         
_________________________________________________________________
separable_conv1d_5 (Separabl (None, 51, 100)           5250      
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 25, 100)           0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 2500)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 24)                60024     
__________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 421449 samples, validate on 105363 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Save Model

In [67]:
models_dir = "models/"

In [68]:
model.save(models_dir + "2sep-conv_2dense.h5")

## Test Model

In [70]:
# Load data
with open(blog_posts_data_dir + test_file_name) as r:
    test_set = json.load(r)

In [71]:
test_posts = [instance["post"] for instance in test_set]
test_sequences = tokenizer.texts_to_sequences(test_posts)
test_post_data = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, maxlen = int(median_words_per_tokenized_sample),
                                                     padding = "post")
for i, instance in enumerate(test_set):
    instance["post"] = test_post_data[i]
    instance["gender"] = get_gender_as_num(instance["gender"])
    instance["age"] = get_age_group(int(instance["age"]))

In [72]:
posts_test = np.array([instance["post"] for instance in test_set])
ages_test = np.array([instance["age"] for instance in test_set])

In [73]:
model.evaluate(posts_test, ages_test)



[0.7650201190724795, 0.6507368852632148]