## Imports

In [1]:
import tensorflow as tf
import json
import models
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

In [2]:
EXAMINE = 21
SEED = 22
np.random.seed(SEED)

## Load Training Data

In [3]:
def get_gender_as_num(gender):
    if gender == "male":
        return 0
    else:
        return 1

In [4]:
def get_age_group(age): # HIGH NOTE: changing each of the scalars to a vector. This is probably not a good idea
    if age < 18:
        # 13 - 17
        return [1, 0, 0]
    elif age < 28:
        # 23 - 27
        return [0, 1, 0]
    elif age < 49:
        # 33 - 48
        return [0, 0, 1]
    else:
        return [0, 0, 0]

In [5]:
blog_posts_data_dir = "data/blogs/json-data/"
train_file_name = "train.json"
test_file_name = "test.json"

# Load data
with open(blog_posts_data_dir + train_file_name) as r:
    training_set = json.load(r)
    print(training_set[EXAMINE]["post"])

median_words_per_sample = np.median([len(instance["post"]) for instance in training_set])

# Map each word to a unique int value
MAX_WORD_COUNT = 20000
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words = MAX_WORD_COUNT)
posts = [instance["post"] for instance in training_set]
tokenizer.fit_on_texts(posts)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(posts)
median_words_per_tokenized_sample = np.median([len(post) for post in sequences])
data = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen = int(median_words_per_tokenized_sample),
                                                     padding = "post")
for i, instance in enumerate(training_set):
    instance["post"] = data[i]
    instance["gender"] = get_gender_as_num(instance["gender"])
    instance["age"] = get_age_group(int(instance["age"]))
print(training_set[EXAMINE]["post"])


The kids had a great time at the pool yesterday afternoon splashing around and diving for pennies.  While we were there, a reporter came around from the local paper and was snapping pictures of the kids and taking names.  He took special interest in the fact it was my son's birthday and took some extra pictures of him and his brother.  So, I guess I need to check out this weekend's edition of the paper to see if the kids pictures show up.  The children would get a real kick out of it to see their pictures in the newspaper.    My son also got to take treats to his church group last night and they all sang "Happy Birthday" to him - he loved the attention!  The leaders at that group are amazing!  The staff members that are men are wonderful role models for both the children.  Until we meet again......
[  255   679  1211     7     1   323     9    14    10 10313   533     4
   255    55  1119   604     6    79     4    58   608    18     2   211
     2   140     3   434    32    19 15890  

## Find Key Metrics

In [6]:
samples_count = len(training_set)

categories_count = len(training_set[0]["age"])

samples_per_class = {0 : 0, 1 : 0, 2 : 0}
for instance in training_set:
    for i, a in enumerate(instance["age"]):
        if a == 1:
            samples_per_class[i] += 1
            break
 

In [7]:
print("Number of Samples:", samples_count)
print("Number of Categories:", categories_count)
print("Samples per Class:", samples_per_class)
print("Median Words per Sample:", median_words_per_sample)
print("Median Words per Tokenized Sample:", median_words_per_tokenized_sample)
print("Samples to Words Per Sample Ratio:", samples_count / median_words_per_sample)

Number of Samples: 526812
Number of Categories: 3
Samples per Class: {0: 177940, 1: 250672, 2: 98200}
Median Words per Sample: 621.0
Median Words per Tokenized Sample: 110.0
Samples to Words Per Sample Ratio: 848.328502415459


In [8]:
# plt.hist(list(length_distribution.keys()))
# plt.xlabel("Length of a Sample")
# plt.ylabel("Number of samples")
# plt.show()

## Import Pretrained Embeddings

In [9]:
EMBEDDING_DIM = 50

glove_path = "data/embeddings/glove.6B/"
glove_dict = {}
with open(glove_path + "glove.6B.50d.txt") as f:
    for line in f:
        line_values = line.split(" ")
        word = line_values[0]
        embedding_coefficients = np.asarray(line_values[1 : ], dtype = "float32")
        glove_dict[word] = embedding_coefficients

glove_weights = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    glove_vector = glove_dict.get(word)
    if glove_vector is not None:
        glove_weights[i] = glove_vector

## Define the Model

[An Introduction to Different Types of Convolutions](https://towardsdatascience.com/types-of-convolutions-in-deep-learning-717013397f4d)

In [50]:
# Define the model
# Input, Embedding, Conv, Pool, Conv, Pool, Flatten, Dense, Dense

model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(len(word_index) + 1, EMBEDDING_DIM, weights = [glove_weights],
                                    input_length = median_words_per_sample, trainable = True))
model.add(tf.keras.layers.SeparableConv1D(50, 5, activation = "relu"))
model.add(tf.keras.layers.MaxPooling1D())
model.add(tf.keras.layers.SeparableConv1D(100, 3, activation = "relu"))
model.add(tf.keras.layers.MaxPooling1D())
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(24, activation = "sigmoid"))
model.add(tf.keras.layers.Dense(3, activation = "softmax"))


## Train the Model

In [46]:
posts_train = [instance["post"] for instance in training_set]
ages_train = [instance["age"] for instance in training_set]

In [47]:
model.compile(optimizer = "rmsprop", loss = "categorical_crossentropy", metrics = ["acc"])
model.summary()
history = model.fit(np.array(posts_train[:1000]), np.array(ages_train[:1000]), epochs = 10, batch_size = 500, validation_split = 0.2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 621, 50)           38312400  
_________________________________________________________________
separable_conv1d_5 (Separabl (None, 617, 50)           2800      
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 308, 50)           0         
_________________________________________________________________
separable_conv1d_6 (Separabl (None, 306, 100)          5250      
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 153, 100)          0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 15300)             0         
_________________________________________________________________
dense_4 (Dense)              (None, 24)                367224    
__________

ValueError: Error when checking input: expected embedding_3_input to have shape (621,) but got array with shape (110,)