## Imports

In [1]:
import tensorflow as tf
import json
import models
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

In [2]:
# Index of the training sample that is printed for inspection when the data is loaded.
EXAMINE = 1
# Seed for NumPy's global random number generator (only NumPy is seeded here).
SEED = 22
np.random.seed(SEED)

## Load Training Data

In [3]:
def get_gender_as_num(gender):
    """Encode a gender label as an integer.

    Returns 0 when `gender` is exactly the string "male" and 1 for every
    other value.
    """
    return 0 if gender == "male" else 1

In [4]:
def get_age_group(age): # HIGH NOTE: changing each of the scalars to a vector. This is probably not a good idea
    """One-hot encode an age into one of three age groups.

    Buckets, by exclusive upper bound:
      * age < 18  -> [1, 0, 0]
      * age < 28  -> [0, 1, 0]  (accepts 18-27)
      * age < 49  -> [0, 0, 1]  (accepts 28-48)
    Any age of 49 or more yields the all-zero vector [0, 0, 0].

    A new list is returned on every call.
    """
    upper_bounds = (18, 28, 49)
    encoded = [0] * len(upper_bounds)
    for slot, bound in enumerate(upper_bounds):
        if age < bound:
            # Mark only the first bucket whose bound exceeds the age.
            encoded[slot] = 1
            break
    return encoded

In [6]:
blog_posts_data_dir = "data/blogs/json-data/"
train_file_name = "train.json"
test_file_name = "test.json"

# Load data
# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's locale encoding, which can fail or mis-decode non-ASCII posts.
with open(blog_posts_data_dir + train_file_name, encoding = "utf-8") as r:
    training_set = json.load(r)
# Show one raw post before it is replaced by its integer encoding below.
print(training_set[EXAMINE]["post"])

# Map each word to a unique int value
MAX_WORD_COUNT = 20000
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words = MAX_WORD_COUNT)
posts = [instance["post"] for instance in training_set]
tokenizer.fit_on_texts(posts)
sequences = tokenizer.texts_to_sequences(posts)

# Overwrite the raw fields of every instance in place:
#   "post"   -> list of word indices
#   "gender" -> 0 / 1
#   "age"    -> one-hot age-group vector
# `training_set` is reloaded from disk at the top of this cell, so re-running
# the whole cell starts again from the raw values.
for instance, sequence in zip(training_set, sequences):
    instance["post"] = sequence
    instance["gender"] = get_gender_as_num(instance["gender"])
    instance["age"] = get_age_group(int(instance["age"]))
# Same sample as above, now as a sequence of word indices.
print(training_set[EXAMINE]["post"])

# def prepare_sequence(seq, word_to_int):
#     ints = [word_to_int[w] for w in seq]
#     return torch.tensor(ints, dtype = torch.long)


so wuts up? today i had the parade. suked. but it wasnt that bad. im done with band for the year. we had a battle today. we kicked ass.  they had nothing.  then jims party.  then my snotty little cousins bday party. i dun like her. sublime. out.
[18, 33, 97, 2, 43, 1, 3529, 17, 9, 1809, 8, 177, 200, 234, 20, 548, 13, 1, 169, 23, 43, 5, 1776, 97, 23, 2024, 457, 36, 43, 203, 56, 387, 56, 10, 16243, 122, 3032, 4481, 387, 2, 1020, 29, 49, 11763, 32]


In [11]:
# Size of the embedding's input dimension. The tokenizer reserves index 0 and,
# because it was created with num_words = MAX_WORD_COUNT, texts_to_sequences
# only emits indices below MAX_WORD_COUNT; so the largest usable index + 1 is
# whichever is smaller: MAX_WORD_COUNT or the number of distinct words + 1.
vocab_size = min(MAX_WORD_COUNT, len(tokenizer.word_index) + 1)

## Find Key Metrics

In [6]:
# Total number of training instances.
samples_count = len(training_set)

# Length of the age vector of the first instance, i.e. the number of classes.
categories_count = len(training_set[0]["age"])

# Count instances per class, keyed by the position of the first 1 in the age
# vector. Instances whose vector contains no 1 are not counted.
samples_per_class = {0 : 0, 1 : 0, 2 : 0}
for instance in training_set:
    age_vector = instance["age"]
    if 1 in age_vector:
        samples_per_class[age_vector.index(1)] += 1

# Median number of tokens per post.
post_lengths = [len(instance["post"]) for instance in training_set]
median_words_per_sample = np.median(post_lengths)
 

In [7]:
# Label/value pairs reported for the training set, printed one per line.
metrics_report = (
    ("Number of Samples:", samples_count),
    ("Number of Categories:", categories_count),
    ("Samples per Class:", samples_per_class),
    ("Median Words per Sample:", median_words_per_sample),
    ("Samples to Words Per Sample Ratio:", samples_count / median_words_per_sample),
)
for label, value in metrics_report:
    print(label, value)

Number of Samples: 526812
Number of Categories: 3
Samples per Class: {0: 177940, 1: 250672, 2: 98200}
Median Words per Sample: 121.0
Samples to Words Per Sample Ratio: 4353.818181818182


In [8]:
# plt.hist(list(length_distribution.keys()))
# plt.xlabel("Length of a Sample")
# plt.ylabel("Number of samples")
# plt.show()

## Define the Model

[An Introduction to Different Types of Convolutions](https://towardsdatascience.com/types-of-convolutions-in-deep-learning-717013397f4d)

In [None]:
# Define the model
EMBEDDING_DIM = 50
# Starting-point hyperparameters for the layers below; tune as needed.
CONV_FILTERS = 64
CONV_KERNEL_SIZE = 3
DENSE_UNITS = 64

# np.median returns a float (e.g. 121.0); the layer needs an integer length.
sequence_length = int(median_words_per_sample)

# Input, Embedding, Conv, Conv, Conv, Dense, Dense

model = tf.keras.Sequential()
# TODO: Import GloVe embeddings
# TODO: Create validation set
model.add(tf.keras.layers.Embedding(vocab_size, EMBEDDING_DIM, input_length = sequence_length))
# SeparableConv1D requires `filters` and `kernel_size`; "same" padding keeps
# the sequence length unchanged through the three convolutions.
model.add(tf.keras.layers.SeparableConv1D(CONV_FILTERS, CONV_KERNEL_SIZE, activation = "relu", padding = "same"))
model.add(tf.keras.layers.SeparableConv1D(CONV_FILTERS, CONV_KERNEL_SIZE, activation = "relu", padding = "same"))
model.add(tf.keras.layers.SeparableConv1D(CONV_FILTERS, CONV_KERNEL_SIZE, activation = "relu", padding = "same"))
model.add(tf.keras.layers.Flatten())
# Dense requires `units`.
model.add(tf.keras.layers.Dense(DENSE_UNITS, activation = "relu"))
# One output per age class; softmax yields a probability distribution, which
# is what a categorical cross-entropy loss expects.
model.add(tf.keras.layers.Dense(categories_count, activation = "softmax"))


## Train the Model

# Starting-point training settings; tune as needed.
EPOCHS = 5
BATCH_SIZE = 128
VALIDATION_SPLIT = 0.1

# Build fixed-length inputs: every post is padded (with index 0) or truncated
# to the median post length, cast to int because np.median returns a float.
train_sequence_length = int(median_words_per_sample)
train_posts = tf.keras.preprocessing.sequence.pad_sequences(
    [instance["post"] for instance in training_set],
    maxlen = train_sequence_length,
)
# One-hot age-group vectors are the classification targets.
train_ages = np.array([instance["age"] for instance in training_set])

model.compile(optimizer = "rmsprop", loss = "categorical_crossentropy", metrics = ["acc"])
model.summary()
# fit() needs the inputs and targets. Keras takes the validation split from
# the end of the arrays, before any shuffling.
history = model.fit(
    train_posts,
    train_ages,
    epochs = EPOCHS,
    batch_size = BATCH_SIZE,
    validation_split = VALIDATION_SPLIT,
)