## Preprocess Data

In [21]:
import json
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils import data

# Seed PyTorch's default random number generator so that randomly initialised
# tensors created below are reproducible from one run to the next.
torch.manual_seed(22)

<torch._C.Generator at 0x107271f70>

In [22]:
class BlogPostDataset(data.Dataset):
    """Map-style dataset exposing the contents of a single JSON file.

    The file is parsed once in the constructor and kept in memory as
    ``self.json_data``; ``len()`` and indexing are delegated to that parsed
    value.
    """

    def __init__(self, data_root_path, json_file_name):
        """
        Args:
            data_root_path (string): directory where all the data files exist
            json_file_name (string): name of the specific JSON file to be represented by this class
        """
        self.data_root_path = data_root_path
        # os.path.join adds the path separator only when it is missing, so the
        # root directory may be given with or without a trailing slash.
        json_path = os.path.join(self.data_root_path, json_file_name)
        # Read as UTF-8 explicitly instead of relying on the platform's
        # default text encoding.
        with open(json_path, encoding="utf-8") as r:
            self.json_data = json.load(r)

    def __len__(self):
        """Return the number of entries in the parsed JSON value."""
        return len(self.json_data)

    def __getitem__(self, idx):
        """Return the entry stored at ``idx`` in the parsed JSON value."""
        return self.json_data[idx]

In [23]:
# Directory holding the blog-post JSON files (a relative path, so it is
# resolved against the notebook's working directory) and the file names of
# the individual splits.
data_dir = "data/blogs/json-data/"
train_file_name = "train.json"
test_file_name = "test.json"  # defined here but not loaded in this cell

# Parse the training split into memory.
training_set = BlogPostDataset(data_dir, train_file_name)

In [24]:
# Map each word to a unique int value
# Vocabulary: every distinct token receives the next unused integer id, in
# order of first appearance across the training posts. Posts are split on
# single space characters only.
word_to_int = {}
for instance in training_set:
    for word in instance["post"].split(" "):
        # setdefault stores the id only for unseen tokens; len() is evaluated
        # before the insert, so ids run 0, 1, 2, ... without gaps.
        word_to_int.setdefault(word, len(word_to_int))

In [25]:
# Spot-check the vocabulary by printing the integer id assigned to one token.
print(word_to_int["Kulkarni"])

878109


## Word Embeddings

In [28]:
# Demo: a randomly initialised embedding table with one 32-dimensional row per
# vocabulary entry, queried for a single word.
embeddings = nn.Embedding(num_embeddings=len(word_to_int), embedding_dim=32)
lookup_tensor = torch.tensor([word_to_int["yay"]], dtype=torch.long)
embed = embeddings(lookup_tensor)  # one row of 32 values for the one index
print(embed)

tensor([[-1.6563, -0.8292,  0.3843, -1.1768, -0.2160,  1.0301, -1.1762,
          1.7117, -0.0945, -0.5375,  1.5506,  0.9510,  0.6132, -0.3007,
          0.5378,  2.3545, -1.8604, -1.1958, -0.5106,  0.6839,  0.3718,
         -0.6796,  0.7938,  1.3786,  1.5075, -0.1230,  0.0310,  1.6010,
          0.6027, -1.7493,  1.6844, -0.3365]])


## First Model

In [None]:
class BasicLSTMAgeClassifier(nn.Module):
    """Single-layer LSTM over word embeddings followed by a linear scoring layer.

    ``forward`` returns one row of log-probabilities over the age groups for
    every token of the input sequence.

    The recurrent state lives in ``self.hidden`` and is overwritten by every
    ``forward`` call, so it carries over from one call to the next. Assign
    ``model.hidden = model.init_hidden()`` before feeding an unrelated
    sequence to start again from a zero state.
    """

    def __init__(self, embedding_dim, vocab_size, hidden_dim, age_groups_count):
        """
        Args:
            embedding_dim (int): size of each word-embedding vector
            vocab_size (int): number of rows in the embedding table
            hidden_dim (int): size of the LSTM hidden state
            age_groups_count (int): number of output classes
        """
        super().__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # This LSTM takes word embeddings as inputs and outputs hidden states with
        # dimensionality hidden_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        self.hidden2group = nn.Linear(hidden_dim, age_groups_count)
        self.hidden = self.init_hidden()

    def init_hidden(self, batch_size=1):
        """Return a fresh all-zero ``(h_0, c_0)`` state for the LSTM.

        Args:
            batch_size (int): size of the batch axis of the state. The default
                of 1 matches ``forward``, which reshapes its input to a batch
                of one sequence.

        Returns:
            tuple: two zero tensors, each of shape
            ``(num_layers, batch_size, hidden_dim)``.
        """
        # Since there is no hidden state yet at the beginning, we start off with a zero-tensor
        # for the state
        # Semantics of the axes are (num_layers, batch_size, hidden_dim)
        state_shape = (self.lstm.num_layers, batch_size, self.hidden_dim)
        # new_zeros copies dtype and device from the embedding weights, so a
        # state created after moving the model matches its parameters.
        reference = self.word_embeddings.weight
        return (reference.new_zeros(state_shape),
                reference.new_zeros(state_shape))

    def forward(self, sentence):
        """Score every token of ``sentence`` against the age groups.

        Args:
            sentence (torch.Tensor): 1-D tensor of word indices.

        Returns:
            torch.Tensor: log-probabilities of shape
            ``(len(sentence), age_groups_count)``.
        """
        seq_len = len(sentence)
        embeds = self.word_embeddings(sentence)
        # The LSTM expects (seq_len, batch, features); the sentence is fed as
        # a batch containing a single sequence.
        lstm_out, self.hidden = self.lstm(embeds.view(seq_len, 1, -1), self.hidden)
        group_space = self.hidden2group(lstm_out.view(seq_len, -1))
        group_scores = F.log_softmax(group_space, dim=1)
        return group_scores
        
        
        
        
        
        
        
        
        