In [None]:
!pip install -r ml_requirements.txt

In [None]:
# Download the small English spaCy pipeline; it is loaded further down with
# spacy.load("en_core_web_sm").
!python -m spacy download en_core_web_sm

# Set Up Environment

In [1]:
%load_ext autoreload
%autoreload 2

from collections import defaultdict
import itertools
import numpy as np
import pandas as pd
from pathlib import Path
from scipy.stats import norm
import spacy
import string

from gensim.models import Word2Vec
import tqdm

import torch
import torch.nn as nn
import torch.optim as opt
import torch.utils.data as data
from torch.utils.tensorboard import SummaryWriter

In [2]:
# Use GOOGLE_APPLICATION_CREDENTIALS env variable to reference the service account key.
# setdefault keeps a value that is already present in the environment, so the
# key file named here is only a fallback and never overrides credentials the
# user has configured outside the notebook.
import os
os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", "Scoreboard-ML-be7ebac08a24.json")

from google.cloud import bigquery

In [3]:
# Load the small English spaCy pipeline; it is used below to tokenize and
# lemmatize every comment body.
nlp = spacy.load("en_core_web_sm")

In [4]:
# Seed both RNG sources the notebook draws from. NumPy's global RNG feeds the
# np.random.uniform call used for the filler embedding; torch's RNG drives
# torch-side randomness such as the shuffled training DataLoader.
SEED = 100
torch.manual_seed(SEED)
np.random.seed(SEED)

# Create the Dataset

Skip this section if `parsed_dataset.tsv` has already been created on disk; the later sections load the dataset from that file.

In [5]:
# Connect to BigQuery to run the query
# (the client is built with no arguments, so it relies on ambient
# credentials/configuration such as GOOGLE_APPLICATION_CREDENTIALS).
bq_client=bigquery.Client()

# Download query results.
# The SQL selects every column of the `2016_12` reddit comments table for the
# 'cars' subreddit, excluding rows whose body is exactly '[deleted]' or
# '[removed]'.
query_string = """
SELECT *
FROM `fh-bigquery.reddit_comments.2016_12`
WHERE body not in ('[deleted]','[removed]')
AND subreddit='cars'
"""

In [6]:
# Submit the SQL to BigQuery; the rows are fetched in a later cell through
# query_job.result().
query_job=bq_client.query(query_string) # API Request

In [7]:
# Old Buckets
# (kept for reference only: both assignments below are commented out, so this
# cell executes nothing)
#custom_buckets = [-130,0,1,10,50,1500]
#labels=['Downvoted','No Votes','2-10 Votes','11-50 Votes','50+ Votes']

In [8]:
# Call the to_dataframe method on the reader to write the entire stream to a pandas DataFrame
rows_df=query_job.result().to_dataframe() # Waits for query to finish
# Work on a deep copy so rows_df keeps the untouched query result.
new_df=rows_df.copy(deep=True)

In [9]:
# Preview the first rows of the downloaded comments.
new_df.head()

Unnamed: 0,body,score_hidden,archived,name,author,author_flair_text,downs,created_utc,subreddit_id,link_id,parent_id,score,retrieved_on,controversiality,gilded,id,subreddit,ups,distinguished,author_flair_css_class
0,"Hey Matt, I didn't get a chance to read the ar...",,,,gropingpriest,'09 Cobalt SS t/c,,1482193840,t5_2qhl2,t3_5j8dou,t3_5j8dou,-4,1483898510,1,0,dbeiwkd,cars,,,
1,"What, did it fall down behind the washing mach...",,,,ArmourChinker,Commodore ����,,1481770677,t5_2qhl2,t3_5ies4g,t3_5ies4g,-9,1483775228,0,0,db7ofxv,cars,,,
2,Says the guy with the rwd subaru.,,,,2az-fe,"13 BMW 535xi, 13 Toyota Highlander, 04 Toyota ...",,1482648961,t5_2qhl2,t3_5k52fn,t1_dblr1ml,25,1484029141,0,0,dblvijb,cars,,,
3,looks about right to me [red 570s](http://www....,,,,jebusv20,VW Polo GTi 1.4TSI,,1481520805,t5_2qhl2,t3_5huw99,t3_5huw99,75,1483664660,0,0,db372d8,cars,,,
4,What do you think the GT is going to be worth ...,,,,Spicy_Curry,2016 340xi ; 2017 GTI,,1482389580,t5_2qhl2,t3_5jova0,t3_5jova0,66,1483958547,0,0,dbhwl61,cars,,,


In [10]:
# Lemmatize every comment body and keep the lower-cased lemmas as a single
# '|'-delimited string per comment.
# nlp.pipe streams the texts through the pipeline in batches; it yields the
# same Doc objects as calling nlp(body) once per comment, without paying the
# per-call overhead on every one of the ~108k comments.
# NOTE(review): a comment that contains a literal '|' token will collide with
# the delimiter when lemma_array is split again downstream - confirm whether
# that matters for this dataset.
bodies = new_df.body.tolist()
features = [
    '|'.join(tok.lemma_.lower() for tok in doc)
    for doc in tqdm.tqdm(nlp.pipe(bodies), total=len(bodies))
]
len(features)

100%|██████████| 108069/108069 [18:05<00:00, 99.58it/s] 


108069

In [11]:
# Summary statistics of the raw comment scores.
new_df.score.describe()

count    108069.000000
mean          6.298948
std          23.469071
min        -125.000000
25%           1.000000
50%           2.000000
75%           5.000000
max        1358.000000
Name: score, dtype: float64

In [12]:
# Parameters of the normal distribution used to turn a raw score into a
# percentile.
# NOTE(review): these differ from the mean/std printed by
# new_df.score.describe() for this download - confirm which sample they were
# fitted on before changing them, since they define the regression target.
SCORE_LOC = 6.830742
SCORE_SCALE = 23.973519

new_df = (
    new_df
    .assign(
        # Percentile (0-100) of each score under Normal(SCORE_LOC, SCORE_SCALE).
        # norm.cdf is applied to the whole column at once, and rounding happens
        # after scaling so the stored values are exact whole numbers instead of
        # float artifacts such as 56.99999999999999.
        normalized_score = lambda df: np.round(
            norm.cdf(df["score"], loc=SCORE_LOC, scale=SCORE_SCALE) * 100
        ),
        # One '|'-delimited lemma string per comment, aligned by row order.
        lemma_array = features,
    )
    .assign(
        # Number of lemma tokens in each comment.
        body_length = lambda df: df.lemma_array.apply(lambda lemmas: len(lemmas.split("|")))
    )
)
new_df.head()

Unnamed: 0,body,score_hidden,archived,name,author,author_flair_text,downs,created_utc,subreddit_id,link_id,...,controversiality,gilded,id,subreddit,ups,distinguished,author_flair_css_class,normalized_score,lemma_array,body_length
0,"Hey Matt, I didn't get a chance to read the ar...",,,,gropingpriest,'09 Cobalt SS t/c,,1482193840,t5_2qhl2,t3_5j8dou,...,1,0,dbeiwkd,cars,,,,33.0,"hey|Matt|,|-PRON-|do|not|get|a|chance|to|read|...",26
1,"What, did it fall down behind the washing mach...",,,,ArmourChinker,Commodore ����,,1481770677,t5_2qhl2,t3_5ies4g,...,0,0,db7ofxv,cars,,,,25.0,"what|,|do|-PRON-|fall|down|behind|the|washing|...",18
2,Says the guy with the rwd subaru.,,,,2az-fe,"13 BMW 535xi, 13 Toyota Highlander, 04 Toyota ...",,1482648961,t5_2qhl2,t3_5k52fn,...,0,0,dblvijb,cars,,,,78.0,say|the|guy|with|the|rwd|subaru|.,8
3,looks about right to me [red 570s](http://www....,,,,jebusv20,VW Polo GTi 1.4TSI,,1481520805,t5_2qhl2,t3_5huw99,...,0,0,db372d8,cars,,,,100.0,look|about|right|to|-PRON-|[|red|570s](http://...,21
4,What do you think the GT is going to be worth ...,,,,Spicy_Curry,2016 340xi ; 2017 GTI,,1482389580,t5_2qhl2,t3_5jova0,...,0,0,dbhwl61,cars,,,,99.0,what|do|-PRON-|think|the|GT|be|go|to|be|worth|...,48


In [13]:
# Cache the parsed frame as a tab-separated file (row index omitted); the
# Word2Vec section reads this file back with pd.read_csv.
new_df.to_csv(
    "parsed_dataset.tsv", 
    sep="\t", index=False
)
new_df.head()

Unnamed: 0,body,score_hidden,archived,name,author,author_flair_text,downs,created_utc,subreddit_id,link_id,...,controversiality,gilded,id,subreddit,ups,distinguished,author_flair_css_class,normalized_score,lemma_array,body_length
0,"Hey Matt, I didn't get a chance to read the ar...",,,,gropingpriest,'09 Cobalt SS t/c,,1482193840,t5_2qhl2,t3_5j8dou,...,1,0,dbeiwkd,cars,,,,33.0,"hey|Matt|,|-PRON-|do|not|get|a|chance|to|read|...",26
1,"What, did it fall down behind the washing mach...",,,,ArmourChinker,Commodore ����,,1481770677,t5_2qhl2,t3_5ies4g,...,0,0,db7ofxv,cars,,,,25.0,"what|,|do|-PRON-|fall|down|behind|the|washing|...",18
2,Says the guy with the rwd subaru.,,,,2az-fe,"13 BMW 535xi, 13 Toyota Highlander, 04 Toyota ...",,1482648961,t5_2qhl2,t3_5k52fn,...,0,0,dblvijb,cars,,,,78.0,say|the|guy|with|the|rwd|subaru|.,8
3,looks about right to me [red 570s](http://www....,,,,jebusv20,VW Polo GTi 1.4TSI,,1481520805,t5_2qhl2,t3_5huw99,...,0,0,db372d8,cars,,,,100.0,look|about|right|to|-PRON-|[|red|570s](http://...,21
4,What do you think the GT is going to be worth ...,,,,Spicy_Curry,2016 340xi ; 2017 GTI,,1482389580,t5_2qhl2,t3_5jova0,...,0,0,dbhwl61,cars,,,,99.0,what|do|-PRON-|think|the|GT|be|go|to|be|worth|...,48


# Train Word2vec Model

In [14]:
# Load the cached dataset and keep only comments with 2 to 300 lemma tokens.
dataset_path = Path("parsed_dataset.tsv")
if not dataset_path.exists():
    # Fail here with a clear message rather than with a NameError on data_df
    # in a later cell.
    raise FileNotFoundError(
        f"{dataset_path} not found; run the 'Create the Dataset' section first"
    )

data_df = (
    pd.read_csv(dataset_path, sep="\t")
    .query("body_length > 1 and body_length < 301")
)
# Kept at the top level of the cell: an expression nested inside an `if` block
# is not rendered as the cell's output.
data_df.head()

In [18]:
# Reuse the cached Word2Vec model when it exists on disk; otherwise train one
# on the '|'-delimited lemma strings (size=100, iter=1) and cache it.
word2vec_path = Path("reddit_vocab.model")
if not word2vec_path.exists():
    lemma_sentences = [
        lemmas.split('|') if len(lemmas) > 0 else ''
        for lemmas in data_df.lemma_array.tolist()
    ]
    word_model = Word2Vec(lemma_sentences, size=100, iter=1)
    word_model.save("reddit_vocab.model")
else:
    word_model = Word2Vec.load("reddit_vocab.model")

KeyboardInterrupt: 

In [21]:
# Number of distinct tokens in the Word2Vec vocabulary.
vocab_length = len(word_model.wv.vocab)
vocab_length

15135

In [22]:
# Give every vocabulary word an integer id, counting up from 2. Ids 0 and 1
# stay free: the notebook pads sequences with 0 and maps unknown tokens to 1.
vocab_mapper = dict(zip(word_model.wv.vocab, itertools.count(2)))
print(list(vocab_mapper)[0:4])

hey


In [23]:
import pickle

# Persist the token -> id mapping next to the model artefacts.
# The directory is created if needed, and the `with` block guarantees the file
# handle is flushed and closed (the bare open() call never closed it).
vocab_path = Path("storage/transformer/model_vocab.pkl")
vocab_path.parent.mkdir(parents=True, exist_ok=True)
with vocab_path.open("wb") as vocab_file:
    pickle.dump(vocab_mapper, vocab_file)

In [25]:
# Collect the embedding vectors in ascending id order, so the vector at list
# position i belongs to the word whose id is i + 2.
correct_vocab = [
    word_model.wv[token]
    for token in sorted(vocab_mapper, key=vocab_mapper.get)
]

In [26]:
# Two extra embedding rows that go in front of the vocabulary vectors:
#   - an all-zero vector (the notebook pads sequences with id 0)
#   - a random vector drawn uniformly between the smallest and largest
#     embedding values (the notebook maps unknown tokens to id 1)
# The stacked matrix is built once and the vector width is read from it, so
# this cell follows whatever dimension the Word2Vec model was trained with.
embedding_matrix = np.stack(correct_vocab)
embedding_dim = embedding_matrix.shape[1]
fill_vectors = [
  np.zeros(embedding_dim),
  np.random.uniform(
      embedding_matrix.min(),
      embedding_matrix.max(),
      size=embedding_dim,
  )
]

In [27]:
# Prepend the two filler rows to the vocabulary vectors to form the embedding
# matrix. The length check makes the cell safe to re-run: once the filler rows
# are in place the matrix is two rows longer than the vocabulary, so they are
# not prepended a second time.
if len(correct_vocab) == len(vocab_mapper):
    correct_vocab = np.stack(fill_vectors + list(correct_vocab))
assert len(correct_vocab) == len(vocab_mapper) + 2, (
    "embedding matrix must hold one row per vocabulary word plus 2 filler rows"
)
print(correct_vocab.shape)
correct_vocab

(15137, 100)


array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 2.75489497e+00, -4.05982283e+00, -3.01992025e-01, ...,
         5.26690822e+00, -6.07171475e+00,  5.69176936e+00],
       [-1.00234616e+00, -5.83511412e-01,  4.95817602e-01, ...,
         7.56369710e-01,  8.26566219e-01,  1.92568326e+00],
       ...,
       [ 1.65504724e-01,  3.61217968e-02,  2.61429191e-01, ...,
        -3.69425416e-01,  5.30689619e-02, -2.24623859e-01],
       [-3.18039656e-02,  1.26783118e-01, -1.26064330e-01, ...,
        -2.41369940e-03,  4.62743193e-01, -6.72298297e-02],
       [ 4.77407664e-01, -1.92143098e-01,  2.03233689e-01, ...,
         5.50392494e-02,  5.57754159e-01, -6.53721020e-02]])

# Train the Model

## Create the Validation, Test, and Training Sets

In [28]:
# Count how many comments share each normalized score, then show only the
# scores that occur more than 500 times.
score_groups = data_df.normalized_score.value_counts()
score_groups[score_groups > 500]

40.0     40815
42.0     18365
44.0     10046
45.0      4331
47.0      4154
49.0      3133
39.0      2768
50.0      2463
52.0      1855
54.0      1495
100.0     1342
55.0      1306
57.0      1085
37.0       940
59.0       887
60.0       762
62.0       701
63.0       557
65.0       533
Name: normalized_score, dtype: int64

In [29]:
# Validation set: for every normalized score that occurs more than 500 times,
# draw up to 100 comments with a fixed random_state.
frequent_scores = score_groups[score_groups > 500].index.tolist()
validation_set = (
    data_df[data_df.normalized_score.isin(frequent_scores)]
    .groupby("normalized_score")
    .apply(lambda group: group.sample(min(len(group)-2, 100), random_state=100))
    .reset_index(drop=True)
)
print(validation_set.shape)
validation_set.normalized_score.value_counts()

(1900, 23)


57.0     100
50.0     100
39.0     100
40.0     100
42.0     100
44.0     100
45.0     100
47.0     100
49.0     100
52.0     100
65.0     100
54.0     100
59.0     100
60.0     100
62.0     100
63.0     100
100.0    100
55.0     100
37.0     100
Name: normalized_score, dtype: int64

In [30]:
# Test set: same per-score sampling as the validation set, restricted to
# comments whose link_id does not appear in the validation set.
frequent_scores = score_groups[score_groups > 500].index.tolist()
outside_validation = ~data_df.link_id.isin(validation_set.link_id.tolist())
test_set = (
    data_df[outside_validation & data_df.normalized_score.isin(frequent_scores)]
    .groupby("normalized_score")
    .apply(lambda group: group.sample(min(len(group), 100), random_state=100))
    .reset_index(drop=True)
)
print(test_set.shape)
test_set.normalized_score.value_counts()

(1797, 23)


57.0     100
50.0     100
39.0     100
40.0     100
42.0     100
44.0     100
45.0     100
47.0     100
49.0     100
52.0     100
54.0     100
59.0     100
60.0     100
55.0     100
37.0     100
62.0      98
63.0      85
65.0      67
100.0     47
Name: normalized_score, dtype: int64

In [31]:
# Training set: every remaining comment whose link_id appears in neither the
# validation set nor the test set.
held_out_links = set(validation_set.link_id.tolist()) | set(test_set.link_id.tolist())
training_set = data_df[~data_df.link_id.isin(held_out_links)]
print(training_set.shape)
training_set.normalized_score.value_counts()

(12444, 23)


40.0    7640
42.0    2312
44.0    1003
45.0     315
47.0     280
39.0     243
49.0     200
50.0     116
52.0      67
54.0      38
55.0      28
36.0      27
37.0      22
57.0      18
59.0      16
66.0      14
33.0      13
34.0      12
68.0      10
71.0       8
69.0       8
30.0       7
74.0       6
31.0       6
72.0       5
28.0       5
75.0       4
81.0       2
78.0       2
79.0       2
96.0       2
25.0       2
76.0       2
89.0       2
60.0       1
87.0       1
94.0       1
24.0       1
6.0        1
80.0       1
88.0       1
Name: normalized_score, dtype: int64

In [32]:
# Split name -> DataFrame; the following cells iterate over this mapping to
# build the tensors for each split.
df_map = {
    "train":training_set,
    "validation": validation_set,
    "test":test_set
}

In [33]:
# Regression targets: one float tensor of normalized scores per split.
Y_map = {
    split: torch.FloatTensor(frame.normalized_score.tolist())
    for split, frame in df_map.items()
}

In [34]:
# Model inputs: every comment becomes a list of token ids. Tokens that are
# missing from vocab_mapper are replaced by id 1.
X_map = {
    split: [
        [vocab_mapper.get(tok, 1) for tok in comment_lemma.split("|")]
        for comment_lemma in frame.lemma_array.tolist()
    ]
    for split, frame in df_map.items()
}

In [35]:
# Longest token sequence across all splits; the padded length is never allowed
# to drop below 300.
longest_sequence = max(
    max(map(len, sequences))
    for sequences in X_map.values()
)
print(longest_sequence)
max_length = max(longest_sequence, 300)
print(max_length)

299
300


In [36]:
# Right-pad every id sequence with zeros up to max_length and stack each split
# into a single LongTensor.
X_map = {
    split: torch.stack([
        torch.LongTensor(
            np.pad(token_ids, (0, max_length - len(token_ids)), 'constant', constant_values=0)
        )
        for token_ids in sequences
    ])
    for split, sequences in X_map.items()
}

In [37]:
# One DataLoader per split, in batches of 128; only the training split is
# shuffled.
data_loader_map = {
    split: data.DataLoader(
        data.TensorDataset(inputs, Y_map[split]),
        batch_size=128,
        shuffle=(split == 'train'),
    )
    for split, inputs in X_map.items()
}

In [38]:
# Number of CUDA devices visible to PyTorch.
torch.cuda.device_count()

1

In [41]:
# Hyper-parameter grids; the training cell enumerates every combination with
# itertools.product.
learning_rate_grid = [0.1, 1e-3, 1e-5]
epochs_grid = [50]
# Each entry is a list of five dropout rates handed to the model's `dropout`
# argument.
dropout_grid = [
    [0.25, 0.25, 0.25, 0.25, 0.25],
    [0.5, 0.5, 0.5, 0.5, 0.5],
    [0.75, 0.75, 0.75, 0.75, 0.75]
]
# The target is a continuous normalized score, so mean squared error is used;
# the classification loss is left commented out.
#loss_fn = nn.CrossEntropyLoss()
loss_fn = nn.MSELoss()

In [45]:
# Final best Parameters
# These assignments replace the sweep grids with a single configuration, so
# the training cell trains exactly one model.
dropout_grid = [[0.25, 0.25, 0.25, 0.25, 0.25]]
learning_rate_grid = [1e-3]
epochs_grid = [100]#, 200]

In [48]:
from ml_model import RedditTransformerPredictor

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Train one model per (epochs, learning rate, dropout) combination. Training
# and validation loss are logged to TensorBoard once per epoch.
training_grid = itertools.product(epochs_grid, learning_rate_grid, dropout_grid)
for epoch_param, lr_param, dropout_param in training_grid:

    predictor_model = RedditTransformerPredictor(
        torch.FloatTensor(correct_vocab),
        max_length=max_length,
        vocab_dim=100,
        dropout=dropout_param
    )
    # Move the model once, before the optimizer is created, instead of calling
    # .cuda() again at the start of every epoch.
    predictor_model.to(device)

    # The run directory name is the formatted tuple of the run's parameters;
    # it is kept unchanged so new runs sit next to existing logs.
    writer = SummaryWriter(
        "storage/param_sweep/transformer/"
        f"{epoch_param, str(lr_param), ','.join(map(str,dropout_param)), 2}"
    )
    optimizer = opt.Adam(predictor_model.parameters(), lr=lr_param)

    try:
        for epoch in tqdm.tqdm(range(epoch_param+1)):

            # Epoch 0 skips training, so the first validation point records the
            # loss of the untrained network.
            if epoch > 0:
                epoch_loss = []

                for inputs, targets in data_loader_map['train']:
                    optimizer.zero_grad()
                    prediction = predictor_model(inputs.to(device))
                    # Targets follow the prediction's device, so the loss is
                    # computed where the prediction already lives.
                    loss = loss_fn(
                        prediction,
                        targets.unsqueeze(dim=1).to(prediction.device)
                    )
                    epoch_loss.append(loss.item())
                    loss.backward()
                    optimizer.step()

                writer.add_scalar("Loss/train", np.mean(epoch_loss), epoch)

            # Set model to evaluation
            predictor_model.eval()

            val_loss = []
            # No gradients are needed for validation; skipping autograd avoids
            # building a graph for every validation batch.
            with torch.no_grad():
                for inputs, targets in data_loader_map['validation']:
                    prediction = predictor_model(inputs.to(device))
                    loss = loss_fn(
                        prediction,
                        targets.unsqueeze(dim=1).to(prediction.device)
                    )
                    val_loss.append(loss.item())

            writer.add_scalar("Loss/val", np.mean(val_loss), epoch)

            # Set model for training
            predictor_model.train()
    finally:
        # Flush and close the TensorBoard writer even if training is
        # interrupted or raises.
        writer.close()

100%|██████████| 101/101 [49:41<00:00, 29.52s/it]


In [49]:
# Score the trained model on the held-out test split, collecting the loss of
# every batch in test_loss.
predictor_model.eval()
test_loss = []
# Evaluation needs no gradients; torch.no_grad() stops autograd from building
# a graph for every batch.
with torch.no_grad():
    for batch in tqdm.tqdm(data_loader_map['test']):

        if torch.cuda.is_available():
            batch[0] = batch[0].cuda()

        prediction = predictor_model(batch[0])
        loss = loss_fn(prediction.cpu(), batch[1].unsqueeze(dim=1))
        test_loss.append(loss.item())

100%|██████████| 15/15 [00:01<00:00, 10.48it/s]


In [50]:
# Mean of the per-batch test losses collected in test_loss.
print(np.mean(test_loss))

565.4683766682942


In [51]:
# Save only the model's state_dict (not the whole module object) to disk.
torch.save(
    predictor_model.state_dict(), 
   "storage/transformer/reddit_transformer_predictor.pth"
)

In [52]:
# Display the module structure; the commented-out line would list every named
# parameter instead.
#list(predictor_model.named_parameters())
predictor_model

RedditTransformerPredictor(
  (input_layer): Embedding(15137, 100)
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): _LinearWithBias(in_features=100, out_features=100, bias=True)
    )
    (linear1): Linear(in_features=100, out_features=50, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=50, out_features=100, bias=True)
    (norm1): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=100, out_features=100, bias=True)
        )
        (linear1): Linear(in_features=100, out_features=50, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
       