### Downloading reddit comments

In [None]:
from RWV.pushshift.get_data_threading import GetContent, timestamp_to_utc

In [None]:
def get_comments(after, before, subreddit, save_file_name):
    """Download the comments of one subreddit for a time window and save them.

    Builds a ``GetContent`` downloader for ``content='comment'`` and runs its
    ``get_content()`` and ``save_content()`` steps in that order.

    Args:
        after: lower bound of the time window (callers in this notebook pass
            epoch seconds derived from ``time.time()``).
        before: upper bound of the time window (same units as ``after``).
        subreddit: name of the subreddit to query, e.g. ``'askreddit'``.
        save_file_name: name handed to ``save_content()`` for the output file.

    Returns:
        None.
    """
    # Fixed downloader settings used for every request in this notebook.
    fetch_options = dict(after=after, before=before, subreddit=subreddit, content='comment',
                         thread_num=4, max_per_sec=1, log_level='info')

    downloader = GetContent(**fetch_options)
    downloader.get_content()
    downloader.save_content(save_file_name)

In [None]:
import time

days = 7
delta_t = 24*60*60      # length of one download window in seconds (one day)

now_time = time.time()

# Walk backwards from "now" one day at a time. The upper bound of each window
# (t2) is the lower bound (t1) of the window downloaded just before it.
t2 = now_time
for day in range(days):
    day_number = day + 1
    t1 = now_time - day_number*delta_t

    window_msg = 'getting comments from {} to {}'.format(timestamp_to_utc(t1), timestamp_to_utc(t2))
    print(window_msg)

    # One output file per day: askreddit_1 is the most recent day.
    get_comments(t1, t2, 'askreddit', 'askreddit_{}'.format(day_number))
    t2 = t1

### Checking what was downloaded

In [None]:
import os

# Directory (relative to the current working directory) where downloaded data is stored.
path = os.path.join(os.path.abspath(''), 'RWV', 'data', 'reddit_data')

# os.listdir() returns entries in arbitrary order; sort them so that positional
# lookups such as saved_reddit[0] refer to the same file on every run and machine.
file_lst = sorted(os.listdir(path))

# Every entry except the .gitignore placeholder is treated as a saved data file.
saved_reddit = [file_name for file_name in file_lst if file_name != '.gitignore']

saved_reddit

In [None]:
from RWV.pushshift.load_data import Content

def load_saved(file_name):
    """Load the comments stored in a saved data file.

    Args:
        file_name: name of the saved file, passed straight to ``Content``.

    Returns:
        Whatever ``Content(file_name).load_comments()`` returns (a list of
        Comment objects, per the original note in this notebook).
    """
    return Content(file_name).load_comments()


# Load the first saved file; later cells reuse this list.
comments = load_saved(saved_reddit[0])

In [None]:
# Preview the first five comments. `comments` already holds the contents of
# saved_reddit[0] (loaded in the previous cell), so slice it instead of reading
# and parsing the same file from disk a second time.
small_comments = comments[0:5]
for comment in small_comments:
    print('author: {}\ntext: {}\ntime: {}\n\n'.format(
        comment.author, comment.body, timestamp_to_utc(comment.created_utc)))

In [None]:
from RWV.text_processing.process_reddit import word2vec_input, count_words

# Convert the loaded comments into the input format used for word2vec training.
w2v_input = word2vec_input(comments)

print('word2vec input example:\n', word2vec_input(small_comments), '\n')

# Word frequencies over the whole input; list the 20 most frequent words.
word_count = count_words(w2v_input)
most_common = sorted(word_count, key=word_count.get, reverse=True)[:20]

for word in most_common:
    print('{}: {}'.format(word, word_count[word]))

# Total number of items across all entries of the input.
count = sum(len(sentence) for sentence in w2v_input)

print('\ntotal sentences: {}, total words: {}'.format(len(w2v_input), count))

## Vector models
This is just an example of how things work, and a check that they do work. Testing was done with only 2 days' worth of r/askreddit data and min_count=50, which is too big for this small example (many "common" words are missing from the vocabulary).

### Word2vec model

In [None]:
from RWV.vector_model.word_embeddings import WordEmbedding

In [None]:
start = time.time()

# Use only 2 days of data (the 1st and 3rd saved files).
# The selection is kept in its own variable: overwriting `saved_reddit` with a
# 2-element list makes this cell non-repeatable and makes any later
# `saved_reddit[2]` lookup raise IndexError. If the list holds two files or
# fewer (e.g. it was already narrowed), it is used as-is.
w2v_files = [saved_reddit[0], saved_reddit[2]] if len(saved_reddit) > 2 else list(saved_reddit)

w2v_model = WordEmbedding(model_type='word2vec')
w2v_model.make_model(sentences=w2v_files, content='comment', epochs=5, size=300, window=5,
                     min_count=50, workers=8)
w2v_model.save_model('w2v_askreddit.kv')

print('finished in: {}'.format(time.time() - start))

# finished in: 377.2292971611023
# file size: 15.4 MB

### FastText model

In [None]:
start = time.time()

# Use only 2 days of data (the 1st and 3rd saved files).
# `saved_reddit` may already have been narrowed to two files by an earlier
# cell, in which case indexing [2] would raise IndexError; only re-select when
# the full listing is present, and never overwrite `saved_reddit` itself.
ft_files = [saved_reddit[0], saved_reddit[2]] if len(saved_reddit) > 2 else list(saved_reddit)

ft_model = WordEmbedding(model_type='fasttext')
ft_model.make_model(sentences=ft_files, content='comment', epochs=5, size=300, window=5,
                    min_count=50, bucket=1000000, workers=8)
ft_model.save_model('ft_askreddit.kv')

print('finished in: {}'.format(time.time() - start))

# finished in: 455.5157127380371
# file size: 1.2 GB  ->  reduce vector size or buckets for smaller file size

### Doc2vec model

In [None]:
start = time.time()

# Use only 2 days of data (the 1st and 3rd saved files).
# `saved_reddit` may already have been narrowed to two files by an earlier
# cell, in which case indexing [2] would raise IndexError; only re-select when
# the full listing is present, and never overwrite `saved_reddit` itself.
d2v_files = [saved_reddit[0], saved_reddit[2]] if len(saved_reddit) > 2 else list(saved_reddit)

# Bound to its own name so the doc2vec model does not replace the fasttext
# model held in `ft_model`.
d2v_model = WordEmbedding(model_type='doc2vec')
d2v_model.make_model(sentences=d2v_files, content='comment', epochs=5, size=300,
                     window=5, min_count=50, workers=8)
d2v_model.save_model('d2v_askreddit.kv')

print('finished in: {}'.format(time.time() - start))

# finished in: 566.7566244602203
# file size: 15.4 MB

### Testing word vector models
https://radimrehurek.com/gensim/models/keyedvectors.html

In [None]:
# (model type, saved file) pairs, listed in the order they are stored in `models`.
saved_models = [
    ('word2vec', 'w2v_askreddit.kv'),
    ('fasttext', 'ft_askreddit.kv'),
    ('doc2vec', 'd2v_askreddit.kv'),
]

# Load each saved model through a WordEmbedding of the matching type.
w2v, ft, d2v = [WordEmbedding(model_type).load_model(file_name)
                for model_type, file_name in saved_models]

models = [w2v, ft, d2v]

In [None]:
# Run the same set of queries against every loaded model.
# NOTE(review): the loaded objects are presumably gensim KeyedVectors-like
# (see the link in this section's header) - confirm against load_model().
for model in models:
    # vocabulary entry for one word and overall vocabulary size
    print(model.vocab['cat'].count)
    print(len(model.vocab))

    # type of the value returned by a single-word lookup
    print(type(model['cat']))

    # pairwise and nearest-neighbour similarity queries
    print(model.similarity('cat', 'dog'))
    print(model.most_similar(['cat'], topn=3))
    print(model.most_similar(positive=['car', 'cat'], topn=3))
    print(model.doesnt_match(['fire', 'water', 'cat']))

    # comparisons between two small word sets
    print(model.n_similarity(['cat', 'bird'], ['dog', 'fish']))
    print(model.wmdistance(['cat', 'bird'], ['dog', 'fish']))
    print('\n')