In [1]:
import os
import pickle
import time
from pprint import pprint

import numpy as np
import pandas as pd
import tensorflow_hub as hub
import tqdm
import tqdm.notebook
from pymongo import MongoClient

In [2]:
def getMongoUri(auth_filename='auths/mongo_uri.pkl'):
    """Return the MongoDB connection URI stored in a pickled credentials file.

    Args:
        auth_filename: Path to a pickle file whose unpickled object can be
            indexed with the key 'mongo_uri'.

    Returns:
        The value stored under the 'mongo_uri' key.

    Note:
        Unpickling can execute arbitrary code, so only point this at a
        credentials file you created yourself.
    """
    with open(auth_filename, 'rb') as auth_file:
        credentials = pickle.load(auth_file)
    mongo_uri = credentials['mongo_uri']
    return mongo_uri

def createMongoClient(db='aita'):
    """Open a MongoDB connection and return it together with a database handle.

    Args:
        db: Name of the database to select. Defaults to 'aita'.

    Returns:
        A (client, database) tuple: the MongoClient built from the URI
        returned by getMongoUri(), and the database named by `db`.
    """
    client = MongoClient(getMongoUri())
    # Look the database up by name so the `db` argument is honoured; it was
    # previously ignored and the 'aita' database was always selected.
    database = client[db]
    return client, database

In [3]:
# Load the Universal Sentence Encoder (v4) from TF Hub and run a single
# throwaway sentence through it before reporting the model as cached.
USE_MODEL_URL = "https://tfhub.dev/google/universal-sentence-encoder/4"
embed = hub.load(USE_MODEL_URL)
embed(['testing'])
print('model cached')

model cached


In [4]:
# Connect to MongoDB, grab the raw posts collection and count its documents
# (the count is used as the progress-bar total when fetching).
client, db = createMongoClient()
postsCollection = db['posts']
NUM_DOCS = postsCollection.count_documents({})

In [5]:
# Fetch every post, projecting only the fields used downstream (MongoDB still
# returns `_id` unless it is explicitly excluded).
# `tqdm.notebook.tqdm` replaces the deprecated `tqdm.tqdm_notebook`.
post_cursor = postsCollection.find(projection={'title': 1, 'id': 1, 'selftext': 1})
allPosts = list(tqdm.notebook.tqdm(post_cursor, total=NUM_DOCS))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/682771 [00:00<?, ?it/s]

In [6]:
# Build a DataFrame from the list of fetched post documents.
df = pd.DataFrame(allPosts)

In [7]:
# Discard the `_id` column; only id / title / selftext are needed.
df = df.drop(columns='_id').copy()

In [8]:
# Count missing values in each column.
df.isna().sum()

id              1
selftext    14189
title           2
dtype: int64

In [9]:
# (rows, columns) of the frame before any rows are removed.
df.shape

(682771, 3)

In [10]:
# Preview the first few rows.
df.head()

Unnamed: 0,id,selftext,title
0,j1f7am,I’m a 25 yr old female and my husband is 25. W...,AITA for not talking to my mom and not letting...
1,j1f71q,This happened last night and I am still proces...,AITA for telling a women I am not interested a...
2,j1f6tt,I (19 M) have one of my busiest semester this ...,AITA for giving up on a friend who has been di...
3,j1f6r4,[removed],Will I be the asshole if I drop a metaphorical...
4,j1f4so,[removed],AITA if I only want to have sex with my girlfr...


In [11]:
# Remove every row with a missing id, title or selftext, then renumber the
# index so it runs 0..N-1 again.
df = df.dropna().reset_index(drop=True).copy()

In [12]:
# Most frequent selftext values (top five).
df['selftext'].value_counts().head()

[removed]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               

In [13]:
# Most frequent titles (top five).
df['title'].value_counts().head()

AITA                                                                      871
AITA?                                                                     283
Am I the asshole?                                                         100
WIBTA                                                                      88
AIs from AI Dungeon 2 to sexy to funny and one based wholly on Reddit!     79
Name: title, dtype: int64

In [14]:
# Keep only posts whose body is real text, i.e. not one of the placeholder
# strings '[removed]' / '[deleted]'.
PLACEHOLDER_TEXTS = ['[removed]', '[deleted]']
has_real_text = ~df['selftext'].isin(PLACEHOLDER_TEXTS)
df = df[has_real_text].copy()

In [15]:
# (rows, columns) after the placeholder posts were filtered out.
df.shape

(490330, 3)

In [16]:
# Re-point NUM_DOCS at the number of rows left in df (it previously held the
# document count of the raw posts collection).
NUM_DOCS = len(df)

In [17]:
# Titles as a NumPy array, in df row order.
titles = df['title'].values

In [18]:
# Exclusive end index of the batch that starts at `start_idx`, capped at the
# total number of documents so the last batch never runs past the end.
def calcEndIdx(start_idx, batch_size, ndocs):
    """Return min(start_idx + batch_size, ndocs) for integer indices.

    Args:
        start_idx: Index of the first document in the batch.
        batch_size: Maximum number of documents per batch.
        ndocs: Total number of documents.

    Returns:
        The slice end (exclusive) for this batch.
    """
    return min(start_idx + batch_size, ndocs)

In [23]:
# convert text to embeddings in batches (model can handle multiple texts at once)
# batch size depends on compute power
BATCH_SIZE = 1000

embeddings = []  # one array per batch; concatenated after the loop

# `tqdm.notebook.tqdm` replaces the deprecated `tqdm.tqdm_notebook`.
for start_idx in tqdm.notebook.tqdm(range(0, NUM_DOCS, BATCH_SIZE)):
    end_idx = calcEndIdx(start_idx, BATCH_SIZE, NUM_DOCS)
    embeddings.append(embed(titles[start_idx:end_idx]).numpy())

embeddings = np.concatenate(embeddings)  # convert batched arrays to shape (N, Vector Size)

# Vectors are later matched to records purely by position, so make sure
# there is exactly one embedding row per title.
assert len(embeddings) == len(titles), "embedding count does not match title count"

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/491 [00:00<?, ?it/s]

In [32]:
# Convert the DataFrame into a list of dicts, one per row.
clean_records = df.to_dict(orient='records')

In [33]:
# Attach each title embedding to its record. clean_records and embeddings are
# both derived from df in row order, so position i refers to the same post.
# `tqdm.notebook.tqdm` replaces the deprecated `tqdm.tqdm_notebook`, and the
# progress total is taken from the list actually being iterated.
for i, record in tqdm.notebook.tqdm(enumerate(clean_records), total=len(clean_records)):
    # Store the vector as a plain Python list rather than a NumPy array
    # (presumably so the record can be written to MongoDB - confirm).
    record['vector'] = embeddings[i].tolist()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


  0%|          | 0/490330 [00:00<?, ?it/s]

In [34]:
# Handle for the destination collection of cleaned, vectorised posts.
# NOTE(review): with pymongo, dotted access like this should resolve to a
# collection named "clean.posts" - confirm that is the intended name.
cleanPostsCollection = db.clean.posts

In [35]:
# Write all cleaned records (including their 'vector' field) in one call.
# NOTE(review): re-running this cell will attempt to insert the same records
# again - confirm the target collection's state before re-executing.
cleanPostsCollection.insert_many(clean_records)

<pymongo.results.InsertManyResult at 0x7ff6be26ac08>