Prepare entity-tagged Reddit posts into features for disease prediction. Walk through `reddit_validation_embed.ipynb` first.

---

In [74]:
import spacy
nlp = spacy.load('en_core_web_lg')
import pandas as pd
import sys
import random
import pickle
import numpy as np
import pandas as pd
import ast

In [75]:
# # included for convenience to help find correct paths
# import os
# os.getcwd()
# os.listdir("..")

Constants.

In [76]:
# paths
MEDRED_REPRODUCIBLE_DIR = "../"
SYMPTOMS_IN = MEDRED_REPRODUCIBLE_DIR + "data/validation/Reddit/NER_Reddit_pred_dis.csv" # sym_file
DRUGS_IN = MEDRED_REPRODUCIBLE_DIR + "data/validation/Reddit/NER_Reddit_pred_drug.csv" # drug_file
# are features for the DL implementation or MetaMap?
EMBEDDING_TYPE = "DL"
# output path
FEATURES_OUT = MEDRED_REPRODUCIBLE_DIR + "data/validation/Reddit/" + EMBEDDING_TYPE + "_embedded_features.pckl"
# test run? T/F
IS_SAMPLE_RUN = False



# sym_file = "data/entities/{}/{}_symptom_mappings.csv".format(etype, etype)
# drug_file = "data/entities/{}/{}_drugs_mappings.csv".format(etype, etype)
# features_file = "data/features/{}_embdedded_features{}.pckl".format(etype, sample)

Read in data.

In [77]:
all_sr = ['bpd', 'cfs','crohnsdisease', 'dementia',  'depression',\
                    'diabetes', 'dysautonomia', 'gastroparesis','hypothyroidism', 'ibs', \
                    'interstitialcystitis', 'kidneystones', 'menieres', 'multiplesclerosis',\
                    'parkinsons', 'psoriasis', 'rheumatoid', 'sleepapnea']

sym = pd.read_csv(SYMPTOMS_IN) # ,subreddit,matched,UID,norm_UID,post_index,score
drug = pd.read_csv(DRUGS_IN) # ,post_index,subreddit,matched,norm_UID,UID,score

if IS_SAMPLE_RUN:
    df = sym.append(drug).sample(n=1000, random_state=7)
else:
    df = sym.append(drug)
print ("Entity-tagged posts loaded:", len(df))

Entity-tagged posts loaded: 446202


Prepare features.

In [78]:
def embedding_from_tokens(row):
    '''
    Get average embedding for all tokens in a post
    '''
    # get list of term-tagged tokens
    tokens = nlp(row)
    # init
    vec = []
    
    # removed, since not used anywhere else; scalar to halve the mean? why?
    #   word_emb_len initally set to 300, matches # embeddings per post
    #   set as a constant right above original function - a parameter from data prep?
    # vec.append(np.zeros(word_emb_len))

    # for each token, append a list of its embeddings
    for token in tokens:
        if token.has_vector:
            vec.append(token.vector)
    # then make a list of mean embedding values fo reach token
    vec = np.mean(vec, axis=0)
    # handlign for no embedding cases
    if isinstance(vec, np.ndarray):
        vec = vec.tolist()
    else:
        vec = []
    return vec

Define feature preparation code for different cutoffs of embedding fit. Fit one set at 0.9, as the value used in the original work is not specified. Datasets based on other values (as seen in the original feature preparation code) are derived after.

Runtime for this one ~12 minutes.

In [79]:
def save_features_with_certainty(df, certainty, features_file=FEATURES_OUT, embedding_type=EMBEDDING_TYPE):
    '''
    Process features at different score (certainty) cutoffs.
    '''
    raw_features = df[["subreddit", "matched", "post_index", "score"]]
    raw_features = raw_features[ (raw_features["score"].astype(float) > certainty) ]
    # If no values above cutoff, stop early - nothign to write
    if not len(raw_features):
        return

    # additional cleanup based on NER model type
    if embedding_type == "DL":
        #raw_features['matched'] = raw_features['matched'].apply(','.join)
        pass
    elif embedding_type == "MM":
        raw_features['matched'] = raw_features['matched'].apply(ast.literal_eval)
        raw_features['matched'] = raw_features['matched'].apply(' '.join)
    else:
        print ("Non-existent entitiy type, please try again. ")
        sys.exit()

    # format, aggregating entities to post level (comma separated)
    raw_features = raw_features.rename(columns={'matched':'entities'})
    #   format, aggregating only when there are any tags
    def join_entities(df):
        if df.shape[0] > 1:
            df = df.dropna(inplace=False)
        return ', '.join(df)
    raw_features = raw_features.groupby(['post_index','subreddit'])['entities'].apply(join_entities).reset_index()
    raw_features = raw_features.drop(columns=['post_index'])
    print("Total posts with entities over threshold", len(raw_features))
    
    # append file name with certainty level
    features_file = features_file.replace(".pckl", "_{:.2f}.pckl".format(certainty))

    # cast, in order to add vectors to cells
    object_features = raw_features.astype(object)

    # attach embeddings
    object_features['vec'] = object_features['entities'].apply(embedding_from_tokens)
    embedding_vec_list = object_features['vec'].tolist()
    embedding_vec_list = pd.DataFrame(embedding_vec_list)
    features = object_features.copy()

    # tag post with associated disease based on subreddit
    disease_values_dict = {el:i for i, el in enumerate(all_sr)}
    # # these will be used to take disease names for each prediction task
    # disease_names = list(disease_values_dict.keys())
    # disease_labels = list(disease_values_dict.values())
    s = pd.DataFrame()
    s['disease'] = features.apply(lambda x: disease_values_dict[x['subreddit']], axis=1)

    features = features.join(s)
    features.to_pickle(features_file)
    return features

out = save_features_with_certainty(df, 0.9, features_file=FEATURES_OUT, embedding_type=EMBEDDING_TYPE)
out

Total posts with entities over threshold 136480


  return _methods._mean(a, axis=axis, dtype=dtype,


Unnamed: 0,subreddit,entities,vec,disease
0,diabetes,CGM,"[-0.36256998777389526, -0.002186200115829706, ...",5
1,depression,"depression, depression","[-0.170890673995018, 0.7456066608428955, 0.023...",4
2,bpd,"BPD, BPD","[-0.20788399875164032, 0.8302733302116394, -0....",0
3,depression,"depression, depression, depression, depression","[-0.15829943120479584, 0.7350971102714539, -0....",4
4,depression,broken arm,"[-0.041669994592666626, -0.1625255048274994, -...",4
...,...,...,...,...
136475,ibs,IBS,"[-0.645550012588501, 1.003600001335144, -0.554...",9
136476,bpd,"paranoid delusions, paranoia","[-0.29793548583984375, -0.1916699856519699, 0....",0
136477,depression,"clonazepam, clonazepam","[-0.13348400592803955, 0.420906662940979, -0.1...",4
136478,depression,stress,"[-0.5926600098609924, 0.8686400055885315, -0.2...",4


Derive the rest. Lower iterations will take longer, since less terms will be filtered out at lower certainty thresholds.

In [80]:
# 5-10 minutes each
for certainty in np.linspace(0.1,1,9, endpoint=False):
    save_features_with_certainty(df, certainty, features_file=FEATURES_OUT, embedding_type=EMBEDDING_TYPE)

Total posts with entities over threshold 171000


  return _methods._mean(a, axis=axis, dtype=dtype,


Total posts with entities over threshold 170998
Total posts with entities over threshold 170841
Total posts with entities over threshold 170088
Total posts with entities over threshold 168303
Total posts with entities over threshold 164717
Total posts with entities over threshold 159297
Total posts with entities over threshold 149941
Total posts with entities over threshold 136480
