In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import glob
import os
import ast # converts string of list -> list
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import spacy 
from scipy.spatial.distance import cosine

nlp = spacy.load('en', disable=['parser', 'ner'])

from spacy.tokens import Doc

def custom_tokenizer(tokens):
    # with this:
    return Doc(nlp.vocab, tokens)

nlp.tokenizer = custom_tokenizer


In [2]:
################# MODEL PARAMETERS #####################
path_we = "/home/adrian/PhD/Data/FastText_embedding_20190703/ft_wordembeddings_dim300_minCount5_URL-User-toConstant_iter10_20190703"
path_we = "/Users/adrianahne/PhD/Data/FastText_embeddings/ft_wordembeddings_dim300_minCount5_URL-User-toConstant_iter10_20190703"
dataPath = "result_cause_effect_prediction"
csv_files = glob.glob(os.path.join(dataPath, "*.csv"))
len(csv_files)
stopword_list = [word for word in stopwords.words('english') if word not in ["no", "nor", "not", "don", "don't", 'ain',
 'aren',
 "aren't",
 'couldn',
 "couldn't",
 'didn',
 "didn't",
 'doesn',
 "doesn't",
 'hadn',
 "hadn't",
 'hasn',
 "hasn't",
 'haven',
 "haven't",
 'isn',
 "isn't",
 'mightn',
 "mightn't",
 'mustn',
 "mustn't",
 'needn',
 "needn't",
 'shouldn',
 "shouldn't",
 'wasn',
 "wasn't",
 'weren',
 "weren't",
 'won',
 "won't",
 'wouldn',
 "wouldn't"]]



In [3]:

################## LOAD DATA ######################
tuples = []
for file in csv_files:
    with open(file, "r") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            if i == 0: # header
                if line.endswith("\n"):
                    line = line[:-2]
                header = line.split(",")[1:]
            else:
                index, ll = line.split(",", 1)
                ll, io_tags = ll.rsplit("[", 1)
                if io_tags.endswith('\n'):
                    io_tags = io_tags[:-2] # remove \n
                elif io_tags.endswith('"'):
                    io_tags = io_tags[:-1]
                io_tags = ast.literal_eval("["+io_tags)

                text, tokenized = ll.rsplit(',"[', 1)
                tokenized = tokenized[:-3].replace('""', '"')
                tokenized = ast.literal_eval("["+tokenized)

                if text.endswith(',"'):
                    text = text[:-2]
                if text.startswith('"'):
                    text = text[1:]
                if text.endswith('"'):
                    text = text[:-1]

                tuples.append((index, text, tokenized, io_tags))
            
print(len(tuples))

KeyboardInterrupt: 

In [None]:
df = pd.DataFrame(tuples, columns=["index","text", "tokenized", "io_tags"])
df.head()

In [27]:
def extract_causes_effects(row):
    """ Extracts causes and effects from the io_tags column and saves them in new columns """
    #print()
    #print(row["text"])
    #for tag, tok in zip(row["io_tags"], row["tokenized"]):
    #    print(tok, tag)
    if len(row["tokenized"]) != len(row["io_tags"]):
        print("ERROR: Tokenized and tags are not of same length!")
    
    causes = ""
    effects = ""
    last_tag_cause_index = -2
    last_tag_effect_index = -2
    for i, tag in enumerate(row.io_tags):
        if tag == "I-C":
            if i == last_tag_cause_index + 1: # True, if cause consist of several consecutive words
                causes += ","+row["tokenized"][i]
            else: # cause is a new separate cause
                causes += ";"+row["tokenized"][i]
            last_tag_cause_index = i # helps to test if causes and effects consist of CONSECUTIVE words

        elif tag == "I-E":
            if i == last_tag_effect_index + 1: # True, if cause consist of several consecutive words
                effects += ","+row["tokenized"][i]
            else: # cause is a new separate cause
                effects += ";"+row["tokenized"][i]
            last_tag_effect_index = i # helps to test if causes and effects consist of CONSECUTIVE words
    
    if causes.startswith(",") or causes.startswith(";"):
        causes = causes[1:]
    if effects.startswith(",") or effects.startswith(";"):
        effects = effects[1:]        
    #print("causes:", causes)
    #print("effects:", effects)
    
    

    
    return pd.Series([row.index, row.text, row.tokenized, row.io_tags, causes, effects]
                     , index=["index", "text", "tokenized", "io_tags", "causes", "effects"])

df_with_causes_effects = df.apply(extract_causes_effects, axis=1)

In [29]:
df_with_causes_effects.head()

Unnamed: 0,index,text,tokenized,io_tags,causes,effects
0,"Index(['index', 'text', 'tokenized', 'io_tags'...",@USER There will be no shortages of insulin or...,"[@USER, There, will, be, no, shortages, of, in...","[O, O, O, O, O, I-C, I-C, I-C, O, O, O, O, O, ...","shortages,of,insulin",worried
1,"Index(['index', 'text', 'tokenized', 'io_tags'...",Back to the #robotic life and I 'm good with t...,"[Back, to, the, #robotic, life, and, I, 'm, go...","[O, O, O, O, O, O, O, O, O, O, O, I-C, I-C, I-...","#libre,#freestylelibre,#abbott,#t1d,#diabetes,...",
2,"Index(['index', 'text', 'tokenized', 'io_tags'...",@USER I can't take such medications anymore ca...,"[@USER, I, can't, take, such, medications, any...","[O, O, O, O, O, O, O, O, O, O, O, I-C, I-C, O,...","diabetic,condition",
3,"Index(['index', 'text', 'tokenized', 'io_tags'...",If Brexit kills me because insulin is made in ...,"[If, Brexit, kills, me, because, insulin, is, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",,
4,"Index(['index', 'text', 'tokenized', 'io_tags'...","cool thing here , im only not diabetic anymore...","[cool, thing, here, ,, im, only, not, diabetic...","[O, O, O, O, O, O, I-C, I-C, O, O, O, O, O, O,...","not,diabetic",


In [31]:
###################### STORE RESULTS FOR EASIER ACCESS #####################
#del df_with_causes_effects["index"]
#df_with_causes_effects.to_parquet("result_cause_effect_prediction_all.parquet")

In [3]:
########### LOAD DATA ###################
df_with_causes_effects = pd.read_parquet("result_cause_effect_prediction_all.parquet")
df_with_causes_effects.shape

(265328, 5)

In [4]:
#################### Only consider tweets with both cause and effect ###############################
df_new = df_with_causes_effects[(df_with_causes_effects.causes.str.len() > 0 ) & (df_with_causes_effects.effects.str.len() > 0)]
df_new.reset_index(drop=True, inplace=True)
print(df_new.shape)
df_new.head()

(96676, 5)


Unnamed: 0,text,tokenized,io_tags,causes,effects
0,@USER There will be no shortages of insulin or...,"[@USER, There, will, be, no, shortages, of, in...","[O, O, O, O, O, I-C, I-C, I-C, O, O, O, O, O, ...","shortages,of,insulin",worried
1,someone 's a type II diabetic and they basical...,"[someone, 's, a, type, II, diabetic, and, they...","[O, O, O, I-C, I-C, I-C, O, O, O, I-E, I-E, I-...","type,II,diabetic","can't,eat,food"
2,"I would use it to fix my truck , keep bills pa...","[I, would, use, it, to, fix, my, truck, ,, kee...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",diabetes,"deformed,my,feet"
3,@USER I want to get rid of the insulin too bec...,"[@USER, I, want, to, get, rid, of, the, insuli...","[O, O, O, O, I-E, I-E, I-E, I-E, I-E, O, O, O,...",stress,"get,rid,of,the,insulin;diet"
4,Just drunk that nasty ass shit for this glucos...,"[Just, drunk, that, nasty, ass, shit, for, thi...","[O, O, O, O, O, O, O, O, I-C, I-C, O, O, O, I-...","glucose,test","feel,sick,:nauseated_face:"


In [5]:
################## SPLIT CAUSES AND EFFECTS s.t. each row has one cause and one effect #######################
df_new = df_new.assign(causes=df_new["causes"].str.split(";")).explode("causes")
df_new = df_new.assign(effects=df_new["effects"].str.split(";")).explode("effects")
df_new.reset_index(drop=True, inplace=True)
print(df_new.shape)
df_new.head(20)

(149798, 5)


Unnamed: 0,text,tokenized,io_tags,causes,effects
0,@USER There will be no shortages of insulin or...,"[@USER, There, will, be, no, shortages, of, in...","[O, O, O, O, O, I-C, I-C, I-C, O, O, O, O, O, ...","shortages,of,insulin",worried
1,someone 's a type II diabetic and they basical...,"[someone, 's, a, type, II, diabetic, and, they...","[O, O, O, I-C, I-C, I-C, O, O, O, I-E, I-E, I-...","type,II,diabetic","can't,eat,food"
2,"I would use it to fix my truck , keep bills pa...","[I, would, use, it, to, fix, my, truck, ,, kee...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",diabetes,"deformed,my,feet"
3,@USER I want to get rid of the insulin too bec...,"[@USER, I, want, to, get, rid, of, the, insuli...","[O, O, O, O, I-E, I-E, I-E, I-E, I-E, O, O, O,...",stress,"get,rid,of,the,insulin"
4,@USER I want to get rid of the insulin too bec...,"[@USER, I, want, to, get, rid, of, the, insuli...","[O, O, O, O, I-E, I-E, I-E, I-E, I-E, O, O, O,...",stress,diet
5,Just drunk that nasty ass shit for this glucos...,"[Just, drunk, that, nasty, ass, shit, for, thi...","[O, O, O, O, O, O, O, O, I-C, I-C, O, O, O, I-...","glucose,test","feel,sick,:nauseated_face:"
6,Me and my sister literally CRIED for him when ...,"[Me, and, my, sister, literally, CRIED, for, h...","[O, O, O, O, O, I-E, O, O, O, O, O, O, O, O, I...",diabetes,CRIED
7,@USER @USER I know people with severe depressi...,"[@USER, @USER, I, know, people, with, severe, ...","[O, O, O, O, O, O, O, I-E, O, O, O, I-C, I-C, ...","Type,2,diabetes",depression
8,I have gestational diabetes now with my pregna...,"[I, have, gestational, diabetes, now, with, my...","[O, O, I-C, I-C, O, O, O, I-C, O, O, O, I-E, I...","gestational,diabetes","stop,all,the,sugary,drinks"
9,I have gestational diabetes now with my pregna...,"[I, have, gestational, diabetes, now, with, my...","[O, O, I-C, I-C, O, O, O, I-C, O, O, O, I-E, I...",pregnancy,"stop,all,the,sugary,drinks"


In [6]:
########## load FastText vectors #####################
from gensim.models.fasttext import FastText
model = FastText.load(path_we)

In [6]:
# Take random causes + effects to cluster manually
#df_new.sample(n=1000, random_state=0).effects.values.tolist()

In [269]:
# remove stopwords
# lowercase
#manual_clusters = pd.read_excel("/home/adrian/workspace/causality/Causal-associations-diabetes-twitter/data/Causes_effects_clusters.xlsx")
manual_clusters = pd.read_excel("/Users/adrianahne/workspace/PhD/causality/Causal-associations-diabetes-twitter/data/Causes_effects_clusters.xlsx")
manual_clusters = manual_clusters[["Parent name", "Cluster name", "Synonyms"]].dropna(subset=["Cluster name"])
manual_clusters.reset_index(drop=True, inplace=True)

manual_clusters.head(20)

Unnamed: 0,Parent name,Cluster name,Synonyms
0,Diabetes,diabetes,"diabetic, #diabetic, #diabetes, diabetes melli..."
1,Diabetes,reverse diabetes,
2,Diabetes,T1D,"type 1 diabetes, type 1, #type1, #type1diabete..."
3,Diabetes,T2D,"type 2 diabetes, type 2, #type, #type2diabetes..."
4,Diabetes,diagnosis,
5,Diabetes,management,"control diabetes, uncontrol"
6,Insulin,insulin,"insulin hormone, supplies"
7,Insulin,rationing insulin,"shortage insulin, denying insulin, lack insuli..."
8,Insulin,unable to afford insulin,"can't afford insulin, no access to affordable ..."
9,Insulin,affordable insulin,afford insulin


In [120]:
def addCenterVector(row):
    """ calculates mean (center) vector cluster name and its synonyms """

    vectors = [np.array([model[word] for word in row["Cluster name"]]).mean(axis=0)] # initialise with center vector of cluster name
    #print("\nSynonyms:", row["Synonyms"], type( row["Synonyms"]))
    if not isinstance(row["Synonyms"], float) : # if not nan
        for synonym in row["Synonyms"].split(","):
            filtered_synonym = [word for word in synonym.strip().split(" ") if word not in stopword_list]
            #print("\t",filtered_synonym)
            vectors.append(np.array([model[word] for word in filtered_synonym]).mean(axis=0)) # add mean vector of each synonym (which can be "horrible vision")
            #print(len(vectors))
    cluster_center = np.array(vectors).mean(axis=0)
    return cluster_center

#manual_clusters["center"] = manual_clusters.apply(addCenterVector, axis=1)
#print(manual_clusters.shape)
#manual_clusters.head()

In [202]:
cosine_similarity(model["obesity"].reshape(1,-1), model["Obesity"].reshape(1,-1))

array([[0.5111686]], dtype=float32)

In [10]:
def lemma(syn):
    return [token.string.strip() if token.string.startswith("#") else token.lemma_.strip() for token in nlp(syn)]


def lemmatize(causeOrEffect, mode="causeEffect"):
    """ clusters = False : lemmatization + prep for causes and effects
        clusters = True: lemmatization + prep for clusters + synonyms (have different splits)
    """
    try:

        if isinstance(causeOrEffect, float) :
            return np.nan, np.nan
        else:
            if mode == "causeEffect":
                prep = preprocess(causeOrEffect.strip().split(","))
                causeOrEffect_lemma = lemma(prep)
            elif mode == "clusters":
                prep = preprocess(causeOrEffect.strip().split(" "))
                causeOrEffect_lemma = lemma(prep)
            elif mode == "cluster_synonyms":
                prep = [preprocess(syn.strip().split(" ")) for syn in causeOrEffect.strip().split(",")]
                causeOrEffect_lemma = [lemma(syn) for syn in prep]
            return prep, causeOrEffect_lemma

    except:
        print("causeOrEffect:", causeOrEffect)
        #print("prep:", prep)
    
def preprocess(phrase):
    if isinstance(phrase, float):
        return phrase
    prep = [word for word in phrase if word not in stopwords.words('english') if word != ""]
    return prep




In [11]:
df_new.head()

Unnamed: 0,text,tokenized,io_tags,causes,effects,cluster_cause,cluster_effect
0,@USER There will be no shortages of insulin or...,"[@USER, There, will, be, no, shortages, of, in...","[O, O, O, O, O, I-C, I-C, I-C, O, O, O, O, O, ...","shortages,of,insulin",worried,,
1,someone 's a type II diabetic and they basical...,"[someone, 's, a, type, II, diabetic, and, they...","[O, O, O, I-C, I-C, I-C, O, O, O, I-E, I-E, I-...","type,II,diabetic","can't,eat,food",,
2,"I would use it to fix my truck , keep bills pa...","[I, would, use, it, to, fix, my, truck, ,, kee...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",diabetes,"deformed,my,feet",,
3,@USER I want to get rid of the insulin too bec...,"[@USER, I, want, to, get, rid, of, the, insuli...","[O, O, O, O, I-E, I-E, I-E, I-E, I-E, O, O, O,...",stress,"get,rid,of,the,insulin",,
4,@USER I want to get rid of the insulin too bec...,"[@USER, I, want, to, get, rid, of, the, insuli...","[O, O, O, O, I-E, I-E, I-E, I-E, I-E, O, O, O,...",stress,diet,,


In [12]:
df_new["causes_prep"], df_new["causes_lemma"] = zip(*df_new["causes"].map(lemmatize))
df_new["effects_prep"], df_new["effects_lemma"] = zip(*df_new["effects"].map(lemmatize))

df_new.head()

Unnamed: 0,text,tokenized,io_tags,causes,effects,cluster_cause,cluster_effect,causes_prep,causes_lemma,effects_prep,effects_lemma
0,@USER There will be no shortages of insulin or...,"[@USER, There, will, be, no, shortages, of, in...","[O, O, O, O, O, I-C, I-C, I-C, O, O, O, O, O, ...","shortages,of,insulin",worried,,,"[shortages, insulin]","[shortages, insulin]",[worried],[worried]
1,someone 's a type II diabetic and they basical...,"[someone, 's, a, type, II, diabetic, and, they...","[O, O, O, I-C, I-C, I-C, O, O, O, I-E, I-E, I-...","type,II,diabetic","can't,eat,food",,,"[type, II, diabetic]","[type, II, diabetic]","[can't, eat, food]","[can't, eat, food]"
2,"I would use it to fix my truck , keep bills pa...","[I, would, use, it, to, fix, my, truck, ,, kee...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",diabetes,"deformed,my,feet",,,[diabetes],[diabetes],"[deformed, feet]","[deform, foot]"
3,@USER I want to get rid of the insulin too bec...,"[@USER, I, want, to, get, rid, of, the, insuli...","[O, O, O, O, I-E, I-E, I-E, I-E, I-E, O, O, O,...",stress,"get,rid,of,the,insulin",,,[stress],[stress],"[get, rid, insulin]","[get, rid, insulin]"
4,@USER I want to get rid of the insulin too bec...,"[@USER, I, want, to, get, rid, of, the, insuli...","[O, O, O, O, I-E, I-E, I-E, I-E, I-E, O, O, O,...",stress,diet,,,[stress],[stress],[diet],[diet]


In [14]:
manual_clusters = pd.read_excel("/Users/adrianahne/workspace/PhD/causality/Causal-associations-diabetes-twitter/data/Causes_effects_clusters.xlsx")
manual_clusters = manual_clusters[["Parent name", "Cluster name", "Synonyms"]].dropna(subset=["Cluster name"])
manual_clusters.reset_index(drop=True, inplace=True)


manual_clusters["cluster_name_prep"], manual_clusters["cluster_name_lemma"] = zip(*manual_clusters["Cluster name"].map(lambda x: lemmatize(x, mode="clusters")))
manual_clusters["synonym_prep"], manual_clusters["synonym_lemma"] = zip(*manual_clusters["Synonyms"].map(lambda x: lemmatize(x, mode="cluster_synonyms")))


manual_clusters.head(10)

Unnamed: 0,Parent name,Cluster name,Synonyms,cluster_name_prep,cluster_name_lemma,synonym_prep,synonym_lemma
0,Diabetes,diabetes,"diabetic, #diabetic, #diabetes, diabetes melli...",[diabetes],[diabetes],"[[diabetic], [#diabetic], [#diabetes], [diabet...","[[diabetic], [#diabetic], [#diabetes], [diabet..."
1,Diabetes,reverse diabetes,,"[reverse, diabetes]","[reverse, diabete]",,
2,Diabetes,T1D,"type 1 diabetes, type 1, #type1, #type1diabete...",[T1D],[t1d],"[[type, 1, diabetes], [type, 1], [#type1], [#t...","[[type, 1, diabetes], [type, 1], [#type1], [#t..."
3,Diabetes,T2D,"type 2 diabetes, type 2, #type, #type2diabetes...",[T2D],[T2D],"[[type, 2, diabetes], [type, 2], [#type], [#ty...","[[type, 2, diabetes], [type, 2], [#type], [#ty..."
4,Diabetes,diagnosis,,[diagnosis],[diagnosis],,
5,Diabetes,management,"control diabetes, uncontrol",[management],[management],"[[control, diabetes], [uncontrol]]","[[control, diabetes], [uncontrol]]"
6,Insulin,insulin,"insulin hormone, supplies",[insulin],[insulin],"[[insulin, hormone], [supplies]]","[[insulin, hormone], [supply]]"
7,Insulin,rationing insulin,"shortage insulin, denying insulin, lack insuli...","[rationing, insulin]","[ration, insulin]","[[shortage, insulin], [denying, insulin], [lac...","[[shortage, insulin], [deny, insulin], [lack, ..."
8,Insulin,unable to afford insulin,"can't afford insulin, no access to affordable ...","[unable, afford, insulin]","[unable, afford, insulin]","[[can't, afford, insulin], [access, affordable...","[[can't, afford, insulin], [access, affordable..."
9,Insulin,affordable insulin,afford insulin,"[affordable, insulin]","[affordable, insulin]","[[afford, insulin]]","[[afford, insulin]]"


In [15]:
######### WITH LEMMATIZATION => Create clusters #############

import warnings
warnings.filterwarnings("ignore") 


def calc_cosine_sim(causeOrEffect, causeOrEffect_lemma, cluster_name, cluster_name_lemma):
    """ Test several configurations (lowercased, without 'diabetes') to achieve highest cosine similarity """
    #if causeOrEffect[0] == "stressed" and cluster_name[0] == "stress":
    #    print("\n\tA causeOrEffect:", causeOrEffect, " | causeOrEffect_lemma:", causeOrEffect_lemma, "  | cluster_name:", cluster_name, "  | cluster_name_lemma:", cluster_name_lemma)
    
     
    #s1 = time.time()
    #causeOrEffect_lemma = [token.string.strip() if token.string.startswith("#") else token.lemma_.strip() for token in nlp(causeOrEffect)]
    #global t1
    #t1 += time.time() - s1
    #print("\tcauseOrEffect_lemma:", causeOrEffect_lemma)
    
    s2 = time.time()
    causeOrEffect_vector = np.array([model[word] for word in causeOrEffect]).mean(axis=0).reshape(1, -1)  # vector of identified cause
    global t2
    t2 += time.time() - s2
    #causeOrEffect_lowerCase_vector = np.array([model[word.lower()] for word in causeOrEffect]).mean(axis=0).reshape(1, -1)  # vector of identified cause
    s3 = time.time()
    causeOrEffect_lemma_vector = np.array([model[word] for word in causeOrEffect_lemma]).mean(axis=0).reshape(1, -1)  # vector of identified cause
    global t3
    t3 += time.time() - s3
    
    s4 = time.time()
    causeOrEffect_withoutDiabetes_vector = False
    if len(causeOrEffect) > 1:
        if "diabetic" in causeOrEffect:
            causeOrEffect.remove("diabetic")                
        if "diabetes" in causeOrEffect:
            causeOrEffect.remove("diabetes")
        causeOrEffect_withoutDiabetes_vector = np.array([model[word] for word in causeOrEffect]).mean(axis=0).reshape(1, -1)  # vector of identified cause
        #causeOrEffect_withoutDiabetes_lowerCase_vector = np.array([model[word.lower()] for word in causeOrEffect]).mean(axis=0).reshape(1, -1)  # vector of identified cause
        causeOrEffect_withoutDiabetes_lemma_vector = np.array([model[word] for word in causeOrEffect_lemma]).mean(axis=0).reshape(1, -1)  # vector of identified cause
    global t4
    t4 += time.time() - s4

    #s5 = time.time()
    #cluster_name_lemma = [token.string.strip() if token.string.startswith("#") else token.lemma_.strip() for token in nlp(cluster_name)]
    #global t5
    #t5 += time.time() - s5
    #print("\tcluster_name_lemma:", cluster_name_lemma)
    s6 = time.time()
    cluster_name_vector = np.array([model[word] for word in cluster_name]).mean(axis=0).reshape(1, -1)  # vector of identified cause
    global t6
    t6 += time.time() - s6
    #cluster_name_lower_vector = np.array([model[word.lower()] for word in cluster_name]).mean(axis=0).reshape(1, -1)  # vector of identified cause
    s7 = time.time()
    cluster_name_lemma_vector = np.array([model[word] for word in cluster_name_lemma]).mean(axis=0).reshape(1, -1)  # vector of identified cause
    global t7
    t7 += time.time() - s7
    
    ll = []        
    
    s8 = time.time()
    sim = 1 - cosine(causeOrEffect_vector, cluster_name_vector)
    #if causeOrEffect[0] == "stressed" and cluster_name[0] == "stress":
    #    print("\t\tB sim({}, {}) = ".format(causeOrEffect, cluster_name), sim)
    global t8
    t8 += time.time() - s8
    ll.append( (sim, causeOrEffect, 0) )
    
    #sim = cosine_similarity(causeOrEffect_lowerCase_vector, cluster_name_lower_vector)[0][0] # lowercase
    #ll.append( (sim, causeOrEffect) )
    
    sim = 1 - cosine(causeOrEffect_lemma_vector, cluster_name_lemma_vector)# lemmatization
    ll.append( (sim, causeOrEffect_lemma, 1) )
    #if causeOrEffect[0] == "stressed" and cluster_name[0] == "stress":
    #    print("\t\tC sim({}, {}) = ".format(causeOrEffect_lemma, cluster_name_lemma), sim)

    
    if not isinstance(causeOrEffect_withoutDiabetes_vector, bool):
        sim = 1 - cosine(causeOrEffect_withoutDiabetes_vector, cluster_name_vector)
        ll.append( (sim, causeOrEffect, 2) )      
    #    if causeOrEffect[0] == "stressed" and cluster_name[0] == "stress":
    #        print("\t\tD wo sim({}, {}) = ".format(causeOrEffect, cluster_name), sim)

        #sim = cosine_similarity(causeOrEffect_withoutDiabetes_lowerCase_vector, cluster_name_lower_vector)[0][0]
        #ll.append( (sim, causeOrEffect) )     
        
        sim = 1 - cosine(causeOrEffect_withoutDiabetes_lemma_vector, cluster_name_lemma_vector)
        ll.append( (sim, causeOrEffect_lemma, 3) )            
    #    if causeOrEffect[0] == "stressed" and cluster_name[0] == "stress":
    #        print("\t\tE wo sim({}, {}) = ".format(causeOrEffect_lemma, cluster_name_lemma), sim)

    
    s9 = time.time()

    bestSim, bestCauseOrEffect, bestType = max(ll, key=lambda item:item[0])
    #if causeOrEffect[0] == "stressed" and cluster_name[0] == "stress":
    #    print("\t\tF BEST: ", bestCauseOrEffect, bestSim)
    global t9
    t9 += time.time() - s9
        
        
    if bestType == 0:
        global cc
        cc += 1
    elif bestType == 1:
        global cc_lemma
        cc_lemma += 1
    elif bestType == 2:
        global cc_wo_diab 
        cc_wo_diab += 1
    elif bestType == 3: 
        global cc_wo_diab_lemma
        cc_wo_diab_lemma += 1
        
    return bestSim, causeOrEffect#bestCauseOrEffect


def find_closest_cluster(cause_or_effect, cause_or_effect_prep, cause_or_effect_lemmatized, clusters, t1=None):
#causes = cause_or_effect.strip().split(";")
#causes_lemma = cause_or_effect_lemmatized.strip().split(";")
#for cause, cause_lemma in zip(causes, causes_lemma):
    similarities = []
#    filtered_cause = preprocess(cause.strip().split(","))
    if cause_or_effect_prep: # if non-empty
        for i, row in clusters.iterrows(): # loop over all clusters
            #try:
                if not (isinstance(row["Synonyms"], float) and np.isnan(row["Synonyms"]) ): # if there are synonyms
#                    for synonym in row["Synonyms"].split(","):
                    for synonym, synonym_lemma in zip(row["synonym_prep"], row["synonym_lemma"]):
                        #filtered_synonym = preprocess(synonym.strip().split(" "))
                        sim, causeOrEffect = calc_cosine_sim(cause_or_effect_prep, cause_or_effect_lemmatized, synonym, synonym_lemma)
                        similarities.append((causeOrEffect, synonym, row['Cluster name'], sim))
                      
    
                #filtered_cluster_name = preprocess(row["Cluster name"].strip().split(" "))
                sim, causeOrEffect = calc_cosine_sim(cause_or_effect_prep, cause_or_effect_lemmatized, row["cluster_name_prep"], row["cluster_name_lemma"])
                similarities.append((causeOrEffect, row["cluster_name_prep"], row['Cluster name'], sim))
                #if causeOrEffect[0] == "stressed" and row["cluster_name_prep"][0] == "stress":
                #    print("\t\t Y sim:", sim, "causeOrEffect:", causeOrEffect)
            #except Exception:
            #    print("\nERROR:")
            #    print("filtered_cause:", filtered_cause, "  | filtered_synonym: ", filtered_synonym)
            #    print("cause_or_effect:", cause_or_effect, "  | cause:", cause)


        bestCauseOrEffect, bestClusterSynonym, bestClusterName, bestSim = sorted(similarities, key=lambda tup: tup[3], reverse=True)[0] # take only highest sim
        #if causeOrEffect[0] == "stressed":

        #    for cause, synonym, cluster_name, simil in sorted(similarities, key=lambda tup: tup[3], reverse=True)[0:3]:
        #        print("\tsim({}, {}) = ".format(cause, synonym), simil, "of cluster name:", cluster_name)
        #    print("G RETURN: bestCauseOrEffect:", bestCauseOrEffect, " | bestClusterSynonym:", bestClusterSynonym, " | bestClusterName:", bestClusterName, " | bestSim:", bestSim)
        return bestCauseOrEffect, bestClusterSynonym, bestClusterName, bestSim
    else:
        return "", "", "", -1.0


import time


cc = 0
cc_wo_diab = 0
cc_lemma= 0
cc_wo_diab_lemma = 0
t1 = 0
t2 = 0
t3 = 0
t4 = 0
t5 = 0
t6 = 0
t7 = 0
t8 = 0
t9 = 0


start = time.time()
df_new["cluster_cause"] = ""
df_new["cluster_effect"] = ""
for i, row in df_new.iterrows():
    if i % 1000 == 0:
        print("\n\n ############# i:", i, "(", time.time()-start, "s)", " ##########\n\n")
    cause, cluster_synonym_cause, cluster_name_cause, sim_cause = find_closest_cluster(row["causes"], row["causes_prep"], row["causes_lemma"], manual_clusters)
    #print("\tcause:", cause, " | cluster_name_cause:", cluster_name_cause, " | sim_cause:", sim_cause)
    if sim_cause > 0.0:
        if sim_cause > 0.55: # associate found cluster to the cause; second condition if sim_cause == -1 returned
            df_new.loc[i, "cluster_cause"] = cluster_name_cause
        else: # create new cluster
            new_cluster_prep, new_cluster_lemma = lemmatize(" ".join(cause), mode="clusters")
            manual_clusters.loc[len(manual_clusters.index)] = ["", " ".join(cause), np.nan, new_cluster_prep, new_cluster_lemma, np.nan, np.nan] # Parent name, Cluster name, Synonym
            print("New clusters created (cause):", cause, "  | sim:", sim_cause, " | cluster_name:", cluster_name_cause)
            print("\t\tcause:", cause, " | cluster_name_cause:", cluster_name_cause, " | sim_cause:", sim_cause)

    effect, cluster_synonym_effect, cluster_name_effect, sim_effect = find_closest_cluster(row["effects"], row["effects_prep"], row["effects_lemma"], manual_clusters)    
    #print("\teffect:", effect, " | cluster_name_effect:", cluster_name_effect, " | sim_effect:", sim_effect)
    if sim_effect > 0.0:
        if sim_effect > 0.55: # associate found cluster to the cause; second condition if sim_cause == -1 returned
            df_new.loc[i, "cluster_effect"] = cluster_name_effect
        else: # create new cluster
            new_cluster_prep, new_cluster_lemma = lemmatize(" ".join(effect), mode="clusters")
            manual_clusters.loc[len(manual_clusters.index)] = ["", " ".join(effect), np.nan, new_cluster_prep, new_cluster_lemma, np.nan, np.nan] # Parent name, Cluster name, Synonym
            print("New clusters created (effect):", effect, "  | sim:", sim_effect, " | cluster_name:", cluster_name_effect)    
            print("\t\teffect:", effect, " | cluster_name_effect:", cluster_name_effect, " | sim_effect:", sim_effect)

#  new node for < 0.55
end = time.time()
print("Time:", end-start)
print("cc:", cc, "cc_wo_diab:", cc_wo_diab, "cc_lemma:", cc_lemma, "cc_wo_diab_lemma:", cc_wo_diab_lemma)
print("t1:", t1)
print("t2:", t2)
print("t3:", t3)
print("t4:", t4)
print("t5:", t5)
print("t6:", t6)
print("t7:", t7)
print("t8:", t8)
print("t9:", t9)
# 1000 samples : 6400s
# 300 samples: 865s and 802s in calc cosine function




 ############# i: 0 ( 0.012199878692626953 s)  ##########


New clusters created (effect): ['collect', 'urine', 'sample']   | sim: 0.522433876991272  | cluster_name: insulin pump
		effect: ['collect', 'urine', 'sample']  | cluster_name_effect: insulin pump  | sim_effect: 0.522433876991272
New clusters created (effect): ['stumbling', 'blocks']   | sim: 0.42115408182144165  | cluster_name: insulin spike
		effect: ['stumbling', 'blocks']  | cluster_name_effect: insulin spike  | sim_effect: 0.42115408182144165
New clusters created (effect): ['seizures']   | sim: 0.43465304374694824  | cluster_name: fear
		effect: ['seizures']  | cluster_name_effect: fear  | sim_effect: 0.43465304374694824
New clusters created (effect): ['survive', 'digestion']   | sim: 0.5440065264701843  | cluster_name: fatigue
		effect: ['survive', 'digestion']  | cluster_name_effect: fatigue  | sim_effect: 0.5440065264701843
New clusters created (cause): ['McDonald']   | sim: 0.3946429491043091  | cluster_name: nutrit

New clusters created (effect): ['ass', 'alive']   | sim: 0.5429733991622925  | cluster_name: home
		effect: ['ass', 'alive']  | cluster_name_effect: home  | sim_effect: 0.5429733991622925
New clusters created (cause): ['plummets', '#t1d', '#getthejuiceboxes']   | sim: 0.5226213335990906  | cluster_name: T1D
		cause: ['plummets', '#t1d', '#getthejuiceboxes']  | cluster_name_cause: T1D  | sim_cause: 0.5226213335990906
New clusters created (effect): ['bank', 'account']   | sim: 0.37856730818748474  | cluster_name: insulin prices
		effect: ['bank', 'account']  | cluster_name_effect: insulin prices  | sim_effect: 0.37856730818748474
New clusters created (effect): ['ambulance']   | sim: 0.48328888416290283  | cluster_name: coma
		effect: ['ambulance']  | cluster_name_effect: coma  | sim_effect: 0.48328888416290283
New clusters created (effect): ['smelling', 'fruity']   | sim: 0.5414862036705017  | cluster_name: nutrition
		effect: ['smelling', 'fruity']  | cluster_name_effect: nutrition  | s

New clusters created (effect): ['chill']   | sim: 0.49408355355262756  | cluster_name: overweight
		effect: ['chill']  | cluster_name_effect: overweight  | sim_effect: 0.49408355355262756
New clusters created (effect): ['envious']   | sim: 0.5011279582977295  | cluster_name: fear
		effect: ['envious']  | cluster_name_effect: fear  | sim_effect: 0.5011279582977295
New clusters created (effect): ['Violence', 'begets', 'violence']   | sim: 0.4402768611907959  | cluster_name: theological
		effect: ['Violence', 'begets', 'violence']  | cluster_name_effect: theological  | sim_effect: 0.4402768611907959
New clusters created (effect): ['stop', 'talking']   | sim: 0.5017489790916443  | cluster_name: insulin pump
		effect: ['stop', 'talking']  | cluster_name_effect: insulin pump  | sim_effect: 0.5017489790916443
New clusters created (effect): ['salad']   | sim: 0.5483062267303467  | cluster_name: sugar
		effect: ['salad']  | cluster_name_effect: sugar  | sim_effect: 0.5483062267303467
New cluste

New clusters created (effect): ['rely', 'foodbanks']   | sim: 0.44078612327575684  | cluster_name: diet
		effect: ['rely', 'foodbanks']  | cluster_name_effect: diet  | sim_effect: 0.44078612327575684
New clusters created (cause): ['dietary', 'needs']   | sim: 0.518555760383606  | cluster_name: taking necessary precautions
		cause: ['dietary', 'needs']  | cluster_name_cause: taking necessary precautions  | sim_cause: 0.518555760383606
New clusters created (effect): ['drinking']   | sim: 0.5325705409049988  | cluster_name: sugar
		effect: ['drinking']  | cluster_name_effect: sugar  | sim_effect: 0.5325705409049988
New clusters created (effect): ['fending', 'bankruptcy']   | sim: 0.5181324481964111  | cluster_name: rationing insulin
		effect: ['fending', 'bankruptcy']  | cluster_name_effect: rationing insulin  | sim_effect: 0.5181324481964111
New clusters created (effect): ['shortness', 'breath']   | sim: 0.5144016742706299  | cluster_name: smelling fruity
		effect: ['shortness', 'breath'

New clusters created (effect): ['hurts']   | sim: 0.46915125846862793  | cluster_name: sadness
		effect: ['hurts']  | cluster_name_effect: sadness  | sim_effect: 0.46915125846862793
New clusters created (effect): ['bulging', 'disc']   | sim: 0.45961132645606995  | cluster_name: marching band head
		effect: ['bulging', 'disc']  | cluster_name_effect: marching band head  | sim_effect: 0.45961132645606995
New clusters created (cause): ['weak']   | sim: 0.5004667639732361  | cluster_name: feel bad
		cause: ['weak']  | cluster_name_cause: feel bad  | sim_cause: 0.5004667639732361
New clusters created (effect): ['uneasy']   | sim: 0.3873961269855499  | cluster_name: fear
		effect: ['uneasy']  | cluster_name_effect: fear  | sim_effect: 0.3873961269855499
New clusters created (effect): ['talcum', 'powder']   | sim: 0.44916918873786926  | cluster_name: tomé un vaso de coca cola
		effect: ['talcum', 'powder']  | cluster_name_effect: tomé un vaso de coca cola  | sim_effect: 0.44916918873786926
Ne

New clusters created (effect): ['crave']   | sim: 0.4994548559188843  | cluster_name: sugar
		effect: ['crave']  | cluster_name_effect: sugar  | sim_effect: 0.4994548559188843
New clusters created (effect): ['STRESSED']   | sim: 0.5470821857452393  | cluster_name: depression
		effect: ['STRESSED']  | cluster_name_effect: depression  | sim_effect: 0.5470821857452393
New clusters created (cause): ['Lazy', 'tude']   | sim: 0.5333510637283325  | cluster_name: anger
		cause: ['Lazy', 'tude']  | cluster_name_cause: anger  | sim_cause: 0.5333510637283325
New clusters created (effect): ['comfort', 'foods']   | sim: 0.5452757477760315  | cluster_name: eating healthy
		effect: ['comfort', 'foods']  | cluster_name_effect: eating healthy  | sim_effect: 0.5452757477760315
New clusters created (effect): ['enhanced', 'empathy']   | sim: 0.4331188499927521  | cluster_name: access insulin
		effect: ['enhanced', 'empathy']  | cluster_name_effect: access insulin  | sim_effect: 0.4331188499927521
New clus

New clusters created (effect): ['appetite']   | sim: 0.4382556080818176  | cluster_name: anabolic effect
		effect: ['appetite']  | cluster_name_effect: anabolic effect  | sim_effect: 0.4382556080818176


 ############# i: 4000 ( 1978.3107497692108 s)  ##########


New clusters created (effect): ['grumpy']   | sim: 0.5268161296844482  | cluster_name: sick
		effect: ['grumpy']  | cluster_name_effect: sick  | sim_effect: 0.5268161296844482
New clusters created (effect): ['longer', 'treats']   | sim: 0.5094048380851746  | cluster_name: access insulin
		effect: ['longer', 'treats']  | cluster_name_effect: access insulin  | sim_effect: 0.5094048380851746
New clusters created (effect): ['slightly', 'concerned']   | sim: 0.5470079779624939  | cluster_name: fear
		effect: ['slightly', 'concerned']  | cluster_name_effect: fear  | sim_effect: 0.5470079779624939
New clusters created (cause): ['deli', 'grocery', 'store']   | sim: 0.49456650018692017  | cluster_name: unable to afford insulin
		cause

New clusters created (cause): ['#BTSisNotYourAverageBoyBand', 'BTS']   | sim: 0.40375789999961853  | cluster_name: dIaBeTiC sHOcK
		cause: ['#BTSisNotYourAverageBoyBand', 'BTS']  | cluster_name_cause: dIaBeTiC sHOcK  | sim_cause: 0.40375789999961853
New clusters created (cause): ['moving', 'schools']   | sim: 0.5479943752288818  | cluster_name: home
		cause: ['moving', 'schools']  | cluster_name_cause: home  | sim_cause: 0.5479943752288818
New clusters created (effect): ['win', 'year']   | sim: 0.5005762577056885  | cluster_name: neuropathy
		effect: ['win', 'year']  | cluster_name_effect: neuropathy  | sim_effect: 0.5005762577056885
New clusters created (effect): ['kind', 'emotion']   | sim: 0.525687038898468  | cluster_name: joy
		effect: ['kind', 'emotion']  | cluster_name_effect: joy  | sim_effect: 0.525687038898468
New clusters created (effect): ['call', 'CPS']   | sim: 0.40593430399894714  | cluster_name: moving schools
		effect: ['call', 'CPS']  | cluster_name_effect: moving sch

New clusters created (effect): ['Ciri', 'fighting', 'action']   | sim: 0.5308493971824646  | cluster_name: marching band head
		effect: ['Ciri', 'fighting', 'action']  | cluster_name_effect: marching band head  | sim_effect: 0.5308493971824646
New clusters created (cause): ['GoFundMe']   | sim: 0.35695213079452515  | cluster_name: insulin prices
		cause: ['GoFundMe']  | cluster_name_cause: insulin prices  | sim_cause: 0.35695213079452515
New clusters created (cause): ['savings', 'card']   | sim: 0.5277783870697021  | cluster_name: death
		cause: ['savings', 'card']  | cluster_name_cause: death  | sim_cause: 0.5277783870697021
New clusters created (effect): ['hairy']   | sim: 0.44363874197006226  | cluster_name: hair fall
		effect: ['hairy']  | cluster_name_effect: hair fall  | sim_effect: 0.44363874197006226
New clusters created (effect): ['screwed']   | sim: 0.45708903670310974  | cluster_name: freaked
		effect: ['screwed']  | cluster_name_effect: freaked  | sim_effect: 0.457089036703

New clusters created (cause): ['run']   | sim: 0.4045315086841583  | cluster_name: physical activity
		cause: ['run']  | cluster_name_cause: physical activity  | sim_cause: 0.4045315086841583
New clusters created (effect): ['doctors', 'scoff']   | sim: 0.4918389320373535  | cluster_name: try leave
		effect: ['doctors', 'scoff']  | cluster_name_effect: try leave  | sim_effect: 0.4918389320373535
New clusters created (effect): ['endure', 'shitty', 'circumstances']   | sim: 0.5391743779182434  | cluster_name: taking necessary precautions
		effect: ['endure', 'shitty', 'circumstances']  | cluster_name_effect: taking necessary precautions  | sim_effect: 0.5391743779182434
New clusters created (cause): ['naughty']   | sim: 0.41144707798957825  | cluster_name: Safer Easier Less awkward
		cause: ['naughty']  | cluster_name_cause: Safer Easier Less awkward  | sim_cause: 0.41144707798957825
New clusters created (effect): ['refrain']   | sim: 0.43785685300827026  | cluster_name: stop talking
		ef

New clusters created (effect): ['paranoid']   | sim: 0.5477652549743652  | cluster_name: fear
		effect: ['paranoid']  | cluster_name_effect: fear  | sim_effect: 0.5477652549743652
New clusters created (effect): ['self', 'esteem']   | sim: 0.4634637236595154  | cluster_name: fear
		effect: ['self', 'esteem']  | cluster_name_effect: fear  | sim_effect: 0.4634637236595154


 ############# i: 9000 ( 5379.375747680664 s)  ##########


New clusters created (cause): ['market', 'demand']   | sim: 0.5124329924583435  | cluster_name: unable to afford insulin
		cause: ['market', 'demand']  | cluster_name_cause: unable to afford insulin  | sim_cause: 0.5124329924583435
New clusters created (cause): ['Video', 'Games']   | sim: 0.43989256024360657  | cluster_name: Safer Easier Less awkward
		cause: ['Video', 'Games']  | cluster_name_cause: Safer Easier Less awkward  | sim_cause: 0.43989256024360657
New clusters created (cause): ['viral', 'increase']   | sim: 0.4760858714580536  | cluster_name: lung 

New clusters created (cause): ['potato']   | sim: 0.5055489540100098  | cluster_name: salad
		cause: ['potato']  | cluster_name_cause: salad  | sim_cause: 0.5055489540100098
New clusters created (effect): ['fingersfall']   | sim: 0.49546149373054504  | cluster_name: insulin pump
		effect: ['fingersfall']  | cluster_name_effect: insulin pump  | sim_effect: 0.49546149373054504
New clusters created (effect): ['privilege', '#DiabetesAwarenessMonth']   | sim: 0.46392858028411865  | cluster_name: #Insulin4all #Diaversary
		effect: ['privilege', '#DiabetesAwarenessMonth']  | cluster_name_effect: #Insulin4all #Diaversary  | sim_effect: 0.46392858028411865
New clusters created (effect): ['selling', 'furniture']   | sim: 0.48471131920814514  | cluster_name: unable to afford insulin
		effect: ['selling', 'furniture']  | cluster_name_effect: unable to afford insulin  | sim_effect: 0.48471131920814514
New clusters created (effect): ['wakes']   | sim: 0.5397869944572449  | cluster_name: Insomnia
		e

New clusters created (cause): ['organ', 'transplants']   | sim: 0.5134154558181763  | cluster_name: kidney failure
		cause: ['organ', 'transplants']  | cluster_name_cause: kidney failure  | sim_cause: 0.5134154558181763
New clusters created (effect): ['boost']   | sim: 0.5109707713127136  | cluster_name: improved
		effect: ['boost']  | cluster_name_effect: improved  | sim_effect: 0.5109707713127136
New clusters created (effect): ['drip']   | sim: 0.4319843649864197  | cluster_name: OGTT
		effect: ['drip']  | cluster_name_effect: OGTT  | sim_effect: 0.4319843649864197
New clusters created (cause): ['Christmas', 'baking']   | sim: 0.5494600534439087  | cluster_name: nutrition
		cause: ['Christmas', 'baking']  | cluster_name_cause: nutrition  | sim_cause: 0.5494600534439087


 ############# i: 12000 ( 8502.572264909744 s)  ##########


New clusters created (effect): ['marked', 'PIP', 'assessment']   | sim: 0.4686141312122345  | cluster_name: enhanced empathy
		effect: ['marked', 'PIP', 'a

New clusters created (cause): ['gender', 'dysphoria']   | sim: 0.5132874250411987  | cluster_name: glucose guardian
		cause: ['gender', 'dysphoria']  | cluster_name_cause: glucose guardian  | sim_cause: 0.5132874250411987
New clusters created (effect): ['rash']   | sim: 0.4586455523967743  | cluster_name: lung infection
		effect: ['rash']  | cluster_name_effect: lung infection  | sim_effect: 0.4586455523967743
New clusters created (effect): ['MRSA', 'UTI']   | sim: 0.4996260702610016  | cluster_name: lung infection
		effect: ['MRSA', 'UTI']  | cluster_name_effect: lung infection  | sim_effect: 0.4996260702610016
New clusters created (cause): ['keystone', 'pipeline']   | sim: 0.4193751811981201  | cluster_name: insulin spike
		cause: ['keystone', 'pipeline']  | cluster_name_cause: insulin spike  | sim_cause: 0.4193751811981201
New clusters created (effect): ['sting']   | sim: 0.480792760848999  | cluster_name: insulin pump
		effect: ['sting']  | cluster_name_effect: insulin pump  | sim_

New clusters created (cause): ['vaccine']   | sim: 0.41035836935043335  | cluster_name: covid
		cause: ['vaccine']  | cluster_name_cause: covid  | sim_cause: 0.41035836935043335
New clusters created (effect): ['site', 'rips']   | sim: 0.4639125466346741  | cluster_name: freaked
		effect: ['site', 'rips']  | cluster_name_effect: freaked  | sim_effect: 0.4639125466346741
New clusters created (effect): ['caloric', 'deficits']   | sim: 0.475799560546875  | cluster_name: nutrition
		effect: ['caloric', 'deficits']  | cluster_name_effect: nutrition  | sim_effect: 0.475799560546875
New clusters created (effect): ['primary', 'source', 'fuel']   | sim: 0.5080434083938599  | cluster_name: burning ketones
		effect: ['primary', 'source', 'fuel']  | cluster_name_effect: burning ketones  | sim_effect: 0.5080434083938599
New clusters created (effect): ['BEER']   | sim: 0.4222472906112671  | cluster_name: DAIBETIC KETO ACIDS high
		effect: ['BEER']  | cluster_name_effect: DAIBETIC KETO ACIDS high  | s

New clusters created (effect): ['entire', 'meal']   | sim: 0.549310564994812  | cluster_name: NEEDED meal
		effect: ['entire', 'meal']  | cluster_name_effect: NEEDED meal  | sim_effect: 0.549310564994812
New clusters created (cause): ['hereditary']   | sim: 0.42829790711402893  | cluster_name: genetic
		cause: ['hereditary']  | cluster_name_cause: genetic  | sim_cause: 0.42829790711402893
New clusters created (effect): ['skipping', 'inhalers']   | sim: 0.5082805752754211  | cluster_name: unable to afford insulin
		effect: ['skipping', 'inhalers']  | cluster_name_effect: unable to afford insulin  | sim_effect: 0.5082805752754211
New clusters created (effect): ['breathe']   | sim: 0.500429093837738  | cluster_name: shortness breath
		effect: ['breathe']  | cluster_name_effect: shortness breath  | sim_effect: 0.500429093837738
New clusters created (effect): ['psychologically', 'fragile']   | sim: 0.5427665710449219  | cluster_name: psychological grief
		effect: ['psychologically', 'fragil

New clusters created (effect): ['cheer', '#buyingnudes']   | sim: 0.43942371010780334  | cluster_name: rationing insulin
		effect: ['cheer', '#buyingnudes']  | cluster_name_effect: rationing insulin  | sim_effect: 0.43942371010780334
New clusters created (cause): ['surplus', 'affection']   | sim: 0.4698772430419922  | cluster_name: anabolic effect
		cause: ['surplus', 'affection']  | cluster_name_cause: anabolic effect  | sim_cause: 0.4698772430419922
New clusters created (effect): ['teddy', 'bear']   | sim: 0.4625510275363922  | cluster_name: cotton balls
		effect: ['teddy', 'bear']  | cluster_name_effect: cotton balls  | sim_effect: 0.4625510275363922
New clusters created (effect): ['hosed']   | sim: 0.4994410574436188  | cluster_name: heavily enforced sanctions
		effect: ['hosed']  | cluster_name_effect: heavily enforced sanctions  | sim_effect: 0.4994410574436188
New clusters created (effect): ['mindful']   | sim: 0.44064104557037354  | cluster_name: eating healthy
		effect: ['mind

New clusters created (cause): ['voice', 'recognition', 'software']   | sim: 0.5113099813461304  | cluster_name: projecting image
		cause: ['voice', 'recognition', 'software']  | cluster_name_cause: projecting image  | sim_cause: 0.5113099813461304
New clusters created (effect): ['spells', 'dizziness']   | sim: 0.5285618305206299  | cluster_name: needles sensation
		effect: ['spells', 'dizziness']  | cluster_name_effect: needles sensation  | sim_effect: 0.5285618305206299
New clusters created (effect): ['blurring']   | sim: 0.5263522863388062  | cluster_name: disoriented slurred speech
		effect: ['blurring']  | cluster_name_effect: disoriented slurred speech  | sim_effect: 0.5263522863388062


 ############# i: 22000 ( 22283.07185101509 s)  ##########


New clusters created (effect): ['portion', 'counts']   | sim: 0.5448687076568604  | cluster_name: nutrition
		effect: ['portion', 'counts']  | cluster_name_effect: nutrition  | sim_effect: 0.5448687076568604
New clusters created (cause):

New clusters created (cause): ['almost']   | sim: 0.41271674633026123  | cluster_name: managed pint half
		cause: ['almost']  | cluster_name_cause: managed pint half  | sim_cause: 0.41271674633026123
New clusters created (effect): ['slowed']   | sim: 0.40054643154144287  | cluster_name: longer treats
		effect: ['slowed']  | cluster_name_effect: longer treats  | sim_effect: 0.40054643154144287
New clusters created (effect): ['pray']   | sim: 0.4742697477340698  | cluster_name: family
		effect: ['pray']  | cluster_name_effect: family  | sim_effect: 0.4742697477340698
New clusters created (cause): ['smoker']   | sim: 0.4866907596588135  | cluster_name: smoking
		cause: ['smoker']  | cluster_name_cause: smoking  | sim_cause: 0.4866907596588135
New clusters created (cause): ['toxic', 'masculinity']   | sim: 0.4758455455303192  | cluster_name: Excess everything
		cause: ['toxic', 'masculinity']  | cluster_name_cause: Excess everything  | sim_cause: 0.4758455455303192
New clusters created (ef

New clusters created (effect): ['restart', 'college', 'fund']   | sim: 0.5168526768684387  | cluster_name: moving schools
		effect: ['restart', 'college', 'fund']  | cluster_name_effect: moving schools  | sim_effect: 0.5168526768684387
New clusters created (effect): ['relieve', 'urself']   | sim: 0.5314421653747559  | cluster_name: constant pain
		effect: ['relieve', 'urself']  | cluster_name_effect: constant pain  | sim_effect: 0.5314421653747559
New clusters created (effect): ['greatest']   | sim: 0.45560967922210693  | cluster_name: joy
		effect: ['greatest']  | cluster_name_effect: joy  | sim_effect: 0.45560967922210693
New clusters created (effect): ['weird']   | sim: 0.4599660634994507  | cluster_name: feel bad
		effect: ['weird']  | cluster_name_effect: feel bad  | sim_effect: 0.4599660634994507
New clusters created (cause): ['theatre']   | sim: 0.40538811683654785  | cluster_name: butcher timing
		cause: ['theatre']  | cluster_name_cause: butcher timing  | sim_cause: 0.40538811



 ############# i: 29000 ( 33267.263135910034 s)  ##########


New clusters created (effect): ['bite', 'tongue']   | sim: 0.5070944428443909  | cluster_name: sick
		effect: ['bite', 'tongue']  | cluster_name_effect: sick  | sim_effect: 0.5070944428443909
New clusters created (effect): ['natural', 'selection']   | sim: 0.5168042182922363  | cluster_name: nightmare finding #sugargfree products
		effect: ['natural', 'selection']  | cluster_name_effect: nightmare finding #sugargfree products  | sim_effect: 0.5168042182922363
New clusters created (effect): ['trade', 'war', 'China']   | sim: 0.5161822438240051  | cluster_name: slave population
		effect: ['trade', 'war', 'China']  | cluster_name_effect: slave population  | sim_effect: 0.5161822438240051
New clusters created (effect): ['spotlight']   | sim: 0.3627627193927765  | cluster_name: campus shutting
		effect: ['spotlight']  | cluster_name_effect: campus shutting  | sim_effect: 0.3627627193927765
New clusters created (effect): ['grit'

New clusters created (effect): ['Peripheral', 'Neuropathy']   | sim: 0.4280976951122284  | cluster_name: neuropathy
		effect: ['Peripheral', 'Neuropathy']  | cluster_name_effect: neuropathy  | sim_effect: 0.4280976951122284
New clusters created (effect): ['quadruple', 'bypass']   | sim: 0.4990633726119995  | cluster_name: hospital
		effect: ['quadruple', 'bypass']  | cluster_name_effect: hospital  | sim_effect: 0.4990633726119995


 ############# i: 32000 ( 36842.88152074814 s)  ##########


New clusters created (effect): ['ring']   | sim: 0.4900592863559723  | cluster_name: medication
		effect: ['ring']  | cluster_name_effect: medication  | sim_effect: 0.4900592863559723
New clusters created (effect): ['remain', 'top', 'caring']   | sim: 0.5456619262695312  | cluster_name: others perceptions
		effect: ['remain', 'top', 'caring']  | cluster_name_effect: others perceptions  | sim_effect: 0.5456619262695312
New clusters created (effect): ['dip']   | sim: 0.42586958408355713  | cluster_na

New clusters created (cause): ['Spencer']   | sim: 0.42828109860420227  | cluster_name: Stephen King Novel
		cause: ['Spencer']  | cluster_name_cause: Stephen King Novel  | sim_cause: 0.42828109860420227
New clusters created (effect): ['TIRED']   | sim: 0.4414542615413666  | cluster_name: sick
		effect: ['TIRED']  | cluster_name_effect: sick  | sim_effect: 0.4414542615413666


 ############# i: 35000 ( 40509.341207027435 s)  ##########


New clusters created (cause): ['fitness']   | sim: 0.46986207365989685  | cluster_name: nutrition
		cause: ['fitness']  | cluster_name_cause: nutrition  | sim_cause: 0.46986207365989685
New clusters created (cause): ['outlet']   | sim: 0.40381497144699097  | cluster_name: deemed ‘ adult
		cause: ['outlet']  | cluster_name_cause: deemed ‘ adult  | sim_cause: 0.40381497144699097
New clusters created (effect): ['saved']   | sim: 0.4924960136413574  | cluster_name: putting anyone
		effect: ['saved']  | cluster_name_effect: putting anyone  | sim_effect: 0.4

New clusters created (effect): ['procedure']   | sim: 0.5001049041748047  | cluster_name: hospital
		effect: ['procedure']  | cluster_name_effect: hospital  | sim_effect: 0.5001049041748047
New clusters created (effect): ['indulge']   | sim: 0.45509323477745056  | cluster_name: nutrition
		effect: ['indulge']  | cluster_name_effect: nutrition  | sim_effect: 0.45509323477745056
New clusters created (effect): ['hungry', 'outside']   | sim: 0.5274745225906372  | cluster_name: sick
		effect: ['hungry', 'outside']  | cluster_name_effect: sick  | sim_effect: 0.5274745225906372
New clusters created (effect): ['consider', 'exact', 'calorie']   | sim: 0.5390933752059937  | cluster_name: nutrition
		effect: ['consider', 'exact', 'calorie']  | cluster_name_effect: nutrition  | sim_effect: 0.5390933752059937


 ############# i: 38000 ( 44362.51321578026 s)  ##########


New clusters created (cause): ['criminally', 'overpriced']   | sim: 0.5018311738967896  | cluster_name: anger
		cause: ['criminal

New clusters created (cause): ['deductible', 'reset']   | sim: 0.5279511213302612  | cluster_name: insurance
		cause: ['deductible', 'reset']  | cluster_name_cause: insurance  | sim_cause: 0.5279511213302612
New clusters created (effect): ['cancerous', 'lump', 'side']   | sim: 0.5278169512748718  | cluster_name: breast removed
		effect: ['cancerous', 'lump', 'side']  | cluster_name_effect: breast removed  | sim_effect: 0.5278169512748718
New clusters created (effect): ['allegedly', 'REUSE', 'SYRINGES']   | sim: 0.5398638844490051  | cluster_name: needles sensation
		effect: ['allegedly', 'REUSE', 'SYRINGES']  | cluster_name_effect: needles sensation  | sim_effect: 0.5398638844490051
New clusters created (cause): ['testosterone', 'hush']   | sim: 0.45566558837890625  | cluster_name: steroid dependant
		cause: ['testosterone', 'hush']  | cluster_name_cause: steroid dependant  | sim_cause: 0.45566558837890625


 ############# i: 41000 ( 48329.1665520668 s)  ##########


New clusters creat

New clusters created (effect): ['arterial', 'scans']   | sim: 0.5206954479217529  | cluster_name: googled urinary frequency
		effect: ['arterial', 'scans']  | cluster_name_effect: googled urinary frequency  | sim_effect: 0.5206954479217529
New clusters created (effect): ['worrying', 'COVID']   | sim: 0.5384017825126648  | cluster_name: triggering ED thinking
		effect: ['worrying', 'COVID']  | cluster_name_effect: triggering ED thinking  | sim_effect: 0.5384017825126648
New clusters created (effect): ['confirm', 'area']   | sim: 0.4898506700992584  | cluster_name: French data physician density
		effect: ['confirm', 'area']  | cluster_name_effect: French data physician density  | sim_effect: 0.4898506700992584
New clusters created (effect): ['bloating']   | sim: 0.54928058385849  | cluster_name: sick
		effect: ['bloating']  | cluster_name_effect: sick  | sim_effect: 0.54928058385849
New clusters created (cause): ['named', 'Lamar']   | sim: 0.529701292514801  | cluster_name: prediabetes
	

New clusters created (effect): ['colostrum', 'harvesting']   | sim: 0.47710078954696655  | cluster_name: dental extraction
		effect: ['colostrum', 'harvesting']  | cluster_name_effect: dental extraction  | sim_effect: 0.47710078954696655
New clusters created (effect): ['shed']   | sim: 0.49121373891830444  | cluster_name: overweight
		effect: ['shed']  | cluster_name_effect: overweight  | sim_effect: 0.49121373891830444
New clusters created (effect): ['blue']   | sim: 0.4630165696144104  | cluster_name: wearing bigger size shoe
		effect: ['blue']  | cluster_name_effect: wearing bigger size shoe  | sim_effect: 0.4630165696144104
New clusters created (effect): ['premature', 'aging']   | sim: 0.5059956908226013  | cluster_name: heart attack
		effect: ['premature', 'aging']  | cluster_name_effect: heart attack  | sim_effect: 0.5059956908226013
New clusters created (effect): ['homeopathic', 'aid']   | sim: 0.511407732963562  | cluster_name: took Kool Aid
		effect: ['homeopathic', 'aid']  | 

New clusters created (effect): ['EXPIRED', 'INSULIN']   | sim: 0.5199784636497498  | cluster_name: MEDICATION
		effect: ['EXPIRED', 'INSULIN']  | cluster_name_effect: MEDICATION  | sim_effect: 0.5199784636497498
New clusters created (cause): ['Hindu', 'temple']   | sim: 0.3889354169368744  | cluster_name: bathroom damp ceiling
		cause: ['Hindu', 'temple']  | cluster_name_cause: bathroom damp ceiling  | sim_cause: 0.3889354169368744
New clusters created (effect): ['au', 'fait']   | sim: 0.5093731880187988  | cluster_name: tomé un vaso de coca cola
		effect: ['au', 'fait']  | cluster_name_effect: tomé un vaso de coca cola  | sim_effect: 0.5093731880187988
New clusters created (effect): ['SURVIVE']   | sim: 0.5228509306907654  | cluster_name: STAY ALIVE
		effect: ['SURVIVE']  | cluster_name_effect: STAY ALIVE  | sim_effect: 0.5228509306907654
New clusters created (effect): ['warranty']   | sim: 0.4836924374103546  | cluster_name: continuous glucose monitor
		effect: ['warranty']  | cluste

New clusters created (effect): ['damaging']   | sim: 0.5016979575157166  | cluster_name: dangerous
		effect: ['damaging']  | cluster_name_effect: dangerous  | sim_effect: 0.5016979575157166
New clusters created (effect): ['realities']   | sim: 0.5376580953598022  | cluster_name: others perceptions
		effect: ['realities']  | cluster_name_effect: others perceptions  | sim_effect: 0.5376580953598022
New clusters created (cause): ['Taco', 'Bell']   | sim: 0.41216400265693665  | cluster_name: McDonald
		cause: ['Taco', 'Bell']  | cluster_name_cause: McDonald  | sim_cause: 0.41216400265693665
New clusters created (cause): ['history']   | sim: 0.5215290784835815  | cluster_name: family
		cause: ['history']  | cluster_name_cause: family  | sim_cause: 0.5215290784835815
New clusters created (cause): ['20mg', 'prednisone']   | sim: 0.5009660720825195  | cluster_name: Metformin HCL
		cause: ['20mg', 'prednisone']  | cluster_name_cause: Metformin HCL  | sim_cause: 0.5009660720825195
New clusters c

New clusters created (effect): ['paralysis']   | sim: 0.46070072054862976  | cluster_name: slaughtering asthma
		effect: ['paralysis']  | cluster_name_effect: slaughtering asthma  | sim_effect: 0.46070072054862976


 ############# i: 57000 ( 73315.62573480606 s)  ##########


New clusters created (cause): ['finals']   | sim: 0.4594026207923889  | cluster_name: marching band head
		cause: ['finals']  | cluster_name_cause: marching band head  | sim_cause: 0.4594026207923889
New clusters created (cause): ['HC', 'abuses']   | sim: 0.5154346227645874  | cluster_name: abuser
		cause: ['HC', 'abuses']  | cluster_name_cause: abuser  | sim_cause: 0.5154346227645874
New clusters created (cause): ['marks', '#AprokoDoctorAsks']   | sim: 0.4345439374446869  | cluster_name: birthday
		cause: ['marks', '#AprokoDoctorAsks']  | cluster_name_cause: birthday  | sim_cause: 0.4345439374446869
New clusters created (cause): ['forget']   | sim: 0.45195481181144714  | cluster_name: never speak
		cause: ['forge

New clusters created (effect): ['cysts', 'ovaries']   | sim: 0.48401379585266113  | cluster_name: googled urinary frequency
		effect: ['cysts', 'ovaries']  | cluster_name_effect: googled urinary frequency  | sim_effect: 0.48401379585266113
New clusters created (effect): ['eyeliner', 'lasting']   | sim: 0.5030367374420166  | cluster_name: scalping consoles
		effect: ['eyeliner', 'lasting']  | cluster_name_effect: scalping consoles  | sim_effect: 0.5030367374420166
New clusters created (effect): ['nuts', 'snipped']   | sim: 0.5014328360557556  | cluster_name: fasting
		effect: ['nuts', 'snipped']  | cluster_name_effect: fasting  | sim_effect: 0.5014328360557556
New clusters created (effect): ['unfit']   | sim: 0.3676578104496002  | cluster_name: overweight
		effect: ['unfit']  | cluster_name_effect: overweight  | sim_effect: 0.3676578104496002
New clusters created (cause): ['fiber', 'mesh']   | sim: 0.5148770809173584  | cluster_name: nutrition
		cause: ['fiber', 'mesh']  | cluster_name_

New clusters created (effect): ['Nobody', 'laughs']   | sim: 0.5057479739189148  | cluster_name: Lazy tude
		effect: ['Nobody', 'laughs']  | cluster_name_effect: Lazy tude  | sim_effect: 0.5057479739189148


 ############# i: 65000 ( 88557.36402392387 s)  ##########


New clusters created (effect): ['mad']   | sim: 0.5184951424598694  | cluster_name: sadness
		effect: ['mad']  | cluster_name_effect: sadness  | sim_effect: 0.5184951424598694
New clusters created (effect): ['loading']   | sim: 0.45882731676101685  | cluster_name: non runner
		effect: ['loading']  | cluster_name_effect: non runner  | sim_effect: 0.45882731676101685
New clusters created (effect): ['fatal', 'case']   | sim: 0.47369369864463806  | cluster_name: release tension
		effect: ['fatal', 'case']  | cluster_name_effect: release tension  | sim_effect: 0.47369369864463806
New clusters created (cause): ['antis']   | sim: 0.4914916753768921  | cluster_name: tomé un vaso de coca cola
		cause: ['antis']  | cluster_name_cau

New clusters created (cause): ['childhood']   | sim: 0.48257386684417725  | cluster_name: mortality
		cause: ['childhood']  | cluster_name_cause: mortality  | sim_cause: 0.48257386684417725
New clusters created (effect): [':up_arrow:', 'Nephew']   | sim: 0.42975154519081116  | cluster_name: family
		effect: [':up_arrow:', 'Nephew']  | cluster_name_effect: family  | sim_effect: 0.42975154519081116
New clusters created (effect): ['Levemir']   | sim: 0.4934660494327545  | cluster_name: humalog
		effect: ['Levemir']  | cluster_name_effect: humalog  | sim_effect: 0.4934660494327545
New clusters created (effect): ['medically', 'challenged']   | sim: 0.5436127185821533  | cluster_name: taking necessary precautions
		effect: ['medically', 'challenged']  | cluster_name_effect: taking necessary precautions  | sim_effect: 0.5436127185821533
New clusters created (cause): ['heat', 'wave']   | sim: 0.4995037615299225  | cluster_name: cold
		cause: ['heat', 'wave']  | cluster_name_cause: cold  | sim_



 ############# i: 73000 ( 106786.98294377327 s)  ##########


New clusters created (effect): ['discomfort']   | sim: 0.5002989768981934  | cluster_name: constant pain
		effect: ['discomfort']  | cluster_name_effect: constant pain  | sim_effect: 0.5002989768981934
New clusters created (effect): ['medicaid']   | sim: 0.5039819478988647  | cluster_name: insurance
		effect: ['medicaid']  | cluster_name_effect: insurance  | sim_effect: 0.5039819478988647
New clusters created (effect): ['Vomitted']   | sim: 0.5132993459701538  | cluster_name: sick
		effect: ['Vomitted']  | cluster_name_effect: sick  | sim_effect: 0.5132993459701538
New clusters created (effect): ['rebellious', '#hyperglycemia']   | sim: 0.5331478118896484  | cluster_name: hyperglycemic
		effect: ['rebellious', '#hyperglycemia']  | cluster_name_effect: hyperglycemic  | sim_effect: 0.5331478118896484
New clusters created (effect): ['correlation', 'engine']   | sim: 0.49252739548683167  | cluster_name: Humans monetized water


New clusters created (effect): ['kilter']   | sim: 0.4676972031593323  | cluster_name: blowing morning
		effect: ['kilter']  | cluster_name_effect: blowing morning  | sim_effect: 0.4676972031593323
New clusters created (cause): ['inherit']   | sim: 0.5477747917175293  | cluster_name: hereditary
		cause: ['inherit']  | cluster_name_cause: hereditary  | sim_cause: 0.5477747917175293


 ############# i: 78000 ( 115393.41646003723 s)  ##########


New clusters created (cause): [':grinning_cat_face:', ':pencil:']   | sim: 0.42653894424438477  | cluster_name: embark machu picchu inca trail
		cause: [':grinning_cat_face:', ':pencil:']  | cluster_name_cause: embark machu picchu inca trail  | sim_cause: 0.42653894424438477
New clusters created (effect): ['FRUSTRATING']   | sim: 0.5368910431861877  | cluster_name: STRESSED
		effect: ['FRUSTRATING']  | cluster_name_effect: STRESSED  | sim_effect: 0.5368910431861877


 ############# i: 79000 ( 117124.08388400078 s)  ##########


New clusters creat

New clusters created (effect): ['mind', 'deteriorate']   | sim: 0.545403242111206  | cluster_name: piece mind
		effect: ['mind', 'deteriorate']  | cluster_name_effect: piece mind  | sim_effect: 0.545403242111206
New clusters created (cause): ['generosity']   | sim: 0.420783132314682  | cluster_name: donations
		cause: ['generosity']  | cluster_name_cause: donations  | sim_cause: 0.420783132314682
New clusters created (cause): ['Myo', 'Insitol', 'supplements']   | sim: 0.503725528717041  | cluster_name: formula supplementing
		cause: ['Myo', 'Insitol', 'supplements']  | cluster_name_cause: formula supplementing  | sim_cause: 0.503725528717041
New clusters created (effect): ['hopeless']   | sim: 0.4587002396583557  | cluster_name: bitchy
		effect: ['hopeless']  | cluster_name_effect: bitchy  | sim_effect: 0.4587002396583557
New clusters created (cause): ['squats']   | sim: 0.3893260955810547  | cluster_name: grouch
		cause: ['squats']  | cluster_name_cause: grouch  | sim_cause: 0.3893260

New clusters created (effect): ['triaged']   | sim: 0.41565510630607605  | cluster_name: GP practice
		effect: ['triaged']  | cluster_name_effect: GP practice  | sim_effect: 0.41565510630607605
New clusters created (effect): ['identification']   | sim: 0.5434901714324951  | cluster_name: cultural expression
		effect: ['identification']  | cluster_name_effect: cultural expression  | sim_effect: 0.5434901714324951


 ############# i: 88000 ( 133136.11154294014 s)  ##########


New clusters created (effect): ['gall']   | sim: 0.48757806420326233  | cluster_name: lung infection
		effect: ['gall']  | cluster_name_effect: lung infection  | sim_effect: 0.48757806420326233
New clusters created (effect): ['comparative', 'graph']   | sim: 0.485094279050827  | cluster_name: complexion seems clearer
		effect: ['comparative', 'graph']  | cluster_name_effect: complexion seems clearer  | sim_effect: 0.485094279050827
New clusters created (cause): ['fun', 'intro']   | sim: 0.5371676683425903  | cluste

New clusters created (effect): ['doze']   | sim: 0.4438900649547577  | cluster_name: managed pint half
		effect: ['doze']  | cluster_name_effect: managed pint half  | sim_effect: 0.4438900649547577


 ############# i: 94000 ( 144081.92154479027 s)  ##########


New clusters created (effect): ['June']   | sim: 0.47451210021972656  | cluster_name: sterilize apartment
		effect: ['June']  | cluster_name_effect: sterilize apartment  | sim_effect: 0.47451210021972656
New clusters created (cause): ['east']   | sim: 0.467912495136261  | cluster_name: embark machu picchu inca trail
		cause: ['east']  | cluster_name_cause: embark machu picchu inca trail  | sim_cause: 0.467912495136261
New clusters created (effect): ['CoNcErNeD']   | sim: 0.3067726194858551  | cluster_name: cringe
		effect: ['CoNcErNeD']  | cluster_name_effect: cringe  | sim_effect: 0.3067726194858551
New clusters created (effect): ['fuzzy']   | sim: 0.41597509384155273  | cluster_name: bruise
		effect: ['fuzzy']  | cluster_name_

New clusters created (effect): ['hormonal', 'turbulence']   | sim: 0.5411224961280823  | cluster_name: chemical imbalance
		effect: ['hormonal', 'turbulence']  | cluster_name_effect: chemical imbalance  | sim_effect: 0.5411224961280823
New clusters created (cause): ['#ENGSWE', 'game', '#WorldCup']   | sim: 0.4675455391407013  | cluster_name: Competitive soccer
		cause: ['#ENGSWE', 'game', '#WorldCup']  | cluster_name_cause: Competitive soccer  | sim_cause: 0.4675455391407013
New clusters created (effect): ['knot']   | sim: 0.5317652225494385  | cluster_name: plugged nose
		effect: ['knot']  | cluster_name_effect: plugged nose  | sim_effect: 0.5317652225494385


 ############# i: 99000 ( 154231.28252077103 s)  ##########


New clusters created (effect): ['delusional']   | sim: 0.5180511474609375  | cluster_name: endure shitty circumstances
		effect: ['delusional']  | cluster_name_effect: endure shitty circumstances  | sim_effect: 0.5180511474609375
New clusters created (effect): ['sprin

New clusters created (effect): ['biologically', 'impossible']   | sim: 0.5385594964027405  | cluster_name: psychologically fragile
		effect: ['biologically', 'impossible']  | cluster_name_effect: psychologically fragile  | sim_effect: 0.5385594964027405
New clusters created (effect): ['wean']   | sim: 0.3979351222515106  | cluster_name: fear
		effect: ['wean']  | cluster_name_effect: fear  | sim_effect: 0.3979351222515106
New clusters created (cause): ['string', 'cheese']   | sim: 0.5459138751029968  | cluster_name: topical cream
		cause: ['string', 'cheese']  | cluster_name_cause: topical cream  | sim_cause: 0.5459138751029968
New clusters created (effect): ['predict', 'HbA', '1c']   | sim: 0.5494145154953003  | cluster_name: A1C
		effect: ['predict', 'HbA', '1c']  | cluster_name_effect: A1C  | sim_effect: 0.5494145154953003
New clusters created (effect): ['KILL']   | sim: 0.4515026807785034  | cluster_name: DIE
		effect: ['KILL']  | cluster_name_effect: DIE  | sim_effect: 0.451502680

New clusters created (cause): ['Rx']   | sim: 0.3665546774864197  | cluster_name: medication
		cause: ['Rx']  | cluster_name_cause: medication  | sim_cause: 0.3665546774864197
New clusters created (effect): ['scent']   | sim: 0.5214502811431885  | cluster_name: smelling fruity
		effect: ['scent']  | cluster_name_effect: smelling fruity  | sim_effect: 0.5214502811431885
New clusters created (effect): ['wears', 'Star', 'Wars', 'stuff']   | sim: 0.5328013896942139  | cluster_name: wearing bigger size shoe
		effect: ['wears', 'Star', 'Wars', 'stuff']  | cluster_name_effect: wearing bigger size shoe  | sim_effect: 0.5328013896942139
New clusters created (cause): ['PhD']   | sim: 0.5168188214302063  | cluster_name: endocrinology fellowship
		cause: ['PhD']  | cluster_name_cause: endocrinology fellowship  | sim_cause: 0.5168188214302063
New clusters created (effect): ['Di', 'masarap']   | sim: 0.5111304521560669  | cluster_name: mata va matar
		effect: ['Di', 'masarap']  | cluster_name_effect

New clusters created (effect): ['siked']   | sim: 0.5318372845649719  | cluster_name: freaked
		effect: ['siked']  | cluster_name_effect: freaked  | sim_effect: 0.5318372845649719
New clusters created (effect): ['shop', 'Target']   | sim: 0.46484866738319397  | cluster_name: prismatic Fortran bulls
		effect: ['shop', 'Target']  | cluster_name_effect: prismatic Fortran bulls  | sim_effect: 0.46484866738319397


 ############# i: 116000 ( 203870.39179182053 s)  ##########


New clusters created (effect): ['PRECAUTIONS']   | sim: 0.46377772092819214  | cluster_name: DAIBETIC KETO ACIDS high
		effect: ['PRECAUTIONS']  | cluster_name_effect: DAIBETIC KETO ACIDS high  | sim_effect: 0.46377772092819214
New clusters created (effect): ['kms']   | sim: 0.3717089295387268  | cluster_name: PC panties
		effect: ['kms']  | cluster_name_effect: PC panties  | sim_effect: 0.3717089295387268
New clusters created (effect): ['struggles', 'mines']   | sim: 0.5060065388679504  | cluster_name: struggle
		eff

New clusters created (effect): ['Fantasy', 'Football', 'Manager']   | sim: 0.5046530365943909  | cluster_name: Video Games
		effect: ['Fantasy', 'Football', 'Manager']  | cluster_name_effect: Video Games  | sim_effect: 0.5046530365943909
New clusters created (effect): ['dodgy', ':face_with_thermometer:']   | sim: 0.45153215527534485  | cluster_name: breast removed
		effect: ['dodgy', ':face_with_thermometer:']  | cluster_name_effect: breast removed  | sim_effect: 0.45153215527534485
New clusters created (effect): ['refresh']   | sim: 0.38645368814468384  | cluster_name: hourglass tick
		effect: ['refresh']  | cluster_name_effect: hourglass tick  | sim_effect: 0.38645368814468384
New clusters created (effect): ['destructive', 'cycle']   | sim: 0.5363823175430298  | cluster_name: irregular cycles
		effect: ['destructive', 'cycle']  | cluster_name_effect: irregular cycles  | sim_effect: 0.5363823175430298
New clusters created (cause): ['sour', 'angst']   | sim: 0.5483502745628357  | clust

New clusters created (effect): ['moderate']   | sim: 0.501092255115509  | cluster_name: moderation
		effect: ['moderate']  | cluster_name_effect: moderation  | sim_effect: 0.501092255115509
New clusters created (effect): ['overwhelm', 'lymph', 'glands']   | sim: 0.5088109374046326  | cluster_name: lung infection
		effect: ['overwhelm', 'lymph', 'glands']  | cluster_name_effect: lung infection  | sim_effect: 0.5088109374046326
New clusters created (cause): ['bacteria', 'threatening']   | sim: 0.5412824153900146  | cluster_name: gut
		cause: ['bacteria', 'threatening']  | cluster_name_cause: gut  | sim_cause: 0.5412824153900146
New clusters created (cause): ['silicone', 'poisoning']   | sim: 0.4830162823200226  | cluster_name: ear wax
		cause: ['silicone', 'poisoning']  | cluster_name_cause: ear wax  | sim_cause: 0.4830162823200226
New clusters created (cause): ['OJ']   | sim: 0.42624327540397644  | cluster_name: vodka
		cause: ['OJ']  | cluster_name_cause: vodka  | sim_cause: 0.42624327

New clusters created (effect): [':fire:', ':fire:', ':fire:']   | sim: 0.22834815084934235  | cluster_name: :grinning_cat_face: :pencil:
		effect: [':fire:', ':fire:', ':fire:']  | cluster_name_effect: :grinning_cat_face: :pencil:  | sim_effect: 0.22834815084934235
New clusters created (effect): ['robbed']   | sim: 0.4715302884578705  | cluster_name: vomited infront
		effect: ['robbed']  | cluster_name_effect: vomited infront  | sim_effect: 0.4715302884578705
New clusters created (effect): ['penis', 'envy']   | sim: 0.49729418754577637  | cluster_name: crushed skull
		effect: ['penis', 'envy']  | cluster_name_effect: crushed skull  | sim_effect: 0.49729418754577637


 ############# i: 133000 ( 243635.74712491035 s)  ##########


New clusters created (effect): ['Hahn', 'Super', 'Dry']   | sim: 0.5125468373298645  | cluster_name: Russell Stover
		effect: ['Hahn', 'Super', 'Dry']  | cluster_name_effect: Russell Stover  | sim_effect: 0.5125468373298645
New clusters created (cause): ['jet',

New clusters created (cause): ['arriving', 'warm']   | sim: 0.5430986881256104  | cluster_name: hot steaming
		cause: ['arriving', 'warm']  | cluster_name_cause: hot steaming  | sim_cause: 0.5430986881256104
New clusters created (effect): ['bail']   | sim: 0.38371220231056213  | cluster_name: lashed
		effect: ['bail']  | cluster_name_effect: lashed  | sim_effect: 0.38371220231056213
New clusters created (cause): ['Greek', 'Tina']   | sim: 0.47203412652015686  | cluster_name: Arnold Palmer
		cause: ['Greek', 'Tina']  | cluster_name_cause: Arnold Palmer  | sim_cause: 0.47203412652015686
New clusters created (cause): ['Disabled', 'Vet']   | sim: 0.5496018528938293  | cluster_name: disabled spoonie
		cause: ['Disabled', 'Vet']  | cluster_name_cause: disabled spoonie  | sim_cause: 0.5496018528938293
New clusters created (effect): ['sassy']   | sim: 0.5086559653282166  | cluster_name: grumpy
		effect: ['sassy']  | cluster_name_effect: grumpy  | sim_effect: 0.5086559653282166
New clusters cre



 ############# i: 145000 ( 278416.40447092056 s)  ##########


New clusters created (cause): ['Netflix', 'series']   | sim: 0.5395132899284363  | cluster_name: episode
		cause: ['Netflix', 'series']  | cluster_name_cause: episode  | sim_cause: 0.5395132899284363
New clusters created (effect): [':cooking:', ':bacon:', '#createbeauty', '#createhappiness']   | sim: 0.393110454082489  | cluster_name: :regional_indicator_symbol_letter_s: :regional_indicator_symbol_letter_u: :regional_indicator_symbol_letter_s:
		effect: [':cooking:', ':bacon:', '#createbeauty', '#createhappiness']  | cluster_name_effect: :regional_indicator_symbol_letter_s: :regional_indicator_symbol_letter_u: :regional_indicator_symbol_letter_s:  | sim_effect: 0.393110454082489
New clusters created (cause): ['paycheck']   | sim: 0.35899606347084045  | cluster_name: unable to afford insulin
		cause: ['paycheck']  | cluster_name_cause: unable to afford insulin  | sim_cause: 0.35899606347084045
New clusters created (effect)

In [16]:
manual_clusters.shape

(1915, 7)

In [17]:
manual_clusters.to_csv("/Users/adrianahne/workspace/PhD/causality/Causal-associations-diabetes-twitter/data/all_cause_effect_clusters.csv", sep=";")

In [426]:
#manual_clusters.loc[len(manual_clusters.index)+1] = ["", " ".join(effect), np.nan] # Parent name, Cluster name, Synonym
manual_clusters.head(200)

Unnamed: 0,Parent name,Cluster name,Synonyms,cluster_name_prep,cluster_name_lemma,synonym_prep,synonym_lemma
0,Diabetes,diabetes,"diabetic, #diabetic, #diabetes, diabetes melli...",[diabetes],[diabetes],"[[diabetic], [#diabetic], [#diabetes], [diabet...","[[diabetic], [#diabetic], [#diabetes], [diabet..."
1,Diabetes,reverse diabetes,,"[reverse, diabetes]","[reverse, diabete]",,
2,Diabetes,T1D,"type 1 diabetes, type 1, #type1, #type1diabete...",[T1D],[t1d],"[[type, 1, diabetes], [type, 1], [#type1], [#t...","[[type, 1, diabetes], [type, 1], [#type1], [#t..."
3,Diabetes,T2D,"type 2 diabetes, type 2, #type, #type2diabetes...",[T2D],[T2D],"[[type, 2, diabetes], [type, 2], [#type], [#ty...","[[type, 2, diabetes], [type, 2], [#type], [#ty..."
4,Diabetes,diagnosis,,[diagnosis],[diagnosis],,
...,...,...,...,...,...,...,...
112,,kicking doors,,"[kicking, doors]","[kick, door]",,
113,,ICU,,[ICU],[ICU],,
114,,dark patch face,,"[dark, patch, face]","[dark, patch, face]",,
115,,DIED,,[DIED],[DIED],,


### associate each cause / effect to its cluster

In [7]:
all_clusters = pd.read_csv("/Users/adrianahne/workspace/PhD/causality/Causal-associations-diabetes-twitter/data/Causes_effects_clusters - automatically derived.csv")
#manual_clusters = manual_clusters[["Parent name", "Cluster name", "Synonyms"]].dropna(subset=["Cluster name"])
#manual_clusters.reset_index(drop=True, inplace=True)
print(all_clusters.shape)
all_clusters.head()


(1751, 8)


Unnamed: 0.1,Unnamed: 0,Parent name,Cluster name,Synonyms,cluster_name_prep,cluster_name_lemma,synonym_prep,synonym_lemma
0,0.0,Diabetes,diabetes,"diabetic, #diabetic, #diabetes, diabetes melli...",['diabetes'],['diabetes'],"[['diabetic'], ['#diabetic'], ['#diabetes'], [...","[['diabetic'], ['#diabetic'], ['#diabetes'], [..."
1,1.0,Diabetes,reverse diabetes,"reversed, cured overnight\n","['reverse', 'diabetes']","['reverse', 'diabete']",,
2,2.0,Diabetes,T1D,"type 1 diabetes, type 1, #type1, #type1diabete...",['T1D'],['t1d'],"[['type', '1', 'diabetes'], ['type', '1'], ['#...","[['type', '1', 'diabetes'], ['type', '1'], ['#..."
3,3.0,Diabetes,T2D,"type 2 diabetes, type 2, #type, #type2diabetes...",['T2D'],['T2D'],"[['type', '2', 'diabetes'], ['type', '2'], ['#...","[['type', '2', 'diabetes'], ['type', '2'], ['#..."
4,4.0,Diabetes,diagnosis,,['diagnosis'],['diagnosis'],,


In [18]:
all_clusters["cluster_name_prep"], all_clusters["cluster_name_lemma"] = zip(*all_clusters["Cluster name"].map(lambda x: lemmatize(x, mode="clusters")))
all_clusters["synonym_prep"], all_clusters["synonym_lemma"] = zip(*all_clusters["Synonyms"].map(lambda x: lemmatize(x, mode="cluster_synonyms")))
all_clusters.head()


Unnamed: 0.1,Unnamed: 0,Parent name,Cluster name,Synonyms,cluster_name_prep,cluster_name_lemma,synonym_prep,synonym_lemma
0,0.0,Diabetes,diabetes,"diabetic, #diabetic, #diabetes, diabetes melli...",[diabetes],[diabetes],"[[diabetic], [#diabetic], [#diabetes], [diabet...","[[diabetic], [#diabetic], [#diabetes], [diabet..."
1,1.0,Diabetes,reverse diabetes,"reversed, cured overnight\n","[reverse, diabetes]","[reverse, diabete]","[[reversed], [cured, overnight]]","[[reversed], [cure, overnight]]"
2,2.0,Diabetes,T1D,"type 1 diabetes, type 1, #type1, #type1diabete...",[T1D],[t1d],"[[type, 1, diabetes], [type, 1], [#type1], [#t...","[[type, 1, diabetes], [type, 1], [#type1], [#t..."
3,3.0,Diabetes,T2D,"type 2 diabetes, type 2, #type, #type2diabetes...",[T2D],[T2D],"[[type, 2, diabetes], [type, 2], [#type], [#ty...","[[type, 2, diabetes], [type, 2], [#type], [#ty..."
4,4.0,Diabetes,diagnosis,,[diagnosis],[diagnosis],,


In [35]:
import warnings
warnings.filterwarnings("ignore") 


def calc_cosine_sim(causeOrEffect, causeOrEffect_lemma, cluster_name, cluster_name_lemma):
    """ Test several configurations (lowercased, without 'diabetes') to achieve highest cosine similarity """

    causeOrEffect_vector = np.array([model[word] for word in causeOrEffect]).mean(axis=0)#.reshape(1, -1)  # vector of identified cause
    causeOrEffect_lemma_vector = np.array([model[word] for word in causeOrEffect_lemma]).mean(axis=0)#.reshape(1, -1)  # vector of identified cause
    
    causeOrEffect_withoutDiabetes_vector = False
    if len(causeOrEffect) > 1:
        if "diabetic" in causeOrEffect:
            causeOrEffect.remove("diabetic")                
        if "diabetes" in causeOrEffect:
            causeOrEffect.remove("diabetes")
        causeOrEffect_withoutDiabetes_vector = np.array([model[word] for word in causeOrEffect]).mean(axis=0)#.reshape(1, -1)  # vector of identified cause
        causeOrEffect_withoutDiabetes_lemma_vector = np.array([model[word] for word in causeOrEffect_lemma]).mean(axis=0)#.reshape(1, -1)  # vector of identified cause


    cluster_name_vector = np.array([model[word] for word in cluster_name]).mean(axis=0)#.reshape(1, -1)  # vector of identified cause
    cluster_name_lemma_vector = np.array([model[word] for word in cluster_name_lemma]).mean(axis=0)#.reshape(1, -1)  # vector of identified cause
    
    ll = []        
    sim = np.dot(causeOrEffect_vector, cluster_name_vector) / ( np.linalg.norm(causeOrEffect_vector) * np.linalg.norm(cluster_name_vector) ) #1 - cosine(causeOrEffect_vector, cluster_name_vector)
    ll.append( (sim, causeOrEffect, 0) )

    sim = np.dot(causeOrEffect_lemma_vector, cluster_name_lemma_vector) / ( np.linalg.norm(causeOrEffect_lemma_vector) * np.linalg.norm(cluster_name_lemma_vector) ) #1 - cosine(causeOrEffect_lemma_vector, cluster_name_lemma_vector)# lemmatization
    ll.append( (sim, causeOrEffect_lemma, 1) )
    
    if not isinstance(causeOrEffect_withoutDiabetes_vector, bool):
        sim = np.dot(causeOrEffect_withoutDiabetes_vector, cluster_name_vector) / ( np.linalg.norm(causeOrEffect_withoutDiabetes_vector) * np.linalg.norm(cluster_name_vector) ) #1 - cosine(causeOrEffect_withoutDiabetes_vector, cluster_name_vector)
        ll.append( (sim, causeOrEffect, 2) )      
    
        sim = np.dot(causeOrEffect_withoutDiabetes_lemma_vector,cluster_name_lemma_vector) / ( np.linalg.norm(causeOrEffect_withoutDiabetes_lemma_vector) * np.linalg.norm(cluster_name_lemma_vector) ) #1 - cosine(causeOrEffect_withoutDiabetes_lemma_vector, cluster_name_lemma_vector)
        ll.append( (sim, causeOrEffect_lemma, 3) )            

    bestSim, bestCauseOrEffect, bestType = max(ll, key=lambda item:item[0])


    return bestSim, causeOrEffect#bestCauseOrEffect


def find_closest_cluster(cause_or_effect, cause_or_effect_prep, cause_or_effect_lemmatized, clusters, t1=None):

    similarities = []
    if cause_or_effect_prep: # if non-empty
        for i, row in clusters.iterrows(): # loop over all clusters
                if not (isinstance(row["Synonyms"], float) and np.isnan(row["Synonyms"]) ): # if there are synonyms
                    if (isinstance(row["synonym_prep"], float) or isinstance(row["synonym_lemma"], float) ):
                        print(row["Synonyms"])
                        print(row["synonym_prep"])
                        print(row["synonym_lemma"])
                    for synonym, synonym_lemma in zip(row["synonym_prep"], row["synonym_lemma"]):
                        sim, causeOrEffect = calc_cosine_sim(cause_or_effect_prep, cause_or_effect_lemmatized, synonym, synonym_lemma)
                        similarities.append((causeOrEffect, synonym, row['Cluster name'], sim))
                      
                sim, causeOrEffect = calc_cosine_sim(cause_or_effect_prep, cause_or_effect_lemmatized, row["cluster_name_prep"], row["cluster_name_lemma"])
                similarities.append((causeOrEffect, row["cluster_name_prep"], row['Cluster name'], sim))

        bestCauseOrEffect, bestClusterSynonym, bestClusterName, bestSim = sorted(similarities, key=lambda tup: tup[3], reverse=True)[0] # take only highest sim
        return bestCauseOrEffect, bestClusterSynonym, bestClusterName, bestSim
    else:
        return "", "", "", -1.0


import time


start = time.time()
df_new["cluster_cause"] = ""
df_new["cluster_effect"] = ""
for j in np.arange(2000, 50000, 2000):
    for i, row in df_new[j:j+2000].iterrows():
        if i % 1000 == 0:
            print("\n\n ############# j:", j,"| i:", i, "(", time.time()-start, "s)", " ##########\n\n")
        cause, cluster_synonym_cause, cluster_name_cause, sim_cause = find_closest_cluster(row["causes"], row["causes_prep"], row["causes_lemma"], all_clusters)
        #print("\n\tcause:", cause, " | cluster_name_cause:", cluster_name_cause, " | sim_cause:", sim_cause)
        #if sim_cause > 0.0:
        if sim_cause > 0.55: # associate found cluster to the cause; second condition if sim_cause == -1 returned
            df_new.loc[i, "cluster_cause"] = cluster_name_cause

        else: # create new cluster
            print("No close cluster?? cause:", cause, "  | sim:", sim_cause, " | cluster_name:", cluster_name_cause)

        effect, cluster_synonym_effect, cluster_name_effect, sim_effect = find_closest_cluster(row["effects"], row["effects_prep"], row["effects_lemma"], all_clusters)    
        #print("\teffect:", effect, " | cluster_name_effect:", cluster_name_effect, " | sim_effect:", sim_effect)
        #if sim_effect > 0.0:
        if sim_effect > 0.55: # associate found cluster to the cause; second condition if sim_cause == -1 returned
            df_new.loc[i, "cluster_effect"] = cluster_name_effect
        else: # create new cluster
            print("No close cluster ?? effect:", effect, "  | sim:", sim_effect, " | cluster_name:", cluster_name_effect)    
    
    df_new[j:j+2000].to_csv("network_predictions/tweets_clusters_{}_{}.csv".format(j, j+2000), sep=";")
    
#  new node for < 0.55
end = time.time()
print("Time:", end-start)




 ############# j: 2000 | i: 2000 ( 0.009665966033935547 s)  ##########




 ############# j: 2000 | i: 3000 ( 2828.6509211063385 s)  ##########




 ############# j: 4000 | i: 4000 ( 35143.51403713226 s)  ##########




 ############# j: 4000 | i: 5000 ( 48263.59558105469 s)  ##########


No close cluster ?? effect:    | sim: -1.0  | cluster_name: 


 ############# j: 6000 | i: 6000 ( 50965.338816165924 s)  ##########


No close cluster ?? effect:    | sim: -1.0  | cluster_name: 


 ############# j: 6000 | i: 7000 ( 54146.92596197128 s)  ##########


No close cluster?? cause:    | sim: -1.0  | cluster_name: 
No close cluster ?? effect:    | sim: -1.0  | cluster_name: 


 ############# j: 8000 | i: 8000 ( 57706.51382398605 s)  ##########


No close cluster ?? effect:    | sim: -1.0  | cluster_name: 
No close cluster ?? effect: ['costs', '1400', 'dollars']   | sim: 0.5255981  | cluster_name: finance
No close cluster?? cause:    | sim: -1.0  | cluster_name: 


 ############# j: 8000 | i

In [25]:

    
df_new[1995:2005].head(10)

Unnamed: 0,text,tokenized,io_tags,causes,effects,cluster_cause,cluster_effect,causes_prep,causes_lemma,effects_prep,effects_lemma
1995,I have diabetes and I 'm voting for Biden so T...,"[I, have, diabetes, and, I, 'm, voting, for, B...","[O, O, I-C, O, O, O, O, O, O, O, O, O, O, O, O...",diabetes,insulin,diabetes,insulin,[diabetes],[diabetes],[insulin],[insulin]
1996,I have diabetes and I 'm voting for Biden so T...,"[I, have, diabetes, and, I, 'm, voting, for, B...","[O, O, I-C, O, O, O, O, O, O, O, O, O, O, O, O...",diabetes,suffering,diabetes,suffer,[diabetes],[diabetes],[suffering],[suffering]
1997,"@USER My money is on """""""" he died of ( pick on...","[@USER, My, money, is, on, """", he, died, of, (...","[O, O, O, O, O, O, O, I-E, O, O, O, O, O, O, O...",diabetes,died,diabetes,death,[diabetes],[diabetes],[died],[die]
1998,"@USER My money is on """""""" he died of ( pick on...","[@USER, My, money, is, on, """", he, died, of, (...","[O, O, O, O, O, O, O, I-E, O, O, O, O, O, O, O...",hypertension,died,hypertension,death,[hypertension],[hypertension],[died],[die]
1999,it 's bad enough corona hitting the black comm...,"[it, 's, bad, enough, corona, hitting, the, bl...","[O, O, O, O, O, O, O, O, O, O, I-E, I-E, I-E, ...",diabetes,"pre,existing,conditions",diabetes,prediabetes,[diabetes],[diabetes],"[pre, existing, conditions]","[pre, exist, condition]"
2000,it 's bad enough corona hitting the black comm...,"[it, 's, bad, enough, corona, hitting, the, bl...","[O, O, O, O, O, O, O, O, O, O, I-E, I-E, I-E, ...",diabetes,obesity,,,[diabetes],[diabetes],[obesity],[obesity]
2001,@USER My wife is at high risk of serious compl...,"[@USER, My, wife, is, at, high, risk, of, seri...","[O, O, O, O, O, O, O, O, O, I-E, O, O, O, I-C,...","type,one,diabetic",complications,,,"[type, one, diabetic]","[type, one, diabetic]",[complications],[complication]
2002,@USER My wife is at high risk of serious compl...,"[@USER, My, wife, is, at, high, risk, of, seri...","[O, O, O, O, O, O, O, O, O, I-E, O, O, O, I-C,...","type,one,diabetic",fear,,,"[type, one, diabetic]","[type, one, diabetic]",[fear],[fear]
2003,Just look at this thread of diabetics who 've ...,"[Just, look, at, this, thread, of, diabetics, ...","[O, O, O, O, O, O, O, O, O, I-E, O, O, O, O, I...","withheld,insulin",died,,,"[withheld, insulin]","[withheld, insulin]",[died],[die]
2004,God is Good my uncle that had vivid 19 in a di...,"[God, is, Good, my, uncle, that, had, vivid, 1...","[O, O, O, O, O, O, O, O, O, O, O, I-C, I-C, O,...","diabetic,coma","brain,dead",,,"[diabetic, coma]","[diabetic, coma]","[brain, dead]","[brain, dead]"


In [26]:
df_new[0:2000].to_csv("network_predictions/tweets_clusters_0_2000.csv", sep=";")

In [439]:
df_new.sample(n=300, random_state=9).head(20)

Unnamed: 0,text,tokenized,io_tags,causes,effects,causes_prep,causes_lemma,effects_prep,effects_lemma,cluster_cause,cluster_effect
4413,when i was diagnosed w diabetes my friends tol...,"[when, i, was, diagnosed, w, diabetes, my, fri...","[O, O, O, O, O, I-C, O, O, O, O, O, O, O, O, I...",diabetes,"lot,of,pressure",[diabetes],[diabetes],"[lot, pressure]","[lot, pressure]",diabetes,hypotension
92975,Having to manage 3 cases of stomatitis and now...,"[Having, to, manage, 3, cases, of, stomatitis,...","[O, O, O, O, O, O, O, O, O, O, I-C, O, O, O, O...",diabetic,shock,[diabetic],[diabetic],[shock],[shock],diabetes,shock
15197,@USER Explain how when seeing a annual eye tes...,"[@USER, Explain, how, when, seeing, a, annual,...","[O, O, O, O, O, O, O, I-E, I-E, O, O, O, O, O,...","#T1D,#diabetes","eye,test","[#T1D, #diabetes]","[#T1D, #diabetes]","[eye, test]","[eye, test]",diabetes,OGTT
26659,I read a post this morning about a young lady ...,"[I, read, a, post, this, morning, about, a, yo...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","cost,of,insulin",stressed,"[cost, insulin]","[cost, insulin]",[stressed],[stress],insulin prices,stress
98008,Some months ago im taking a pill for my insuli...,"[Some, months, ago, im, taking, a, pill, for, ...","[O, O, O, O, O, O, O, O, O, I-E, O, O, O, O, I...",diabetes,insuline,[diabetes],[diabetes],[insuline],[insuline],diabetes,insulin
128256,"People can complain about depression , anxiety...","[People, can, complain, about, depression, ,, ...","[O, O, O, O, I-E, O, O, O, O, I-E, O, O, O, O,...","type,1,diabetic",sick,"[type, 1]","[type, 1, diabetic]",[sick],[sick],T1D,sick
29321,Off to Wal Mart on my way home from work to pi...,"[Off, to, Wal, Mart, on, my, way, home, from, ...","[O, O, O, O, O, O, O, O, O, O, O, I-C, I-C, I-...","pick,up,diabetic,foot,cream",bread,"[pick, foot, cream]","[pick, diabetic, foot, cream]",[bread],[bread],legs swollen,nutrition
64793,My glucose test is next week maybe I should st...,"[My, glucose, test, is, next, week, maybe, I, ...","[O, I-C, I-C, O, O, O, O, O, O, I-E, I-E, O, O...","glucose,test","start,fasting","[glucose, test]","[glucose, test]","[start, fasting]","[start, fast]",OGTT,fasting
127780,I 'm doing a blood glucose curve on Pais today...,"[I, 'm, doing, a, blood, glucose, curve, on, P...","[O, O, O, O, I-C, I-C, I-C, O, O, O, O, O, I-E...","blood,glucose,curve","checking,his,BG","[blood, glucose, curve]","[blood, glucose, curve]","[checking, BG]","[check, BG]",hypo,glucose
76385,I 've lost several members of my family to typ...,"[I, 've, lost, several, members, of, my, famil...","[O, O, I-E, I-E, I-E, O, O, O, O, I-C, I-C, I-...","severe,obesity","lost,several,members","[severe, obesity]","[severe, obesity]","[lost, several, members]","[lose, several, member]",overweight,neuropathy


In [460]:
!conda install scipy -y

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [479]:
import numpy as np
import scipy.sparse as sp
from scipy.spatial.distance import squareform, pdist, cosine, cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity

# Create an adjacency matrix
np.random.seed(42)
a = np.random.randint(0, 10, (1,10000)).ravel()
b = np.random.randint(0, 10, (1,10000)).ravel()


# Define a function to calculate the cosine similarities a few different ways
def calc_sim(a,b, method=1):
    if method == 1:
        return 1 - cosine(a,b)
    if method == 2:
        
        return np.dot(a,b) / ( np.linalg.norm(a) * np.linalg.norm(b) )
    if method == 3:
        return cosine_similarity(a.reshape(1,-1),b.reshape(1,-1))
    if method == 4:
        return(1 - cosine_similarity(a,b))


# Time them:
print("Method 1")
%timeit calc_sim(a,b, method=1)
print("Method 2")
%timeit calc_sim(a,b, method=2)
print("Method 3")
%timeit calc_sim(a,b, method=3)
print("Method 4")
%timeit calc_sim(a,b, method=4)

Method 1
114 µs ± 18.7 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
Method 2
59.6 µs ± 5.65 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
Method 3
417 µs ± 97.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
Method 4


NameError: name 'spatial' is not defined

In [33]:
for i in np.arange(0, 50000, 2000):
    print(i)

0
2000
4000
6000
8000
10000
12000
14000
16000
18000
20000
22000
24000
26000
28000
30000
32000
34000
36000
38000
40000
42000
44000
46000
48000


In [None]:
np.arange()