In [7]:
import pandas as pd
import numpy as np
import matplotlib as plt
import glob
import os
import ast # converts string of list -> list
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords

In [2]:
################# MODEL PARAMETERS #####################
path_we = "/home/adrian/PhD/Data/FastText_embedding_20190703/ft_wordembeddings_dim300_minCount5_URL-User-toConstant_iter10_20190703"
dataPath = "result_cause_effect_prediction"
csv_files = glob.glob(os.path.join(dataPath, "*.csv"))
len(csv_files)


266

In [3]:

################## LOAD DATA ######################
tuples = []
for file in csv_files:
    with open(file, "r") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            if i == 0: # header
                if line.endswith("\n"):
                    line = line[:-2]
                header = line.split(",")[1:]
            else:
                index, ll = line.split(",", 1)
                ll, io_tags = ll.rsplit("[", 1)
                if io_tags.endswith('\n'):
                    io_tags = io_tags[:-2] # remove \n
                elif io_tags.endswith('"'):
                    io_tags = io_tags[:-1]
                io_tags = ast.literal_eval("["+io_tags)

                text, tokenized = ll.rsplit(',"[', 1)
                tokenized = tokenized[:-3].replace('""', '"')
                tokenized = ast.literal_eval("["+tokenized)

                if text.endswith(',"'):
                    text = text[:-2]
                if text.startswith('"'):
                    text = text[1:]
                if text.endswith('"'):
                    text = text[:-1]

                tuples.append((index, text, tokenized, io_tags))
            
print(len(tuples))

KeyboardInterrupt: 

In [None]:
df = pd.DataFrame(tuples, columns=["index","text", "tokenized", "io_tags"])
df.head()

In [27]:
def extract_causes_effects(row):
    """ Extracts causes and effects from the io_tags column and saves them in new columns """
    #print()
    #print(row["text"])
    #for tag, tok in zip(row["io_tags"], row["tokenized"]):
    #    print(tok, tag)
    if len(row["tokenized"]) != len(row["io_tags"]):
        print("ERROR: Tokenized and tags are not of same length!")
    
    causes = ""
    effects = ""
    last_tag_cause_index = -2
    last_tag_effect_index = -2
    for i, tag in enumerate(row.io_tags):
        if tag == "I-C":
            if i == last_tag_cause_index + 1: # True, if cause consist of several consecutive words
                causes += ","+row["tokenized"][i]
            else: # cause is a new separate cause
                causes += ";"+row["tokenized"][i]
            last_tag_cause_index = i # helps to test if causes and effects consist of CONSECUTIVE words

        elif tag == "I-E":
            if i == last_tag_effect_index + 1: # True, if cause consist of several consecutive words
                effects += ","+row["tokenized"][i]
            else: # cause is a new separate cause
                effects += ";"+row["tokenized"][i]
            last_tag_effect_index = i # helps to test if causes and effects consist of CONSECUTIVE words
    
    if causes.startswith(",") or causes.startswith(";"):
        causes = causes[1:]
    if effects.startswith(",") or effects.startswith(";"):
        effects = effects[1:]        
    #print("causes:", causes)
    #print("effects:", effects)
    
    

    
    return pd.Series([row.index, row.text, row.tokenized, row.io_tags, causes, effects]
                     , index=["index", "text", "tokenized", "io_tags", "causes", "effects"])

df_with_causes_effects = df.apply(extract_causes_effects, axis=1)

In [29]:
df_with_causes_effects.head()

Unnamed: 0,index,text,tokenized,io_tags,causes,effects
0,"Index(['index', 'text', 'tokenized', 'io_tags'...",@USER There will be no shortages of insulin or...,"[@USER, There, will, be, no, shortages, of, in...","[O, O, O, O, O, I-C, I-C, I-C, O, O, O, O, O, ...","shortages,of,insulin",worried
1,"Index(['index', 'text', 'tokenized', 'io_tags'...",Back to the #robotic life and I 'm good with t...,"[Back, to, the, #robotic, life, and, I, 'm, go...","[O, O, O, O, O, O, O, O, O, O, O, I-C, I-C, I-...","#libre,#freestylelibre,#abbott,#t1d,#diabetes,...",
2,"Index(['index', 'text', 'tokenized', 'io_tags'...",@USER I can't take such medications anymore ca...,"[@USER, I, can't, take, such, medications, any...","[O, O, O, O, O, O, O, O, O, O, O, I-C, I-C, O,...","diabetic,condition",
3,"Index(['index', 'text', 'tokenized', 'io_tags'...",If Brexit kills me because insulin is made in ...,"[If, Brexit, kills, me, because, insulin, is, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",,
4,"Index(['index', 'text', 'tokenized', 'io_tags'...","cool thing here , im only not diabetic anymore...","[cool, thing, here, ,, im, only, not, diabetic...","[O, O, O, O, O, O, I-C, I-C, O, O, O, O, O, O,...","not,diabetic",


In [31]:
###################### STORE RESULTS FOR EASIER ACCESS #####################
#del df_with_causes_effects["index"]
#df_with_causes_effects.to_parquet("result_cause_effect_prediction_all.parquet")

In [3]:
########### LOAD DATA ###################
df_with_causes_effects = pd.read_parquet("result_cause_effect_prediction_all.parquet")
df_with_causes_effects.shape

(265328, 5)

In [4]:
#################### Only consider tweets with both cause and effect ###############################
df_new = df_with_causes_effects[(df_with_causes_effects.causes.str.len() > 0 ) & (df_with_causes_effects.effects.str.len() > 0)]
print(df_new.shape)
df_new.head()

(96676, 5)


Unnamed: 0,text,tokenized,io_tags,causes,effects
0,@USER There will be no shortages of insulin or...,"[@USER, There, will, be, no, shortages, of, in...","[O, O, O, O, O, I-C, I-C, I-C, O, O, O, O, O, ...","shortages,of,insulin",worried
6,someone 's a type II diabetic and they basical...,"[someone, 's, a, type, II, diabetic, and, they...","[O, O, O, I-C, I-C, I-C, O, O, O, I-E, I-E, I-...","type,II,diabetic","can't,eat,food"
10,"I would use it to fix my truck , keep bills pa...","[I, would, use, it, to, fix, my, truck, ,, kee...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",diabetes,"deformed,my,feet"
11,@USER I want to get rid of the insulin too bec...,"[@USER, I, want, to, get, rid, of, the, insuli...","[O, O, O, O, I-E, I-E, I-E, I-E, I-E, O, O, O,...",stress,"get,rid,of,the,insulin;diet"
14,Just drunk that nasty ass shit for this glucos...,"[Just, drunk, that, nasty, ass, shit, for, thi...","[O, O, O, O, O, O, O, O, I-C, I-C, O, O, O, I-...","glucose,test","feel,sick,:nauseated_face:"


In [5]:
########## load FastText vectors #####################
from gensim.models.fasttext import FastText
model = FastText.load(path_we)

In [15]:
model["covid"]

  model["covid"]


array([-2.70179212e-02,  1.01174168e-01,  7.47505575e-02, -8.35368559e-02,
       -1.36312932e-01, -2.19370052e-01, -5.16138114e-02,  4.31144461e-02,
       -2.32713014e-01,  2.21285205e-02, -1.30236894e-01, -2.58882493e-01,
        1.80163637e-01,  2.54356116e-01,  3.56314853e-02,  6.18745722e-02,
        2.45554343e-01, -1.68844342e-01,  2.37276196e-01,  5.85153960e-02,
       -3.53905335e-02, -2.71545555e-02,  1.50960371e-01, -1.29856586e-01,
        1.39056653e-01,  3.26678082e-02,  2.47021884e-01,  8.69530812e-03,
       -3.26550543e-01,  2.25983009e-01, -1.79292001e-02,  1.03522889e-01,
        1.20839305e-01, -1.83511570e-01,  2.53076814e-02, -1.13044731e-01,
       -1.33140430e-01, -1.92382466e-02,  3.14717531e-01, -3.98586988e-02,
        7.51585364e-02, -1.54217675e-01, -1.17491163e-01,  1.55892566e-01,
       -1.21717587e-01, -2.24911854e-01,  7.40474239e-02,  1.53372005e-01,
       -4.94552888e-02, -3.61340567e-02, -4.31217030e-02,  2.08787676e-02,
       -1.79226518e-01, -

In [6]:
# Take random causes + effects to cluster manually
#df_new.sample(n=1000, random_state=0).effects.values.tolist()

In [43]:
# remove stopwords
# lowercase
manual_clusters = pd.read_excel("/home/adrian/workspace/causality/Causal-associations-diabetes-twitter/data/Causes_effects_clusters.xlsx")
manual_clusters = manual_clusters[["Parent name", "Cluster name", "Synonyms"]].dropna(subset=["Cluster name"])
manual_clusters.head(20)

Unnamed: 0,Parent name,Cluster name,Synonyms
0,Diabetes,Diabetes,"diabetic, #diabetic, #diabetes"
1,Diabetes,reverse diabetes,
2,Diabetes,T1D,"Type 1 diabetes, Type 1, #type1, #type1diabetes"
3,Diabetes,T2D,"Type 2 diabetes, Type 2, #type, #type2diabetes"
4,Diabetes,diagnosis,
6,Insulin,insulin,
7,Insulin,rationing insulin,"shortage insulin, denying insulin, lack insulin"
8,Insulin,Unable to afford insulin,"can't afford insulin, no access to affordable ..."
9,Insulin,affordable insulin,afford insulin
10,Insulin,access insulin,


In [None]:
def addCenterVector(row):
    """ calculates mean (center) vector cluster name and its synonyms """

    vectors = [    ]
    cause_vector = np.array([model[word] for word in filtered_cause]).mean(axis=0)


manual_clusters_vec = manual_clusters.apply(addCenterVector, axis=1)

In [24]:
for i, row in df_new[0:1].iterrows():
    print("\nrow causes:", row["causes"])
    #print("\nrow effects:", row["effects"])
    causes = row["causes"].split(";")
    effects = row["effects"].split(";")
    print("\tcauses:", causes)
    #print("\effects:", effects)
    for cause in causes:
        filtered_cause = [word for word in cause.split(",") if word not in stopwords.words('english')]
        try:
            print("model:", model[filtered_cause[0]].shape)
            cause_vector = np.array([model[word] for word in filtered_cause]).mean(axis=0)
            print(cause_vector.shape)
        except:
            print("xx")
        print("\t\t", filtered_cause)


row causes: shortages,of,insulin
	causes: ['shortages,of,insulin']
model: (300,)
()
		 ['shortages', 'insulin']


  print("model:", model[filtered_cause[0]].shape)
  cause_vector = np.mean([model[word] for word in filtered_cause])


In [25]:
filtered_cause

['shortages', 'insulin']

  np.array([model[word] for word in filtered_cause]).mean(axis=0)


array([ 0.29214457,  0.38495776,  0.16574703,  0.06405926, -0.11539381,
       -0.03385283,  0.21286406, -0.3960556 ,  0.3066109 , -0.08814646,
       -0.35494047, -0.33379677, -0.17764051, -0.0933546 ,  0.36670458,
        0.16885853,  0.29699555, -0.11082231, -0.2744066 ,  0.22896041,
        0.14868608,  0.05572787,  0.5637683 , -0.10456495,  0.05183126,
        0.469559  , -0.0110492 , -0.04377677, -0.3260107 ,  0.04131457,
        0.02726132,  0.41873205, -0.24648839, -0.24606623, -0.30768356,
       -0.47113594, -0.14119108, -0.043651  , -0.27086866, -0.19397582,
        0.10647982,  0.32150462,  0.10593531,  0.54683185,  0.20449293,
        0.40418303, -0.30780703,  0.5326033 , -0.11721453, -0.17469238,
        0.26809037,  0.10372459, -0.29117787,  0.01172423, -0.3602302 ,
        0.67002994,  0.5728597 ,  0.07263549,  0.04857959,  0.08066833,
       -0.0438829 , -0.03891828, -0.01713083, -0.10556807,  0.17395777,
       -0.08545314,  0.39364538, -0.3122859 ,  0.4729639 ,  0.37