#### Apply cause-effect model (named-entity recognition task) on all causal sentences

In [1]:
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import os
import torch
from nltk.tokenize import TweetTokenizer
from emoji import demojize
import re
import glob
import joblib


bert_model = "vinai/bertweet-base" # "bert-large-uncased"; "roberta-large"


##### DATA TO LOAD ######
dataDir = "/home/adrian/workspace/causality/Causal-associations-diabetes-twitter/results_causal_sentences"
csv_files = glob.glob(os.path.join(dataDir, "*.csv"))

result_dir = "results_cause_effect_emotional_tweets"

# Always predicts batch_size many tweets and stores result in result_dir
batch_size =  10000 #65552 #1024   #512 # # 65552 #32776 #16388 #8194 #2048

cause_effect_model = joblib.load("./model_causal-sentences/bertEmbeddings_simpleCRF.pkl")

In [2]:
tokenizer = AutoTokenizer.from_pretrained(bert_model, padding = "max_length", truncation = True, max_length = 60, return_offsets_mapping=True )
model = AutoModel.from_pretrained(bert_model)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
## load data

tuples = []
for file in csv_files:
    with open(file, "r") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            if i == 0: # header
                if line.endswith("\n"):
                    line = line[:-2]
                header = line.split(",")[1:]
            else:
                index, ll = line.split(",", 1)
                ll, prob = ll.rsplit(",", 1)
                if prob.endswith("\n"):
                    prob = prob[:-2]
                text, pred = ll.rsplit(",", 1)
                if text.startswith('"') and text.endswith('"'):
                    text = text[1:-1]
                    #text = text[]
                    #print(text)

                tuples.append((text, pred, prob))

print("N tweets from file:", len(tuples))

df = pd.DataFrame(tuples, columns=["text", "pred", "proba"])
df.pred = pd.to_numeric(df.pred)
df.proba = pd.to_numeric(df.proba)
print(df.pred.value_counts())
df.head()

N tweets from file: 1024
1    564
0    460
Name: pred, dtype: int64


Unnamed: 0,text,pred,proba
0,Hypo in the Boticelli rooms at the @USER and o...,1,0.66097
1,Life & diabetes in a jar altogether !,0,0.241259
2,It 's always worrying as a diabetic when your ...,0,0.355484
3,Mmm I love bacon and hardly any carbs for brea...,1,0.553529
4,* frantically applies for jobs so that i can a...,0,0.492331


In [23]:
########### take only causal sentences & tokenize ###########
df_causal = df[df["pred"] == 1].sample(n=100, random_state=0)  # SAMPLE ONLY FOR TESTING
df_causal["tokenized"] = df_causal["text"].map(lambda tweet: normalizeTweet(tweet).split(" "))
print("N causal sentences:", df_causal.shape[0])
df_causal.head()

N causal sentences: 100


Unnamed: 0,text,pred,proba,tokenized
468,To help change the way the NHS treats #type2di...,1,0.685685,"[To, help, change, the, way, the, NHS, treats,..."
602,"Okay but on the real , I almost lost my life T...",1,0.696993,"[Okay, but, on, the, real, ,, I, almost, lost,..."
388,"@USER Mental Health , keep my diabetes in chec...",1,0.70401,"[@USER, Mental, Health, ,, keep, my, diabetes,..."
607,"I 'm a 58 yr old man , with diabetes and a str...",1,0.52821,"[I, ', m, a, 58, yr, old, man, ,, with, diabet..."
767,Update ( since I made a surprising number of y...,1,0.5793,"[Update, (, since, I, made, a, surprising, num..."


In [25]:
#df_causal.to_parquet("data/causal_emotional_sentences.parquet")

In [26]:
df_causal = pd.read_parquet("data/causal_emotional_sentences.parquet")
df_causal.shape

(281664, 4)

In [16]:
########################### Check if cuda available ############################
print("Cuda available: ", torch.cuda.is_available())
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
print("Selected {} for this notebook".format(device))

Cuda available:  False
Selected cpu for this notebook


In [3]:
################## GET BERT EMBEDDINGS + OTHER FEATURES ###############################""
def get_word_embeddings(sentence, sentence_tokenised):
    """ Get word embeddings for each word in sentence """
    ids = tokenizer.encode(sentence) 
    ids_tensor = torch.tensor(ids).unsqueeze(0) # Batch size: 1
    word_vectors = model(ids_tensor)[0].squeeze()
    
    word_embeddings_all = []
    for word in sentence_tokenised: # average word embeddings of sub-tokens 
        word_encoded = tokenizer.encode(word)
        word_encoded.remove(tokenizer.cls_token_id)
        word_encoded.remove(tokenizer.sep_token_id)
        
        try:
            word_indices = [ids.index(encoded_id) for encoded_id in word_encoded ] 
        except ValueError: 
            print("sentence:", sentence)
            print("tokenized:", sentence_tokenised)
            print("ids:", ids)
            print()
            print("word:", word)
            print("word_encoded:", word_encoded)
            #print("encoded_id:", encoded_id)
            print("\n\n")
        
        # average all sub_word vectors of word
        word_vector = torch.zeros((768))
        for sub_token_id in word_indices:
            word_vector += word_vectors[sub_token_id]
        word_vector /= len(word_indices)
        
        word_embeddings_all.append(word_vector)
        
    return word_embeddings_all

def word2features(word, i, wordembedding):

    features = {
        'word.lower()': word.lower(),
        'word.isupper()': word.isupper(),
        'word.isdigit()': word.isdigit(),
        'wordlength': len(word),
        'wordinitialcap': word[0].isupper(),
        'wordmixedcap': len([x for x in word[1:] if x.isupper()])>0,
        'wordallcap': len([x for x in word if x.isupper()])==len(word),
        'distfromsentbegin': i
    }

    # here you add 300 (fastText) / 768 (Bert) features (one for each vector component)
    for iv,value in enumerate(wordembedding):
        features['v{}'.format(iv)]=value
    
    return features



def sent2features(sentence, tokenized):
    word_vectors = get_word_embeddings(sentence, tokenized)
    return [word2features(tokenized[i], i, word_vectors[i]) for i in range(len(tokenized))]

X_train = [sent2features(sentence, tokenized) for sentence, tokenized in zip(df_causal.text.values.tolist(), df_causal.tokenized.values.tolist())]
print("X_train:", len(X_train), len(X_train[0]))



In [25]:
for i in range(0, df_causal.shape[0], batch_size):
    print(i)
    tweet_subset = df_causal[i:i+batch_size]
    X = [sent2features(sentence, tokenized) 
         for sentence, tokenized in zip(tweet_subset.text.values.tolist()
                                    , tweet_subset.tokenized.values.tolist())]
    
    predictions = cause_effect_model.predict(X)
    cause_effect_DF = pd.DataFrame({"text":tweet_subset.text,
                             "tokenized": tweet_subset.tokenized,
                             "predictions": predictions})
cause_effect_DF

0
sentence: @USER Mental Health , keep my diabetes in check , but also to meet some frankly awesome people along the way , and I love watching other runners achieve their goals , it 's very fulfilling .
tokenized: ['@USER', 'Mental', 'Health', ',', 'keep', 'my', 'diabetes', 'in', 'check', ',', 'but', 'also', 'to', 'meet', 'some', 'frankly', 'awesome', 'people', 'along', 'the', 'way', ',', 'and', 'I', 'love', 'watching', 'other', 'runners', 'achieve', 'their', 'goals', ',', 'it', "'", 's', 'very', 'fulfilling', '.']
ids: [0, 5, 13150, 1960, 7, 264, 23, 15015, 16, 560, 7, 42, 237, 9, 681, 109, 22362, 742, 83, 1041, 6, 154, 7, 13, 8, 71, 406, 185, 17034, 5708, 130, 2032, 7, 18, 20, 249, 29366, 4, 2]

word: '
word_encoded: [69]



sentence: @USER Mental Health , keep my diabetes in check , but also to meet some frankly awesome people along the way , and I love watching other runners achieve their goals , it 's very fulfilling .
tokenized: ['@USER', 'Mental', 'Health', ',', 'keep', 'my', 'd

sentence: Do they teach diabetes specialists in medical school that they 're professionally mandated to make all their patients feel like shit or
tokenized: ['Do', 'they', 'teach', 'diabetes', 'specialists', 'in', 'medical', 'school', 'that', 'they', "'", 're', 'professionally', 'mandated', 'to', 'make', 'all', 'their', 'patients', 'feel', 'like', 'shit', 'or']
ids: [0, 172, 59, 2592, 15015, 38263, 16, 3541, 230, 25, 59, 81, 30926, 54243, 9, 115, 48, 130, 5546, 174, 43, 145, 72, 2]

word: '
word_encoded: [69]



sentence: Do they teach diabetes specialists in medical school that they 're professionally mandated to make all their patients feel like shit or
tokenized: ['Do', 'they', 'teach', 'diabetes', 'specialists', 'in', 'medical', 'school', 'that', 'they', "'", 're', 'professionally', 'mandated', 'to', 'make', 'all', 'their', 'patients', 'feel', 'like', 'shit', 'or']
ids: [0, 172, 59, 2592, 15015, 38263, 16, 3541, 230, 25, 59, 81, 30926, 54243, 9, 115, 48, 130, 5546, 174, 43, 145, 72

sentence: He is suffering from diabetes and every year claims lakhs to get treated in Jibdal 's but here he is enjoying Ice cream .
tokenized: ['He', 'is', 'suffering', 'from', 'diabetes', 'and', 'every', 'year', 'claims', 'lakhs', 'to', 'get', 'treated', 'in', 'Jibdal', "'", 's', 'but', 'here', 'he', 'is', 'enjoying', 'Ice', 'cream', '.']
ids: [0, 162, 17, 5812, 53, 15015, 13, 234, 189, 3903, 34244, 5983, 9, 51, 4114, 16, 611, 14379, 27789, 20, 42, 137, 58, 17, 3359, 4746, 2074, 4, 2]

word: '
word_encoded: [69]



sentence: He is suffering from diabetes and every year claims lakhs to get treated in Jibdal 's but here he is enjoying Ice cream .
tokenized: ['He', 'is', 'suffering', 'from', 'diabetes', 'and', 'every', 'year', 'claims', 'lakhs', 'to', 'get', 'treated', 'in', 'Jibdal', "'", 's', 'but', 'here', 'he', 'is', 'enjoying', 'Ice', 'cream', '.']
ids: [0, 162, 17, 5812, 53, 15015, 13, 234, 189, 3903, 34244, 5983, 9, 51, 4114, 16, 611, 14379, 27789, 20, 42, 137, 58, 17, 3359, 4746,

sentence: @USER I 'm on 600mg a day and it helps with the nerve pain from diabetes in my feet and the nerve pain from 3 herniated disks in my back .
tokenized: ['@USER', 'I', "'", 'm', 'on', '600mg', 'a', 'day', 'and', 'it', 'helps', 'with', 'the', 'nerve', 'pain', 'from', 'diabetes', 'in', 'my', 'feet', 'and', 'the', 'nerve', 'pain', 'from', '3', 'herniated', 'disks', 'in', 'my', 'back', '.']
ids: [0, 5, 8, 40, 24, 13896, 14551, 11, 93, 13, 18, 2820, 30, 6, 9573, 1182, 53, 15015, 16, 23, 1802, 13, 6, 9573, 1182, 53, 163, 3506, 2115, 1269, 1215, 1799, 16, 23, 107, 4, 2]

word: '
word_encoded: [69]



sentence: @USER I 'm on 600mg a day and it helps with the nerve pain from diabetes in my feet and the nerve pain from 3 herniated disks in my back .
tokenized: ['@USER', 'I', "'", 'm', 'on', '600mg', 'a', 'day', 'and', 'it', 'helps', 'with', 'the', 'nerve', 'pain', 'from', 'diabetes', 'in', 'my', 'feet', 'and', 'the', 'nerve', 'pain', 'from', '3', 'herniated', 'disks', 'in', 'my', 'back', 

sentence: I 'd live on bread and water before I 'd let my horrible neighbour die for lack of insulin .
tokenized: ['I', "'", 'd', 'live', 'on', 'bread', 'and', 'water', 'before', 'I', "'", 'd', 'let', 'my', 'horrible', 'neighbour', 'die', 'for', 'lack', 'of', 'insulin', '.']
ids: [0, 8, 224, 294, 24, 3878, 13, 791, 213, 8, 224, 197, 23, 2638, 19115, 668, 19, 2717, 15, 46884, 4, 2]

word: '
word_encoded: [69]



sentence: I 'd live on bread and water before I 'd let my horrible neighbour die for lack of insulin .
tokenized: ['I', "'", 'd', 'live', 'on', 'bread', 'and', 'water', 'before', 'I', "'", 'd', 'let', 'my', 'horrible', 'neighbour', 'die', 'for', 'lack', 'of', 'insulin', '.']
ids: [0, 8, 224, 294, 24, 3878, 13, 791, 213, 8, 224, 197, 23, 2638, 19115, 668, 19, 2717, 15, 46884, 4, 2]

word: d
word_encoded: [614]



sentence: I 'd live on bread and water before I 'd let my horrible neighbour die for lack of insulin .
tokenized: ['I', "'", 'd', 'live', 'on', 'bread', 'and', 'water', 

sentence: It 's hard not to be frustrated when people with diabetes are dying because the price of insulin has increased 1100 % , way higher than inflation , and they can't afford it .
tokenized: ['It', "'", 's', 'hard', 'not', 'to', 'be', 'frustrated', 'when', 'people', 'with', 'diabetes', 'are', 'dying', 'because', 'the', 'price', 'of', 'insulin', 'has', 'increased', '1100', '%', ',', 'way', 'higher', 'than', 'inflation', ',', 'and', 'they', "can't", 'afford', 'it', '.']
ids: [0, 76, 20, 301, 46, 9, 31, 9904, 64, 83, 30, 15015, 41, 2000, 153, 6, 1727, 15, 46884, 90, 6638, 31787, 221, 7, 154, 2663, 149, 19896, 7, 13, 59, 129, 3971, 18, 4, 2]

word: '
word_encoded: [69]



sentence: It 's hard not to be frustrated when people with diabetes are dying because the price of insulin has increased 1100 % , way higher than inflation , and they can't afford it .
tokenized: ['It', "'", 's', 'hard', 'not', 'to', 'be', 'frustrated', 'when', 'people', 'with', 'diabetes', 'are', 'dying', 'because',

sentence: A month 's supply of insulin would cost me over $ 900 / month without my insurance even though it costs around $ 100 to make a YEAR'S SUPPLY FOR ONE PERSON !
tokenized: ['A', 'month', "'", 's', 'supply', 'of', 'insulin', 'would', 'cost', 'me', 'over', '$', '900', '/', 'month', 'without', 'my', 'insurance', 'even', 'though', 'it', 'costs', 'around', '$', '100', 'to', 'make', 'a', "YEAR'S", 'SUPPLY', 'FOR', 'ONE', 'PERSON', '!']
ids: [0, 104, 795, 20, 5788, 15, 46884, 86, 2187, 27, 141, 144, 11405, 75, 795, 384, 23, 4379, 132, 298, 18, 5126, 284, 144, 550, 9, 115, 11, 624, 11553, 3746, 28209, 39082, 829, 1546, 13420, 12, 2]

word: '
word_encoded: [69]



sentence: A month 's supply of insulin would cost me over $ 900 / month without my insurance even though it costs around $ 100 to make a YEAR'S SUPPLY FOR ONE PERSON !
tokenized: ['A', 'month', "'", 's', 'supply', 'of', 'insulin', 'would', 'cost', 'me', 'over', '$', '900', '/', 'month', 'without', 'my', 'insurance', 'even', 'th

sentence: i know there 's a lot of misinformation about diabetes out there , so if you 're ever curious , i do n't mind talking about it :smiling_face_with_smiling_eyes:
tokenized: ['i', 'know', 'there', "'", 's', 'a', 'lot', 'of', 'misinformation', 'about', 'diabetes', 'out', 'there', ',', 'so', 'if', 'you', "'", 're', 'ever', 'curious', ',', 'i', 'do', "n't", 'mind', 'talking', 'about', 'it', ':', 'smiling_face_with_smiling_eyes', ':']
ids: [0, 37, 68, 99, 20, 11, 318, 15, 33351, 62, 15015, 50, 99, 7, 39, 65, 14, 81, 179, 5002, 7, 37, 32, 29, 410, 401, 62, 18, 518, 2]

word: '
word_encoded: [69]



sentence: i know there 's a lot of misinformation about diabetes out there , so if you 're ever curious , i do n't mind talking about it :smiling_face_with_smiling_eyes:
tokenized: ['i', 'know', 'there', "'", 's', 'a', 'lot', 'of', 'misinformation', 'about', 'diabetes', 'out', 'there', ',', 'so', 'if', 'you', "'", 're', 'ever', 'curious', ',', 'i', 'do', "n't", 'mind', 'talking', 'about', 

Unnamed: 0,text,tokenized,predictions
468,To help change the way the NHS treats #type2di...,"[To, help, change, the, way, the, NHS, treats,...","[O, O, O, O, O, O, I-C, I-C, I-C, O, O, O, O, ..."
602,"Okay but on the real , I almost lost my life T...","[Okay, but, on, the, real, ,, I, almost, lost,...","[O, O, O, O, O, O, O, O, I-E, I-E, I-E, O, O, ..."
388,"@USER Mental Health , keep my diabetes in chec...","[@USER, Mental, Health, ,, keep, my, diabetes,...","[O, O, O, O, O, O, I-C, O, O, O, O, O, O, O, O..."
607,"I 'm a 58 yr old man , with diabetes and a str...","[I, ', m, a, 58, yr, old, man, ,, with, diabet...","[O, O, O, O, O, O, O, O, O, O, I-C, O, O, O, I..."
767,Update ( since I made a surprising number of y...,"[Update, (, since, I, made, a, surprising, num...","[O, O, O, O, O, O, O, O, O, O, I-E, O, O, O, O..."
...,...,...,...
415,@USER Oh I am so sorry :( My father also has m...,"[@USER, Oh, I, am, so, sorry, :(, My, father, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
200,@USER Well its meant to be good for you but I ...,"[@USER, Well, its, meant, to, be, good, for, y...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
710,Mad mfs got AIDS in ATL every time I go to the...,"[Mad, mfs, got, AIDS, in, ATL, every, time, I,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
361,@USER Bet he would have got more if he was a m...,"[@USER, Bet, he, would, have, got, more, if, h...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [26]:
cause_effect_DF.to_csv("/home/adrian/Downloads/cause_effect_preds_part_{}.csv".format(i), sep=";")


### Read some examples

In [28]:
aaa = pd.read_csv("/home/adrian/Downloads/cause_effect_preds_part_0.csv", sep=";")
aaa.head()

Unnamed: 0.1,Unnamed: 0,text,tokenized,predictions
0,468,To help change the way the NHS treats #type2di...,"['To', 'help', 'change', 'the', 'way', 'the', ...","['O', 'O', 'O', 'O', 'O', 'O', 'I-C', 'I-C', '..."
1,602,"Okay but on the real , I almost lost my life T...","['Okay', 'but', 'on', 'the', 'real', ',', 'I',...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-E'..."
2,388,"@USER Mental Health , keep my diabetes in chec...","['@USER', 'Mental', 'Health', ',', 'keep', 'my...","['O', 'O', 'O', 'O', 'O', 'O', 'I-C', 'O', 'O'..."
3,607,"I 'm a 58 yr old man , with diabetes and a str...","['I', ""'"", 'm', 'a', '58', 'yr', 'old', 'man',...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
4,767,Update ( since I made a surprising number of y...,"['Update', '(', 'since', 'I', 'made', 'a', 'su...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [30]:
bbb = pd.read_csv("/home/adrian/Downloads/cause_effect_predictions_part_0.csv", sep=";")
bbb.head()

for i,row in bbb[0:50].iterrows():
    print(row["text"])
    print(row["tokenized"])
    print(row["predictions"])
    print()

Hypo in the Boticelli rooms at the @USER and one of the staff members offered me a sweet cause he knew my pain #type1diabetes
['Hypo', 'in', 'the', 'Boticelli', 'rooms', 'at', 'the', '@USER', 'and', 'one', 'of', 'the', 'staff', 'members', 'offered', 'me', 'a', 'sweet', 'cause', 'he', 'knew', 'my', 'pain', '#type1diabetes']
['I-E', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-C', 'I-C']

Mmm I love bacon and hardly any carbs for breakfast :raising_hands: :light_skin_tone: :smiling_face_with_sunglasses: #breakfastoftheday #diabetes #diabetic ...
['Mmm', 'I', 'love', 'bacon', 'and', 'hardly', 'any', 'carbs', 'for', 'breakfast', ':raising_hands:', ':light_skin_tone:', ':smiling_face_with_sunglasses:', '#breakfastoftheday', '#diabetes', '#diabetic', '...']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

@USER "" That does n't mean we should take care of the person who sits at home , eats po

In [21]:
for tokens, predicts in zip(df_causal.tokenized, predictions):
    print("\n")
    for token, predic in zip(tokens, predicts):
        print(token, predic)



@USER O
I O
could O
really O
use O
help O
my O
mom O
has O
Parkinsons O
disease O
and O
dementia O
and O
diabetes I-C
she O
just O
suffered O
a O
long O
term O
stroke O
and O
my O
grandma O
has O
Lamonia O
bad O
i O
can't I-C
afford I-C
thier I-C
medicine I-C
we O
do O
n't O
have O
no O
food O
either O
please O
anything O
helps O
on O
behalf O
of O
them O
anything O
helps O
$ O
IIKINGGPOPPII O


So O
now O
that O
my O
grandpa O
has O
diabetes I-C
we O
are O
buying O
him O
all O
his O
medications O
& O
I O
' O
m O
just O
so O
in O
shock O
and O
disappointed O
how O
much O
it O
costs O
. O


@USER O
@USER O
He O
could O
also O
slip O
into O
diabetic I-C
ketoacidosis I-C
which O
brings O
on O
tremors O
and O
even O
rage O
also O
potentially O
death O
. O


I O
hate I-E
hormones I-E
: O
face_with_steam_from_nose O
: O
#pms O
#type1diabetes O
#hormones O
#justgoaway O
HTTPURL O


Turns O
out O
he O
had O
COPD O
, O
that O
plus O
his O
diabetes I-C
caused O
him O
to O
have O
a O
cardiac I-

#MS I-E
, I-E
#RA I-E
, I-E
#Fibromyalgia I-E
, I-E
#diabetes I-E
, O
& O
#COPD O
. O


“ O
stabbing O
pains O
” O
aka O
diabetic I-C
neuropathy I-C
, O
he O
will O
most O
likely I-E
never I-E
feel I-E
the I-E
same I-E
way I-E
in O
his O
hands O
again O
! O


I O
do O
n't O
think O
I O
' O
m O
a O
bad O
person O
, O
but O
I O
did O
pee O
in O
a O
diabetic I-C
guys O
bird O
bath O
once O
. O


He O
died I-E
in O
his O
sleep O
, O
coroner O
reported O
" O
" O
due O
to O
complications I-C
with I-C
diabetes I-C
. O


@USER O
@USER O
@USER O
It O
saddens O
me O
to O
think O
there O
are O
diabetics I-C
rationing I-C
their I-C
insulin I-C
, O
forgoing O
meals O
as O
a O
form O
of O
blood I-E
sugar I-E
management I-E
and O
being O
made O
to O
feel I-E
guilt I-E
over O
their O
condition O
. O


I O
was O
prediabetic O
all O
through O
high O
school O
all O
through O
the O
Navy O
and O
during O
my O
pregnancies O
with O
all O
4 O
of O
my O
children O
. O


@USER O
@USER O
Great O
idea O
my O
elde

## Check performances 

In [7]:
##### DATA TO LOAD ######
from ast import literal_eval

dataPath = "data/cause_effect_sentences_with_IO_tags.csv"

data = pd.read_csv(dataPath, sep=";", converters={"tokenized":literal_eval, "bio_tags":literal_eval})
data = data[(data["Cause"].notnull()) & (data["Effect"].notnull())].sample(n=200)

print(data.shape)

data.head()

(200, 7)


Unnamed: 0,sentence,Intent,Cause,Effect,Causal association,tokenized,bio_tags
1201,I forgot to change my pump this morning and I ...,,forgot to change my pump,high blood sugar,1.0,"[I, forgot, to, change, my, pump, this, mornin...","[O, I-C, I-C, I-C, I-C, I-C, O, O, O, O, O, O,..."
2007,I got prescribed contrave by my doctor and its...,mC;mE,contrave;metformin,increased my metabolism;shed 20 pounds,1.0,"[I, got, prescribed, contrave, by, my, doctor,...","[O, O, O, I-C, O, O, O, O, O, O, O, O, O, I-C,..."
2331,@USER I ' m sure those who can't afford their ...,,can't afford their insulin,ration it,1.0,"[@USER, I, ', m, sure, those, who, can't, affo...","[O, O, O, O, O, O, O, I-C, I-C, I-C, I-C, O, O..."
1394,"I look at it like this , all that sugar leads ...",mE,sugar,diabetes;obesity;dental issues,1.0,"[I, look, at, it, like, this, ,, all, that, su...","[O, O, O, O, O, O, O, O, O, I-C, O, O, I-E, O,..."
2284,She is 8 she went blind last summer bc of diab...,,diabetes,blind,1.0,"[She, is, 8, she, went, blind, last, summer, b...","[O, O, O, O, O, I-E, O, O, O, O, I-C, O, O, O,..."


In [8]:
X_data = [sent2features(sentence, tokenized) for sentence, tokenized in zip(data.sentence.values.tolist(), data.tokenized.values.tolist())]


In [10]:
cause_effect_model = joblib.load("./model-causal-span/bertEmbeddings_simpleCRF.pkl")
predictions = cause_effect_model.predict(X_data)

In [12]:
for tokens, true_labels, predicts in zip(data.tokenized, data.bio_tags, predictions):
    print("\n")
    for token, true_label, predic in zip(tokens, true_labels, predicts):
        print(token, "true:", true_label, "predic:", predic)



I true: O predic: O
forgot true: I-C predic: O
to true: I-C predic: O
change true: I-C predic: O
my true: I-C predic: O
pump true: I-C predic: O
this true: O predic: O
morning true: O predic: O
and true: O predic: O
I true: O predic: O
did true: O predic: O
n't true: O predic: O
grab true: O predic: O
any true: O predic: O
insulin true: O predic: O
so true: O predic: O
now true: O predic: O
I true: O predic: O
have true: O predic: O
10 true: O predic: O
units true: O predic: O
to true: O predic: O
get true: O predic: O
me true: O predic: O
through true: O predic: O
the true: O predic: O
day true: O predic: O
when true: O predic: O
I true: O predic: O
usually true: O predic: O
go true: O predic: O
through true: O predic: O
like true: O predic: O
50 true: O predic: O
so true: O predic: O
I true: O predic: O
'm true: O predic: O
going true: O predic: O
to true: O predic: O
have true: O predic: O
high true: I-E predic: I-E
blood true: I-E predic: I-E
sugar true: I-E predic: I-E
all true:

and true: O predic: O
diabetes true: I-C predic: I-C
. true: O predic: O


USER true: O predic: O
People true: O predic: O
die true: I-E predic: I-E
in true: O predic: O
the true: O predic: O
USA true: O predic: O
because true: O predic: O
they true: O predic: O
can't true: I-C predic: I-C
get true: I-C predic: I-C
insulin true: I-C predic: I-C
. true: O predic: O


I true: O predic: O
really true: O predic: O
wish true: O predic: O
Diabetes true: I-C predic: I-C
Type true: I-C predic: I-C
1 true: I-C predic: I-C
insulin true: I-C predic: I-C
and true: I-C predic: I-C
equipment true: I-C predic: I-C
was true: I-C predic: I-C
less true: I-C predic: I-C
pricey true: I-C predic: I-C
, true: O predic: O
im true: O predic: O
worried true: I-E predic: I-E
for true: O predic: O
my true: O predic: O
sister true: O predic: O
x true: O predic: O
( true: O predic: O


@USER true: O predic: O
I true: O predic: O
wish true: O predic: O
I true: O predic: O
could true: O predic: O
help true: O predic

on true: O predic: O
the true: O predic: O
left true: O predic: O
spoke true: O predic: O
about true: O predic: O
how true: O predic: O
difficult true: I-E predic: I-E
living true: I-E predic: I-E
with true: O predic: O
#T1D true: I-C predic: I-C
is true: O predic: O
. true: O predic: O


It true: O predic: O
's true: O predic: O
world true: O predic: O
diabetes true: O predic: O
day true: O predic: O
, true: O predic: O
so true: O predic: O
, true: O predic: O
being true: O predic: O
the true: O predic: O
diabetic true: I-C predic: I-C
that true: O predic: O
i true: O predic: O
am true: O predic: O
, true: O predic: O
it true: O predic: O
's true: O predic: O
all true: O predic: O
about true: O predic: O
me true: O predic: O
and true: O predic: O
my true: O predic: O
insulin true: I-E predic: I-C
pump true: I-E predic: I-C
today true: O predic: O


If true: O predic: O
you true: O predic: O
drink true: I-C predic: I-C
wine true: I-C predic: I-C
in true: O predic: O
the true: O predic:

to true: O predic: O
get true: O predic: O
some true: O predic: O
insulin true: I-E predic: O
off true: O predic: O
a true: O predic: O
stranger true: O predic: O
Suspect true: O predic: O
I true: O predic: O
have true: O predic: O
a true: O predic: O
dodgy true: O predic: O
batch true: O predic: O
& true: O predic: O
urgent true: O predic: O
pharmacy true: O predic: O
have true: O predic: O
the true: O predic: O
same true: O predic: O
batch true: O predic: O
as true: O predic: O
me true: O predic: O
Have true: O predic: O
done true: O predic: O
everything true: O predic: O
possible true: O predic: O
to true: O predic: O
bring true: I-C predic: O
BGL true: I-C predic: I-C
down true: I-C predic: I-C
new true: O predic: O
everything true: O predic: O
over true: O predic: O
last true: O predic: O
24h true: O predic: O
without true: O predic: O
success true: O predic: O
TG true: O predic: O
I true: O predic: O
' true: O predic: O
m true: O predic: O
not true: O predic: O
susceptible true: 

. true: O predic: O


@USER true: O predic: O
after true: O predic: O
my true: O predic: O
mom true: O predic: O
legs true: I-E predic: I-E
were true: I-E predic: I-E
cut true: I-E predic: I-E
off true: I-E predic: I-E
because true: O predic: O
of true: O predic: O
diabetes true: I-C predic: I-C
, true: O predic: O
she true: O predic: O
was true: O predic: O
always true: O predic: O
crying true: I-E predic: I-E
but true: O predic: O
because true: O predic: O
of true: O predic: O
you true: O predic: O
jakezyrus true: O predic: O
and true: O predic: O
your true: O predic: O
amazing true: O predic: O
voice true: O predic: O
she true: O predic: O
' true: O predic: O
s true: O predic: O
still true: O predic: O
fighting true: O predic: O
. true: O predic: O


USER true: O predic: O
USER true: O predic: O
The true: O predic: O
trouble true: O predic: O
is true: O predic: O
, true: O predic: O
there true: O predic: O
are true: O predic: O
people true: O predic: O
who true: O predic: O
can true

Just true: O predic: O
waiting true: O predic: O
for true: O predic: O
my true: O predic: O
depression true: I-E predic: I-C
type true: I-C predic: I-C
1 true: I-C predic: I-C
diabetes true: I-C predic: I-C
and true: O predic: O
a true: O predic: O
care true: O predic: O
to true: O predic: O
give true: O predic: O
like true: O predic: O
.. true: O predic: O
I true: O predic: O
seriously true: O predic: O
want true: O predic: O
to true: O predic: O
hide true: O predic: O
in true: O predic: O
shame true: O predic: O
right true: O predic: O
now true: O predic: O
. true: O predic: O


my true: O predic: O
family true: I-C predic: O
' true: O predic: O
s true: O predic: O
heirloom true: O predic: O
is true: O predic: O
diabetes true: I-E predic: I-C
and true: O predic: O
anger true: O predic: O
issues true: O predic: O
/: true: O predic: O


I true: O predic: O
have true: O predic: O
a true: O predic: O
glucose true: I-C predic: I-C
test true: I-C predic: I-C
in true: O predic: O
a true: O 

me true: O predic: O
healthy true: O predic: O
and true: O predic: O
ALIVE true: I-E predic: I-E
. true: O predic: O


My true: O predic: O
nephew true: O predic: O
was true: O predic: O
diagnosed true: O predic: O
with true: O predic: O
type true: I-C predic: I-C
1 true: I-C predic: I-C
diabetes true: I-C predic: I-C
at true: O predic: O
age true: O predic: O
3 true: O predic: O
, true: O predic: O
and true: O predic: O
we true: O predic: O
had true: O predic: O
to true: O predic: O
crowdsource true: O predic: O
funds true: O predic: O
to true: O predic: O
pay true: O predic: I-E
for true: O predic: I-E
his true: O predic: I-E
insulin true: I-E predic: I-E
pump true: I-E predic: I-E
, true: O predic: O
even true: O predic: O
with true: O predic: O
medical true: O predic: O
insurance true: O predic: O
. true: O predic: O


I true: O predic: O
can't true: O predic: O
help true: O predic: O
that true: O predic: O
I true: O predic: O
was true: O predic: O
diagnosed true: O predic: O
with 

giving true: O predic: O
myself true: O predic: O
.. true: O predic: O
I true: O predic: O
do true: O predic: O
n't true: O predic: O
know true: O predic: O
how true: O predic: O
my true: O predic: O
#diabetic true: O predic: O
sister true: O predic: O
does true: O predic: O
it true: O predic: O
day true: O predic: O
in true: O predic: O
day true: O predic: O
out true: O predic: O
:weary_face: true: O predic: O


Insulin true: I-C predic: I-C
is true: O predic: O
just true: O predic: O
the true: O predic: O
tip true: O predic: O
, true: O predic: O
you true: O predic: O
need true: O predic: O
meter true: I-C predic: O
, true: O predic: O
batteries true: I-C predic: I-C
, true: O predic: O
lancets true: I-C predic: O
, true: O predic: O
test true: I-C predic: I-C
strips true: I-C predic: I-C
, true: O predic: O
alcohol true: I-C predic: I-C
wipe true: I-C predic: I-C
, true: O predic: O
syringes true: I-C predic: O
, true: O predic: O
CRAZY true: O predic: O
expensive true: I-E predic: 

which true: O predic: O
exempts true: O predic: O
me true: O predic: O
from true: O predic: O
all true: O predic: O
medical true: O predic: I-E
costs true: O predic: I-E
. true: O predic: O


Was true: O predic: O
pre true: I-C predic: I-C
diabetic true: I-C predic: I-C
heading true: O predic: O
into true: O predic: O
diabetes true: I-E predic: I-C
a true: O predic: O
few true: O predic: O
years true: O predic: O
back true: O predic: O
which true: O predic: O
is true: O predic: O
what true: O predic: O
prompted true: O predic: O
me true: O predic: O
to true: O predic: O
start true: O predic: O
... true: O predic: O


It true: O predic: O
did true: O predic: O
n't true: O predic: O
stop true: O predic: O
the true: O predic: O
diabetes true: I-C predic: I-C
from true: O predic: O
murdering true: I-E predic: I-E
my true: I-E predic: I-E
pancreas true: I-E predic: I-E
to true: O predic: O
the true: O predic: O
point true: O predic: O
where true: O predic: O
I true: O predic: O
became true:

has true: O predic: O
it true: O predic: O
cause true: O predic: O
of true: O predic: O
their true: O predic: O
genes true: I-C predic: I-C
. true: O predic: O


Always true: O predic: O
and true: O predic: O
the true: O predic: O
heat true: I-C predic: I-C
( true: O predic: O
sometimes true: O predic: O
) true: O predic: O
makes true: O predic: O
my true: O predic: O
bloods true: I-E predic: I-E
drop true: I-E predic: I-E
rapidly true: I-E predic: I-E
: true: O predic: O
thinking_face true: O predic: O
: true: O predic: O
#T1D true: I-C predic: I-C
#Heatwave true: I-C predic: I-C
HTTPURL true: O predic: O


So true: O predic: O
, true: O predic: O
I true: O predic: O
get true: O predic: O
it true: O predic: O
, true: O predic: O
but true: O predic: O
I true: O predic: O
' true: O predic: O
m true: O predic: O
really true: O predic: O
getting true: I-E predic: I-E
sick true: I-E predic: I-E
of true: O predic: I-E
diet true: I-C predic: I-E
discussions true: I-C predic: I-E
when true: O

failure true: I-E predic: I-E
& true: O predic: O
needs true: O predic: O
dialysis true: I-E predic: I-E
multiple true: O predic: O
times true: O predic: O
a true: O predic: O
week true: O predic: O
. true: O predic: O
