### IO-tagging

A cause-effect dataset has been labeled. Transform each tweets into its IO-tagging scheme

For instance the tweet: 
`Prediabetes forced me to change my lifestyle`
with cause: `Prediabetes`
and effect: `change my lifestyle` 

will be transformed into the IO-tagging scheme:

```
Prediabetes  forced  me  to  change  my  lifestyle
   I-C         O      O   O    I-E   I-E   I-E
```

In [1]:
import pandas as pd
import numpy as np
from utils import normalizeTweet, split_into_sentences, bio_tagging, EarlyStopping, manual_tagging_of_some_special_tweets


########################### DATA FILE ###################################
dataPath = "data/Causality_tweets_data.xlsx"


In [2]:
##### DATA TO LOAD ######
data_round0 = pd.read_excel(dataPath, sheet_name="round0")
data_round0 = data_round0[data_round0["Causal association"].notnull()] # some tweets at the end are not labeled yet
data_round0 = data_round0[["full_text", "Intent", "Cause", "Effect", "Causal association"]]
print("Data round 0 (tweets!):")
print(data_round0["Causal association"].value_counts())
print("-----"*5)


##### additional data labeled through active learning strategy - round 1 ########
data_round1 = pd.read_excel(dataPath, sheet_name="round1")
data_round1 = data_round1[data_round1["Causal association"].notnull()]
data_round1 = data_round1[["sentence", "Intent", "Cause", "Effect", "Causal association"]]
data_round1.rename(columns ={"sentence":"full_text"}, inplace=True) # rename for merge
print("Sentences round 1:")
print(data_round1["Causal association"].value_counts())
print("-----"*5)

##### additional data labeled through active learning strategy - round 2 ########
data_round2 = pd.read_excel(dataPath, sheet_name="round2")
data_round2 = data_round2[data_round2["Causal association"].notnull()]
data_round2 = data_round2[["sentence", "Intent", "Cause", "Effect", "Causal association"]]
data_round2.rename(columns ={"sentence":"full_text"}, inplace=True) # rename for merge
print("sentences round 2:")
print(data_round2["Causal association"].value_counts())
print("-----"*5)

##### additional data labeled through active learning strategy - round 3 ########
data_round3 = pd.read_excel(dataPath, sheet_name="round3")
data_round3 = data_round3[data_round3["Causal association"].notnull()]
data_round3 = data_round3[["sentence", "Intent", "Cause", "Effect", "Causal association"]]
data_round3.rename(columns ={"sentence":"full_text"}, inplace=True) # rename for merge
print("sentences round 3:")
print(data_round3["Causal association"].value_counts())
print("-----"*5)

##### additional data labeled through active learning strategy - round 4 ########
data_round4 = pd.read_excel(dataPath, sheet_name="round4")
data_round4 = data_round4[data_round4["Causal association"].notnull()]
data_round4 = data_round4[["sentence", "Intent", "Cause", "Effect", "Causal association"]]
data_round4.rename(columns ={"sentence":"full_text"}, inplace=True) # rename for merge
print("sentences round 3:")
print(data_round3["Causal association"].value_counts())

#### merge both datasets ######
data = data_round0.append(data_round1).append(data_round2).append(data_round3).append(data_round4)
print("\nAfter merge old data:")
print(data["Causal association"].value_counts())
data.head()



Data round 0 (tweets!):
0.0    3710
1.0    1290
Name: Causal association, dtype: int64
-------------------------
Sentences round 1:
0.0    1763
1.0     429
Name: Causal association, dtype: int64
-------------------------
sentences round 2:
0    1658
1     150
Name: Causal association, dtype: int64
-------------------------
sentences round 3:
0    1886
1     215
Name: Causal association, dtype: int64
-------------------------
sentences round 3:
0    1886
1     215
Name: Causal association, dtype: int64

After merge old data:
0.0    10912
1.0     2397
Name: Causal association, dtype: int64


Unnamed: 0,full_text,Intent,Cause,Effect,Causal association
0,"tonight , I learned my older girl will back he...",,,,0.0
1,USER USER I knew diabetes and fibromyalgia wer...,joke,,,0.0
2,⬇ ️ ⬇ ️ ⬇ ️ THIS ⬇ ️ ⬇ ️ ⬇ ️ My wife has type ...,mS,,,0.0
3,USER Cheers ! Have one for this diabetic too !,mS,,,0.0
4,USER Additionally the medicines are being char...,,medicines are being charged at MRP,costing much higher,1.0


In [3]:
data["tokenized"] = data["full_text"].map(lambda tweet: normalizeTweet(tweet).split(" "))
data.head()

Unnamed: 0,full_text,Intent,Cause,Effect,Causal association,tokenized
0,"tonight , I learned my older girl will back he...",,,,0.0,"[tonight, ,, I, learned, my, older, girl, will..."
1,USER USER I knew diabetes and fibromyalgia wer...,joke,,,0.0,"[USER, USER, I, knew, diabetes, and, fibromyal..."
2,⬇ ️ ⬇ ️ ⬇ ️ THIS ⬇ ️ ⬇ ️ ⬇ ️ My wife has type ...,mS,,,0.0,"[:down_arrow:, :down_arrow:, :down_arrow:, THI..."
3,USER Cheers ! Have one for this diabetic too !,mS,,,0.0,"[USER, Cheers, !, Have, one, for, this, diabet..."
4,USER Additionally the medicines are being char...,,medicines are being charged at MRP,costing much higher,1.0,"[USER, Additionally, the, medicines, are, bein..."


In [13]:
def io_tagging(tweet, causes, effects):
    """
    Each token gets associated to one of the following labels:
    I-C : Inside cause
    I-C : Inside effect
    O   : Outside
    """

    tokens = normalizeTweet(tweet).split(" ")
    causes = str(causes).strip().split(";")
    effects = str(effects).strip().split(";")
    io = ["O"] * len(tokens)

    # if no cause and no effect return IO tags of "O"
    if (not causes and not effects) or (causes == ["nan"] and effects == ["nan"]):
        return io

    # if only cause and no effect
    if (causes or causes != ["nan"]) and not (effects or effects !=["nan"]):
        print("ERROR: only cause and no effect exists\n\n -------------\n\n")

    # if only effect and no cause
    if not (causes or causes != ["nan"]) and  (effects or effects !=["nan"]):
        print("ERROR: only effect and no cause exists\n\n -------------\n\n")


    ########### SPECIAL CASES (Some causes or effects may occur several times in a tweet. Check here, if the current tweet is such a tweet and return the manually pre-labeled bio tag) ############
    manual_io_tag = manual_tagging_of_some_special_tweets(tweet, tokens, io, ioTagging=True)
    # usually 'io' has only "O", except if it got altered in manual_tagging_of_some_special_tweets
    if manual_io_tag.count("O") != len(manual_io_tag):
        return manual_io_tag


    ################## Add IO tags for causes and effects ########################

    for cause in causes: # possible to have several causes

        cause_words = normalizeTweet(cause).split(" ") # a cause may consist of several words
        cause_words_start = cause_words[0]
        try:
            ### Find index of first word of cause -> label with "I-C"
            indices = [i for i, x in enumerate(tokens) if x == cause_words_start] # get all indices of the first word of the cause
            N_cause_words = len(cause_words)
            if len(indices) > 1 and N_cause_words > 1: # if several occurrences of the same cause start word in phrase
                for cause_word_start_index in indices:
                    causeIndexFound = all([tokens[cause_word_start_index+word_i] == cause_words[word_i] for word_i in range(N_cause_words)])
                    if causeIndexFound:
                        ind = cause_word_start_index
                        break
            else:
                ind = tokens.index(cause_words_start) # get index of causal word in tokens list
            io[ind] = "I-C"

            ### If cause consists of several words -> label those words with "I-C"
            i = 1
            while i < len(cause_words):
                if tokens[ind+i] == cause_words[i]:
                    io[ind+i] = "I-C"
                else:
                    print("Error: token and causal word don't match!\Tind:", ind, "i:", i, "token[ind+i]:", tokens[ind+i], "cause_words[i]:", cause_words[i])
                i += 1
        except ValueError:
            print("\nINFO: cause word '{}' does not exist in sentence: \n'{}', but should be in other sentence of the tweet".format(cause_words_start, tokens))


    for effect in effects: # possible to have several effects

        effect_words = normalizeTweet(effect).split(" ") # a effect may consist of several words
        effect_words_start = effect_words[0]
        try:
            ### Find index of first word of effect -> label with "I-E"
            indices = [i for i, x in enumerate(tokens) if x == effect_words_start]
            N_effect_words = len(effect_words)
            if len(indices) > 1 and N_effect_words > 1: # if several occurrences of the same cause start word in phrase
                for effect_word_start_index in indices:
                    effectIndexFound = all([tokens[effect_word_start_index+word_i] == effect_words[word_i] for word_i in range(N_effect_words)])
                    if effectIndexFound:
                        ind = effect_word_start_index
                        break
            else:
                ind = tokens.index(effect_words_start) # get index of c_word in tokens list
            io[ind] = "I-E"

            ### If effect consists of several words -> label those words with "I-E"
            i = 1
            while i < len(effect_words):
                if tokens[ind+i] == effect_words[i]:
                    io[ind+i] = "I-E"
                else:
                    print("Error: token and effect word don't match! \tind:", ind, "i:", i, "token[ind+i]", tokens[ind+i], "effect_words[i]", cause_words[i])
                i += 1

        except ValueError:
            print("\nError: effect word '{}' does not exist in sentence: \n'{}', but should be in other sentence of the tweet".format(effect_words_start, tokens))

    return io


data["bio_tags"] = data.apply(lambda row: io_tagging(row["full_text"],row["Cause"], row["Effect"]), axis=1)

#for i, row in data[data["Causal association"] == 1].sample(n=100).iterrows():
#    io_tags = io_tagging(row["full_text"],row["Cause"], row["Effect"])
#    print("\n\nCause:", row["Cause"])
#    print("Effect:", row["Effect"])
#    for tok, io in zip(row["tokenized"], io_tags):
#        print(tok, "\t", io)
data.head()    

Unnamed: 0,full_text,Intent,Cause,Effect,Causal association,tokenized,io_tags,bio_tags
0,"tonight , I learned my older girl will back he...",,,,0.0,"[tonight, ,, I, learned, my, older, girl, will...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,USER USER I knew diabetes and fibromyalgia wer...,joke,,,0.0,"[USER, USER, I, knew, diabetes, and, fibromyal...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,⬇ ️ ⬇ ️ ⬇ ️ THIS ⬇ ️ ⬇ ️ ⬇ ️ My wife has type ...,mS,,,0.0,"[:down_arrow:, :down_arrow:, :down_arrow:, THI...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,USER Cheers ! Have one for this diabetic too !,mS,,,0.0,"[USER, Cheers, !, Have, one, for, this, diabet...","[O, O, O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O, O, O]"
4,USER Additionally the medicines are being char...,,medicines are being charged at MRP,costing much higher,1.0,"[USER, Additionally, the, medicines, are, bein...","[O, O, O, I-C, I-C, I-C, I-C, I-C, I-C, O, O, ...","[O, O, O, I-C, I-C, I-C, I-C, I-C, I-C, O, O, ..."


In [14]:
def get_start_end_index_of_sentence_in_tweet(tweet, sentence):
    """ 
    The sentence tokens are included in the tweet tokens.
    Return the start end end indices of the sentence tokens in the tweet tokens

    """

    sentence_start_word = sentence[0]
    start_indices = [i for i, x in enumerate(tweet) if x == sentence_start_word] # find all indices of the start word of the sentence 
    try:
        for start_index in start_indices:
            isTrueStartIndex = all([tweet[start_index+i] == sentence[i] for i in range(len(sentence))])
            #print("start_index:", start_index, "isTrueStartIndex:", isTrueStartIndex)
            if isTrueStartIndex:
                return start_index, start_index + len(sentence) 
    except:
        print("ERROR: StartIndex should have been found for sentence:")
        print("tweet:")
        print(tweet)
        print("sentence:")
        print(sentence)
    return -1, -2 # should not be returned


def split_tweets_to_sentences(data):
    """ 
        Splits tweets into sentences and associates the appropriate intent, causes, effects and causal association
        to each sentence.
        
        Parameters:
        - min_words_in_sentences: Minimal number of words in a sentence such that the sentence is kept. 
                                  Assumption: A sentence with too few words does not have enough information
                              
                              
                              
        Ex.:
        full_text                              | Intent | Cause | Effect | Causal association | ...
        --------------------------------------------------------------------------------------------
        what? type 1 causes insulin dependence | q;msS  | type 1|insulin dependence | 1       | ...  
        
        New dataframe returned: 
        full_text                              | Intent | Cause | Effect | Causal association | ...
        --------------------------------------------------------------------------------------------
        what?                                  |   q    |       |        |       0            | ...
        type 1 causes insulin dependence       |        | type 1| insulin dependence | 1       | ...  
    """

    newDF = pd.DataFrame(columns=["sentence", "Intent", "Cause", "Effect", "Causal association", "tokenized", "bio_tags"])
    
    for i,row in data.iterrows():
        causes = row["Cause"]
        effects = row["Effect"]
        sentences = split_into_sentences(normalizeTweet(row["full_text"]))
        
        # single sentence in tweet
        if len(sentences) == 1:
            singleSentenceIntent = ""
            if isinstance(row["Intent"], str):
                if len(row["Intent"].split(";")) > 1:
                    singleSentenceIntent = row["Intent"].strip().replace(";msS", "").replace("msS;", "").replace(";mS", "").replace("mS;", "")
                else:
                    if row["Intent"] == "mS" or row["Intent"] == "msS":
                        singleSentenceIntent = ""
                    else:
                        singleSentenceIntent = row["Intent"].strip()
                    
            newDF=newDF.append(pd.Series({"sentence": sentences[0] # only one sentence
                         , "Intent": singleSentenceIntent
                         , "Cause" : row["Cause"]
                         , "Effect": row["Effect"]
                         , "Causal association" : row["Causal association"]
                         , "tokenized": row["tokenized"]
                         , "bio_tags": row["bio_tags"]}), ignore_index=True)
        
        # tweet has several sentences
        else: 
            intents = str(row["Intent"]).strip().split(";")
            for sentence in sentences:
                sent_tokenized = sentence.split(" ")
                causeInSentence = np.nan if not isinstance(causes, str) or not any([cause in sentence for cause in causes.split(";")]) else ";".join([cause for cause in causes.split(";") if cause in sentence])
                effectInSentence = np.nan if not isinstance(effects, str) or not any([effect in sentence for effect in effects.split(";")]) else ";".join([effect for effect in effects.split(";") if effect in sentence])
                causalAssociationInSentence = 1 if isinstance(causeInSentence, str) and isinstance(effectInSentence, str) else 0
                startIndex, endIndex = get_start_end_index_of_sentence_in_tweet(row["tokenized"], sent_tokenized)
                sentence_tokenized = row["tokenized"][startIndex:endIndex]
                sentence_bio_tags = row["bio_tags"][startIndex:endIndex]
                
                if "q" in intents and sentence[-1] == "?": # if current sentence is question
                    newDF=newDF.append(pd.Series({"sentence": sentence, "Intent": "q", "Cause" : causeInSentence
                                                , "Effect": effectInSentence, "Causal association" : causalAssociationInSentence
                                                , "tokenized": sentence_tokenized, "bio_tags": sentence_bio_tags}), ignore_index=True)                    
                elif "joke" in intents: # all sentences with "joke" in tweet keep the intent "joke"
                    newDF=newDF.append(pd.Series({"sentence": sentence, "Intent": "joke", "Cause" : causeInSentence
                                                , "Effect": effectInSentence, "Causal association" : causalAssociationInSentence
                                                , "tokenized": sentence_tokenized, "bio_tags": sentence_bio_tags}), ignore_index=True)   
                elif "neg" in intents: # all sentences with "neg" in tweet keep intent "neg"
                    newDF=newDF.append(pd.Series({"sentence": sentence, "Intent": "neg", "Cause" : causeInSentence
                                                , "Effect": effectInSentence, "Causal association" : causalAssociationInSentence
                                                , "tokenized": sentence_tokenized, "bio_tags": sentence_bio_tags}), ignore_index=True)               
                elif isinstance(causeInSentence, str) and isinstance(effectInSentence, str): # cause effect sentence
                    causalIntent = ""
                    if len(causeInSentence.split(";")) > 1:
                        causalIntent = "mC"
                        if len(effectInSentence.split(";")) > 1:
                            causalIntent = "mC;mE"
                    elif len(effectInSentence.split(";")) > 1:
                        causalIntent = "mE"
                    newDF=newDF.append(pd.Series({"sentence": sentence, "Intent": causalIntent, "Cause" : causeInSentence
                                                , "Effect": effectInSentence, "Causal association" : causalAssociationInSentence
                                                , "tokenized": sentence_tokenized, "bio_tags": sentence_bio_tags}), ignore_index=True)                                  
                else:
                    nonCausalIntent = ""
                    if isinstance(causeInSentence, str): # only cause is given
                        if len(causeInSentence.split(";")) > 1:
                            nonCausalIntent = "mC"
                    elif isinstance(effectInSentence, str): # only effect is given
                        if len(effectInSentence.split(";")) > 1:
                            nonCausalIntent = "mE"
                    newDF=newDF.append(pd.Series({"sentence": sentence, "Intent": nonCausalIntent, "Cause" : causeInSentence
                                                , "Effect": effectInSentence, "Causal association" : causalAssociationInSentence
                                                , "tokenized": sentence_tokenized, "bio_tags": sentence_bio_tags}), ignore_index=True)

    return newDF
       

In [15]:
### Split tweets into sentences (train classifier on sentence level) ####

print("N tweets:", data.shape[0])
dataSentences = split_tweets_to_sentences(data)
print("N sentences:", dataSentences.shape[0])
dataSentences.head()

N tweets: 13309
N sentences: 20065


Unnamed: 0,sentence,Intent,Cause,Effect,Causal association,tokenized,bio_tags
0,"tonight , I learned my older girl will back he...",,,,0,"[tonight, ,, I, learned, my, older, girl, will...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
1,Fiercely .,,,,0,"[Fiercely, .]","[O, O]"
2,#impressive #bigsister #type1 #type1times2,,,,0,"[#impressive, #bigsister, #type1, #type1times2]","[O, O, O, O]"
3,USER USER I knew diabetes and fibromyalgia wer...,joke,,,0,"[USER, USER, I, knew, diabetes, and, fibromyal...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,:face_with_rolling_eyes:,joke,,,0,[:face_with_rolling_eyes:],[O]


In [16]:
########## Remove sentences with joke, question, negation and keep only sentences with more than 3 tokens #####

print("N sentences before filtering: ", dataSentences.shape[0])
dataSentFiltered = dataSentences[~dataSentences["Intent"].str.contains("neg|joke|q")] # remove sentences with joke, q, neg
dataSentFiltered = dataSentFiltered[dataSentFiltered["tokenized"].map(len) >= 3] # only keep sentences with at least 3 words
print("N sentences after filtering: ", dataSentFiltered.shape[0])
dataSentFiltered.head()

N sentences before filtering:  20065
N sentences after filtering:  16868


Unnamed: 0,sentence,Intent,Cause,Effect,Causal association,tokenized,bio_tags
0,"tonight , I learned my older girl will back he...",,,,0,"[tonight, ,, I, learned, my, older, girl, will...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
2,#impressive #bigsister #type1 #type1times2,,,,0,"[#impressive, #bigsister, #type1, #type1times2]","[O, O, O, O]"
5,:down_arrow: :down_arrow: :down_arrow: THIS :d...,,,,0,"[:down_arrow:, :down_arrow:, :down_arrow:, THI...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
6,I 'm a trans woman .,,,,0,"[I, 'm, a, trans, woman, .]","[O, O, O, O, O, O]"
7,"Both of us could use a world where "" brave and...",,,,0,"[Both, of, us, could, use, a, world, where, "",...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [17]:
# choose sentences with cause or effect
trainingData = dataSentFiltered[(dataSentFiltered["Cause"].notnull()) | (dataSentFiltered["Effect"].notnull())]

trainingData.shape

(2500, 7)

In [18]:
trainingData.to_csv("/home/adrian/Downloads/cause_effect_sentences_with_IO_tags.csv",index=False, sep=";")