### The trained causal sentence model (binary classifier) will be applied on all diabetes related tweets to determine only tweets containing causal information

In [1]:
import pandas as pd
import numpy as np
import spacy 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, matthews_corrcoef
from transformers import BertForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import random
import os
import torch.nn.functional as F
import torch
from torch.nn import CrossEntropyLoss
import torch.nn as nn
from torch.utils.data import DataLoader, DataLoader
import transformers
from tqdm import tqdm, trange
#from google.colab import drive, files
import io
import sys
from utils import *
from torch.nn.parallel import DistributedDataParallel
import torch.multiprocessing as mp
import torch.distributed as dist

diabetes_keywords = [
"glucose", "#glucose","blood glucose", "#bloodglucose",
"insulin", "#insulin", "insulin pump", "#insulinpump",
"diabetes", "#diabetes", "t1d", "#t1d", "#type1diabetes",
"#type1", "t2d", "#t2d", "#type2diabetes", "#type2",
"#bloodsugar", "#dsma", "#bgnow", "#wearenotwaiting",
"#insulin4all", "dblog", "#dblog", "diyps", "#diyps",
"hba1c", "#hba1c", "#cgm", "#freestylelibre",
"diabetic", "#diabetic", "#gbdoc", "finger prick",
"#fingerprick", "#gestational", "gestational diabetes",
"#gdm", "freestyle libre", "#changingdiabetes",
"continuous glucose monitoring", "#continuousglucosemonitoring",
"#thisisdiabetes", "#lifewithdiabetes", "#stopdiabetes",
"#diabetesadvocate", "#diabadass", "#diabetesawareness",
"#diabeticproblems", "#diaversary", "#justdiabeticthings",
"#diabetestest", "#t1dlookslikeme", "#t2dlookslikeme",
"#duckfiabetes", "#kissmyassdiabetes", "#GBDoc",
"#changingdiabetes", "freestyle libre", "#freestylelibre",
"#cgm"
]

# Transform labels + encodings into Pytorch DataSet object (including __len__, __getitem__)
class TweetDataSet(torch.utils.data.Dataset):
    def __init__(self, text, tokenizer):
        self.text = text
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        inputs = self.tokenizer(self.text, padding=True, truncation=True, return_token_type_ids=True)
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]
        return {
                "input_ids" : torch.tensor(ids[idx], dtype=torch.long)
              , "attention_mask" : torch.tensor(mask[idx], dtype=torch.long)
              , "token_type_ids" : torch.tensor(token_type_ids[idx], dtype=torch.long)
        }      

    def __len__(self):
        return len(self.text)


def compute_metrics(pred, labels):
    precision, recall, f1, _ = precision_recall_fscore_support(labels, pred, average='weighted')
    acc = accuracy_score(labels, pred)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }



class CausalityBERT(torch.nn.Module):
    """ Model Bert"""
    def __init__(self):
        super(CausalityBERT, self).__init__()
        self.num_labels = 2
        self.bert = transformers.BertModel.from_pretrained("vinai/bertweet-base")
        self.dropout = torch.nn.Dropout(0.3)
        self.linear1 = torch.nn.Linear(768, 256)
        self.linear2 = torch.nn.Linear(256, self.num_labels)
        self.softmax = torch.nn.Softmax(-1)
        
    def forward(self, input_ids, attention_mask, token_type_ids):
        _, output_1 = self.bert(input_ids, attention_mask = attention_mask, token_type_ids=token_type_ids, return_dict=False) # if output 1 is our cls token        
        output_2 = self.dropout(output_1)
        output_3 = self.linear1(output_2)  
        output_4 = self.dropout(output_3)
        output_5 = self.linear2(output_4)
        # cross entory will take care of the logits - we don't need if we are usign cross entropy for loss function 
        # if doing yourself - use nll loss and logSoftmax 
#         logit = self.softmax(output_5)
        return output_5



    
softmax = torch.nn.Softmax(-1)
    
#################### MODEL PARAMETERS #####################

#device = torch.device('cuda:7' if torch.cuda.is_available() else 'cpu')

bert_model = "vinai/bertweet-base"
data_path = "/Users/adrianahne/PhD/Data/matching-tweets_diab_noRT-noDupl_20210128_personal_noJokes_withFullText_emotional.parquet" 
causal_model = "./model_causal-sentences/model_4_finetuned-8-epochs-lr_1e-05.pth"
save_path = "causalSentences.csv"

# VIVEK: Choose a large batchSize. The batchSize will be splitted over the numer of GPUs
# if 8 GPUs and batch_size 32 => each GPU executes 4 tweets
batch_size = 2048
#world_size = 8 # N GPUs

In [2]:
tokenizer = AutoTokenizer.from_pretrained(bert_model)

model = CausalityBERT()
model.load_state_dict(torch.load(causal_model, map_location='cpu'))


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing BertModel: ['roberta.embeddings.word_embeddings.weight', 'roberta.encoder.layer.6.attention.output.LayerNorm.bias', 'roberta.encoder.layer.8.attention.self.value.bias', 'roberta.encoder.layer.5.attention.self.query.weight', 'roberta.encoder.layer.3.attention.output.dense.bias', 'lm_head.bias', 'roberta.encoder.layer.1.attention.self.value.weight', 'roberta.encoder.layer.6.attention.output.LayerNorm.weight', 'roberta.encoder.layer.3.intermediate.dense.bias', 'roberta.encoder.layer.4.output.dense.weight', 'roberta.encoder.layer.8.intermediate.dense.bias', 'roberta.encoder.layer.6.output.LayerNorm.bias', 'roberta.encoder.layer.9.atte

Some weights of BertModel were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['encoder.layer.7.attention.self.query.bias', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.4.intermediate.dense.weight', 'encoder.layer.5.attention.self.query.bias', 'encoder.layer.6.intermediate.dense.bias', 'encoder.layer.3.attention.self.value.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.6.output.dense.bias', 'encoder.layer.10.attention.output.dense.bias', 'encoder.layer.11.output.LayerNorm.bias', 'encoder.layer.6.attention.self.query.bias', 'encoder.layer.8.attention.output.dense.bias', 'encoder.layer.4.intermediate.dense.bias', 'encoder.layer.7.attention.self.query.weight', 'encoder.layer.9.intermediate.dense.bias', 'encoder.layer.3.intermediate.dense.bias', 'encoder.layer.6.attention.output.LayerNorm.weight', 'encoder.layer.9.attention.output.LayerNorm.bias', 'encoder.layer.3.attention.self.value.bias', 'encoder.layer.5.interm

<All keys matched successfully>

In [5]:
# load data
data = pd.read_parquet(data_path)#.sample(n=100, random_state=33)
print("Total count:", data.shape[0])
data.head()

Total count: 562013


Unnamed: 0,id,created_at,lang,text,user_name,user_screen_name,user_followers_count,user_friends_count,user_location,user_description,...,retweeted_user_friends_count,retweeted_user_location,retweeted_user_description,retweeted_place_full_name,retweeted_text,Project,posted_month,tweet_URL_USER,full_text,emotion
0,862697033131597824,Thu May 11 15:52:55 +0000 2017,en,Hypo in the Boticelli rooms at the @UffiziGall...,Samantha Morris,TheBorgiaBuIl,624.0,570.0,"Southampton, England",Tattooed Renaissance historian and author. Aut...,...,,,,,,Diabetes,201705,Hypo in the Boticelli rooms at the USER and on...,Hypo in the Boticelli rooms at the @UffiziGall...,True
2,862733663443615744,Thu May 11 18:18:28 +0000 2017,en,Life &amp; diabetes in a jar altogether! 😍 I'...,Hasan Abughosh,H_Abughosh,2420.0,1455.0,Hashemite Kingdom of Jordan,Marketing Manager of Easy Jordan | Startup & T...,...,,,,,,Diabetes,201705,Life &amp; diabetes in a jar altogether! 😍 I'...,Life &amp; diabetes in a jar altogether! 😍\n\n...,True
5,862832546945675264,Fri May 12 00:51:24 +0000 2017,en,It's always worrying as a diabetic when your f...,­­­Mitth'raw'nuruodo,MattWWeir,257.0,177.0,,"So please forgive what I have done. No, you ca...",...,,,,,,Diabetes,201705,It's always worrying as a diabetic when your f...,It's always worrying as a diabetic when your f...,True
14,863308516198678529,Sat May 13 08:22:44 +0000 2017,en,Wut about iam my own mum people? They gift tan...,Saad,msaadkb,1050.0,567.0,SC:msaadkb,you'll will never know what i am really trying...,...,,,,,,Diabetes,201705,Wut about iam my own mum people? They gift tan...,"Wut about ""iam my own mum"" people? They gift t...",True
15,863399191497904130,Sat May 13 14:23:03 +0000 2017,en,"If you have type 1 diabetes , you don't deserv...",NastyNana-demonation,LandauCamar,134.0,219.0,New York,"I am WOMAN , I'm NEW YORK & Jewish PROUD. Wife...",...,,,,,,Diabetes,201705,"If you have type 1 diabetes , you don't deserv...","If you have type 1 diabetes , you don't deserv...",True


In [6]:
####### SPLIT TWEETS INTO SENTENCES ######################

TweetsSplit = data["full_text"].map(lambda full_text: split_into_sentences(normalizeTweet(full_text)))
print(TweetsSplit.shape[0])

sentences = TweetsSplit.explode()
print("tweets to sentences:", sentences.shape[0])

print(sentences[20:40].values)

Error: token: M.Phil.B.Ed
['@USER', 'I', 'lost', 'my', 'teaching', 'job', ',', 'due', 'to', 'lockdown', 'I', 'could', 'not', 'go', 'any', 'job', ',', 'as', 'a', 'diabetic', 'need', 'money', 'for', 'my', 'food', 'and', 'medicine', 'kindly', 'help', 'mam', ',', 'instead', 'of', 'that', 'I', "'m", 'ready', 'to', 'sell', 'my', 'M', '.', 'A', ',', 'M.Phil.B.Ed', '.', ',', 'certificates', ',', 'according', 'to', 'your', 'wish', 'fix', 'price', '.']
Error: token: smoked.drank.drugs.never
['@USER', '@USER', '@USER', '@USER', 'I', 'never', 'get', 'the', 'flu', 'HTTPURL', 'diabetic', 'but', 'I', 'never', 'really', 'get', 'sick.infact', 'I', 'really', "can't", 'remember', 'in', 'my', 'adult', 'life', 'wen', 'I', "'ve", 'really', 'gotten', 'sick', '.', 'I', "'ve", 'never', 'ever', 'smoked.drank.drugs.never', '&', 'I', "'m", 'a', 'musician', '&', 'thank', 'god', 'never', 'got', 'caught', 'in', 'that', 'trap', '.']
Error: token: SSE.WRD.SER.ADRA
['R', '/', 'Sir', ',', 'Will', 'retire', 'on', '30.4',

Error: token: stella.waithaka99@gmail.com
['@USER', '@USER', '@USER', 'I', 'am', 'desperate', 'for', 'help', '.', 'I', 'need', 'my', 'insulin', 'to', 'bring', 'my', 'blood', 'sugar', 'back', 'down', '.', 'It', "'s", '$', '217', '.', 'That', "'s", 'all', 'I', 'need', '.', 'I', "'m", 'not', 'asking', 'for', 'a', 'windfall', ',', 'just', 'a', 'little', 'help', ',', 'please', '.', 'My', 'PayPal', 'is', 'stella.waithaka99@gmail.com', 'Thanks', 'in', 'advance', ':beating_heart:', ':folded_hands:', ':folded_hands:', ':beating_heart:']
Error: token: y.o.daughter
['@USER', '@USER', 'I', "'m", '🇨', '🇦', 'ian', 'with', '50', 'y.o.daughter', 'T1D', '41', 'yrs', '.', 'I', 'worried', 'about', 'her', 'having', 'good', 'control', 'when', 'young', 'but', 'never', 'worried', 'about', 'cost', '.', 'She', 'now', 'has', 'a', 'state', 'of', 'the', 'art', 'pump', '&', 'even', 'as', 'a', 'single', 'parent', 'of', '2', 'teens', ',', 'in', 'average', 'job', '&', 'without', 'support', 'from', 'ex', ',', 'insulin

Error: token: fam..paypal.me/zazu12
['@USER', 'Thank', 'you', 'Jo', '..', 'hes', 'doing', 'ok', '..', 'but', 'tomorrow', 'is', '7', 'weeks', 'and', 'time', 'for', 'a', 'new', 'bottle', 'of', 'insulin', '..', 'I', "'m", 'sinking', ',', 'well', 'sunk', '..', 'it', "'s", 'taking', 'a', 'high', 'toll', 'on', 'me', 'and', 'my', 'fam..paypal.me/zazu12', '$', 'DawnFratus', '..', 'I', 'hope', 'all', 'is', 'well', 'w', '/', 'you', 'and', 'yours']
Error: token: .kisongochi@gmail.com
['@USER', '@USER', '@USER', 'I', 'am', 'desperate', 'for', 'help', '.', 'I', 'need', 'my', 'insulin', 'to', 'bring', 'my', 'blood', 'sugar', 'back', 'down', '.', 'It', "'s", '$', '217', '.', 'That', "'s", 'all', 'I', 'need', '.', 'I', "'m", 'not', 'asking', 'for', 'a', 'windfall', ',', 'just', 'a', 'little', 'help', ',', 'please', '.', 'My', 'PayPal', 'is', 'victor.wambwa', '.kisongochi@gmail.com', 'Thanks', 'in', 'advance', ':beating_heart:', ':folded_hands:', ':folded_hands:', ':beating_heart:']
Error: token: .kiso

Error: token: michaelkerry@hotmail.co.uk
['@USER', '@USER', 'Hi', 'I', "'m", 'the', 'mother', 'of', 'Gemma', 'Kerry', ',', 'she', 'is', 'type', '1', 'diabetic', 'it', 'is', 'a', 'long', 'term', 'condition', '.', 'I', 'have', 'applied', 'to', 'make', 'a', 'wish', '.', 'And', 'she', 'has', 'not', 'yet', 'been', 'accepted', '.', 'I', 'know', 'she', 'would', 'love', 'to', 'meet', 'you', '.', 'She', 'has', 'tickets', 'to', 'the', 'Leeds', 'show', 'michaelkerry@hotmail.co.uk']
Error: token: claire.kearns@live.co.uk
['Contact', 'claire.kearns@live.co.uk', '.', 'I', 'only', 'would', 'need', '10', 'mins', 'of', 'your', 'time', 'for', 'a', 'phone', 'recording', '&', 'it', 'would', 'be', 'much', 'appreciated', '!', '#T1D', '#eddmt1']
Error: token: into.a.needle
['I', 'just', 'walked', 'into.a.needle', ',', 'it', 'was', 'atleast', '1cm', 'into', 'my', 'foot', '...', 'my', 'moms', 'reaction', '(', 'fyi', 'she', 'has', 'diabetes', 'and', 'wounds', 'on', 'diabetic', 'feets', 'heal.very', 'badly', ')'

Error: token: c.h.robinson
['Another', 'year', 'in', 'the', 'books', 'doing', 'the', '#T1D', 'walk', 'at', '@USER', 'with', 'my', 'family', '.', 'I', 'am', 'beyond', 'blessed', 'to', 'be', 'able', 'to', 'participate', 'with', 'my', 'mom', ',', 'melzy', '1975', 'and', 'her', 'company', ',', 'c.h.robinson', 'Worldwide', '.', 'They', 'are', 'an', 'amazing', '...', 'HTTPURL']
Error: token: .kisongochi@gmail.com
['@USER', 'I', 'am', 'desperate', 'for', 'help', '.', 'I', 'need', 'my', 'insulin', 'to', 'bring', 'my', 'blood', 'sugar', 'back', 'down', '.', 'It', "'s", '$', '217', '.', 'That', "'s", 'all', 'I', 'need', '.', 'I', "'m", 'not', 'asking', 'for', 'a', 'windfall', ',', 'just', 'a', 'little', 'help', ',', 'please', '.', 'My', 'PayPal', 'is', 'victor.wambwa', '.kisongochi@gmail.com', 'Thanks', 'in', 'advance', ':beating_heart:', ':folded_hands:', ':folded_hands:', ':beating_heart:']
Error: token: .kisongochi@gmail.com
['@USER', '@USER', 'I', 'am', 'desperate', 'for', 'help', '.', 'I', 

Error: token: ovo.victor95@gmail.com
['I', 'am', 'desperate', 'for', 'help', '.', 'I', 'need', 'my', 'insulin', 'to', 'bring', 'my', 'blood', 'sugar', 'back', 'down', '.', 'It', "'s", '$', '317', '.', 'That', "'s", 'all', 'I', 'need', '.', 'I', "'m", 'not', 'asking', 'for', 'a', 'windfall', ',', 'just', 'a', 'little', 'help', ',', 'please', '.', 'My', 'PayPal', 'is', 'ovo.victor95@gmail.com', 'Thanks', 'in', 'advance', '#BoostPleaseRT']
Error: token: .kisongochi@gmail.com
['@USER', 'I', 'am', 'desperate', 'for', 'help', '.', 'I', 'need', 'my', 'insulin', 'to', 'bring', 'my', 'blood', 'sugar', 'back', 'down', '.', 'It', "'s", '$', '217', '.', 'That', "'s", 'all', 'I', 'need', '.', 'I', "'m", 'not', 'asking', 'for', 'a', 'windfall', ',', 'just', 'a', 'little', 'help', ',', 'please', '.', 'My', 'PayPal', 'is', 'victor.wambwa', '.kisongochi@gmail.com', 'Thanks', 'in', 'advance', ':folded_hands:', ':beating_heart:', 'HTTPURL']
Error: token: danni.mordell@gmail.com
['I', 'lost', 'count', 'on

Error: token: disease...bacterial/viral
['Picture', 'life', 'ending', 'with', 'a', 'stone', 'picked', 'from', 'a', 'bag', 'Each', 'stone', 'has', 'different', 'causes', 'of', 'death', 'Heart', 'attack', '...', 'diabetes', '...', 'pneumonia', '...', 'CANCER', '..', 'accident', 'Lung', 'disease...bacterial/viral', 'infections', '..', 'We', 'all', 'will', 'someday', '‘', 'pull', 'out', "'", 'an', 'end', 'to', 'our', 'life', 'Sadly', 'Covid', 'is', 'an', 'added', 'stone', 'for', 'now']
Error: token: danni.mordell@gmail.com
['@USER', 'I', 'lost', 'count', 'on', 'how', 'many', 'times', 'I', "'ve", 'tweeted', 'and', 'retweeted', '.', 'My', 'mom', 'who', 'is', 'diabetic', 'needs', 'her', 'insulin', 'and', 'has', "n't", 'had', 'a', 'meal', 'in', 'days', '.', 'I', "'m", 'really', 'still', 'short', 'on', 'food', 'ane', 'remainder', 'of', 'rent', '.', 'Pleaseeeee', 'see', 'my', 'cry', 'for', 'help', '.', 'PayPal', 'danni.mordell@gmail.com']
Error: token: nazierabegum@yahoo.co.uk
['@USER', 'Shilpa'

Error: token: .kisongochi@gmail.com
['@USER', '@USER', 'I', 'am', 'desperate', 'for', 'help', '.', 'I', 'need', 'my', 'insulin', 'to', 'bring', 'my', 'blood', 'sugar', 'back', 'down', '.', 'It', "'s", '$', '217', '.', 'That', "'s", 'all', 'I', 'need', '.', 'I', "'m", 'not', 'asking', 'for', 'a', 'windfall', ',', 'just', 'a', 'little', 'help', ',', 'please', '.', 'My', 'PayPal', 'is', 'victor.wambwa', '.kisongochi@gmail.com', 'Thanks', 'in', 'advance', ':beating_heart:', ':folded_hands:', ':folded_hands:', ':beating_heart:']
Error: token: Joesephine.McClard@hotmail.com
['I', "'m", 'in', 'REAL', 'TROUBLE', 'I', "can't", 'afford', 'insulin', '!', '!', '!', 'ANY', 'DONATION', 'GETS', 'ALL', 'MY', 'VIDEOS', '.', 'PLEASE', 'HELP', '!', '!', '!', 'PayPal', 'Joesephine.McClard@hotmail.com', 'Cash', 'App', '$', 'HardluckJoesephine', 'Venmo', '@USER', 'Joesephine', 'McClard']
Error: token: gilbaluseno@yahoo.co.uk
['@USER', 'Amen', 'may', 'Allah', 'consider', 'all', 'my', 'prayers', ':', 'healing

Error: token: googletag.cmd.push
['I', 'have', 'been', 'living', 'with', 'diabetes', 'for', 'the', 'past', '30', 'years', 'Olusegun', 'Obasanjo', 'says', 'carelessness', 'leads', 'mostly', 'to', 'death', 'in', 'diabetes', 'patients', 'googletag.cmd.push', '(', 'function', '(', ')', '{', '...', 'HTTPURL']
Error: token: danni.mordell@gmail.com
['@USER', '@USER', 'Please', '.', 'I', "'ve", 'been', 'trying', 'for', '5', 'days', '.', 'My', 'mom', 'is', 'disabled', 'and', 'diabetic', 'who', 'needs', 'ger', 'insulin', '.', 'Im', 'short', 'on', 'rent', 'and', 'food', '.', 'This', 'is', 'my', 'last', 'try', '.', 'For', 'some', 'reason', 'im', 'going', 'unnoticed', ',', 'but', 'glad', 'you', "'re", 'helping', 'others', 'in', 'worse', 'situations', '.', 'PayPal', 'danni.mordell@gmail.com']
Error: token: y.o.with
['@USER', 'thank', 'you', 'for', 'your', 'healthcare', 'vote', 'and', 'steadfast', 'bravery', '.', 'Thank', 'you', 'from', 'my', '16', 'y.o.with', 'type', '1', 'diabetes', '...', 'hope', 

Error: token: your...diabetes/eczema/health/yourself'...It
['@USER', 'My', 'favourite', 'question', 'these', 'days', 'is', '‘', 'tell', 'me', 'what', 'you', 'are', 'doing', 'to', 'look', 'after', "your...diabetes/eczema/health/yourself'...It", "'s", 'amazing', 'how', 'it', 'changes', 'the', 'dynamic', '...', 'always', 'find', 'some', 'assets', 'to', 'build', 'on', ',', 'no', 'matter', 'how', 'small', '.', '#bestjobintheworkd']
Error: token: .kisongochi@gmail.com
['@USER', 'Good', 'Morning', '!', 'I', 'am', 'desperate', 'for', 'help', '.', 'I', 'need', 'my', 'insulin', 'to', 'bring', 'my', 'blood', 'sugar', 'back', 'down', '.', 'It', "'s", '$', '217', '.', 'That', "'s", 'all', 'I', 'need', '.', 'I', "'m", 'not', 'asking', 'for', 'a', 'windfall', ',', 'just', 'a', 'little', 'help', ',', 'please', '.', 'My', 'PayPal', 'is', 'victor.wambwa', '.kisongochi@gmail.com', 'Thanks', 'in', 'advance', ':beating_heart:', ':folded_hands:', ':folded_hands:', ':beating_heart:']
Error: token: .kisongoch

Error: token: .kisongochi@gmail.com
['@USER', 'Good', 'afternoon', '!', 'I', 'am', 'desperate', 'for', 'help', '.', 'I', 'need', 'my', 'insulin', 'to', 'bring', 'my', 'blood', 'sugar', 'back', 'down', '.', 'It', "'s", '$', '217', '.', 'That', "'s", 'all', 'I', 'need', '.', 'I', "'m", 'not', 'asking', 'for', 'a', 'windfall', ',', 'just', 'a', 'little', 'help', ',', 'please', '.', 'My', 'PayPal', 'is', 'victor.wambwa', '.kisongochi@gmail.com', 'Thanks', 'in', 'advance', ':beating_heart:', ':folded_hands:', ':folded_hands:', ':beating_heart:']
Error: token: .kisongochi@gmail.com
['@USER', '@USER', 'I', 'am', 'desperate', 'for', 'help', '.', 'I', 'need', 'my', 'insulin', 'to', 'bring', 'my', 'blood', 'sugar', 'back', 'down', '.', 'It', "'s", '$', '217', '.', 'That', "'s", 'all', 'I', 'need', '.', 'I', "'m", 'not', 'asking', 'for', 'a', 'windfall', ',', 'just', 'a', 'little', 'help', ',', 'please', '.', 'My', 'PayPal', 'is', 'victor.wambwa', '.kisongochi@gmail.com', 'Thanks', 'in', 'advance

Error: token: 5.4.Plz
['Plz', 'Bro', 'I', 'beg', 'u', 'to', 'pray', 'for', 'my', 'husband', 'Romeo', 'Dsouza', 'he', 'is', 'suffering', 'from', 'diabetes', 'and', 'chronic', 'kidney', 'disease.his', 'creatinine', 'levels', 'which', 'has', 'to', 'be1', ',', 'is', '5.4.Plz', 'pray', 'for', 'it', 'to', 'come', 'down.For', 'God', 'nothing', 'is', 'impossible.Medical', 'science', 'can', 'fail', '~', 'HTTPURL', '#pray', '#prayer', 'HTTPURL']
Error: token: now11.9.needs
['@USER', '@USER', 'I', 'was', 'admitted', 'on', '20th', ',', 'August', ',', '2018', ',', 'suffering', 'from', 'diabetic', 'of', 'high', 'sugar', 'levels', 'then', '38.7', ',', 'now11.9.needs', 'money', 'for', 'upkeep', 'welfare', '&', 'house', 'rent', '+', 'hospital', 'bills']
Error: token: danni.mordell@gmail.com
['@USER', '@USER', 'I', 'lost', 'count', 'on', 'how', 'many', 'times', 'I', "'ve", 'tweeted', 'and', 'retweeted', '.', 'My', 'mom', 'who', 'is', 'diabetic', 'needs', 'her', 'insulin', 'and', 'has', "n't", 'had', 'a'

562013
tweets to sentences: 1580990
['No need to medicine contenue .'
 '@USER " That does n\'t mean we should take care of the person who sits at home , eats poorly and gets diabetes .'
 '" Mick Mulvaney .' 'Republican .'
 '@USER Dear lord that rack is sepeeated by type 1 and type 2 diabetes .'
 'Stay away my friend .'
 '@USER i have kids and i want to spend time with them desperately .'
 'i have some diabetes amputations but i need help could u please :revolving_hearts:'
 '@USER @USER @USER Why should everybody have to pay 40 million with diabetes because they made poor health choices ?'
 'He goes into this story about how he beat diabetes and lost 36lbs from being " on the vinegar "'
 '@USER 2 quit my job & wrk fulltime takin care of my mom with Rheumatoid Arthritis & Diabetes !'
 "Ca n't do that with my job !" "I 'm all she has & I :red_heart: ️her"
 "@USER Having to cut back on carbs , my eye exam found evidence of diabetes damage and I 'm between insurance so no proper doctor yet 

In [7]:
######### Exclude questions and sentences with less than 5 words 
# and sentences without diabetes related keyword #################


trainingData = sentences[sentences.str.split(" ").str.len() > 5] # keep only sentence with more than 3 tokens
trainingData = trainingData[~trainingData.str.endswith("?")]
trainingData = trainingData[trainingData.str.contains("|".join(diabetes_keywords))]

print("N sentences with > 5 words & no question & all with diabetes keyword:", trainingData.shape)

text = trainingData.values.tolist()


N sentences with > 5 words & no question & all with diabetes keyword: (482583,)


In [13]:
dd = pd.DataFrame({"text":text})
dd["tokenized"] = dd.text.str.split(" ")
for i, row in dd[20:40].iterrows():
    print(row["text"])
    print(row["tokenized"])
    print()

@USER Dear Noelia , Im one of the mewgulf 's victim that lead me to sudden heart attack and diabetic by watching too much of their sweetness .
['@USER', 'Dear', 'Noelia', ',', 'Im', 'one', 'of', 'the', 'mewgulf', "'s", 'victim', 'that', 'lead', 'me', 'to', 'sudden', 'heart', 'attack', 'and', 'diabetic', 'by', 'watching', 'too', 'much', 'of', 'their', 'sweetness', '.']

suggest me a best way to cure fast #IndiaGateSweetTalk @USER My father suffering from diabetes !
['suggest', 'me', 'a', 'best', 'way', 'to', 'cure', 'fast', '#IndiaGateSweetTalk', '@USER', 'My', 'father', 'suffering', 'from', 'diabetes', '!']

Yeah catch my ass in Canada once I 'm off parent 's insurance bc I definitely canNOT afford insulin on my own .
['Yeah', 'catch', 'my', 'ass', 'in', 'Canada', 'once', 'I', "'m", 'off', 'parent', "'s", 'insurance', 'bc', 'I', 'definitely', 'canNOT', 'afford', 'insulin', 'on', 'my', 'own', '.']

@USER @USER She 'll love it until they decide insulin is too expensive to provide to cert

In [8]:
test_dataset = TweetDataSet(text, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [9]:
# Define Trainer and predict

# set inference arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
#    per_device_train_batch_size=64,  # batch size per device during training
    per_device_eval_batch_size=batch_size,   # batch size for evaluation
    logging_dir='./logs',            # directory for storing logs
    seed=0,
#    local_rank = rank
)

# we only use Trainer for inference
trainer = Trainer(model=model, args=training_args)
print("build trainer on device:", training_args.device, "with n gpus:", training_args.n_gpu)

logits = trainer.predict(test_dataset)

build trainer on device: cpu with n gpus: 0


In [10]:
predictions = torch.argmax(torch.Tensor(logits.predictions),dim=1).flatten()
probas = torch.softmax(torch.Tensor(logits.predictions), dim = -1)[...,-1:].to('cpu').numpy().squeeze()


In [11]:
probas = pd.Series(probas)
predictions = pd.Series(predictions)


In [12]:
probas.head()

0    0.414361
1    0.704896
2    0.718448
3    0.623301
4    0.588045
dtype: float32

In [13]:
predictions.head()

0    0
1    1
2    1
3    1
4    1
dtype: int64

In [14]:

causalDF = pd.DataFrame({"text":text, "causal_predictions": predictions, "proba":probas})
# causalDF = causalDF[causalDF["causal_predictions"] == 1]
print("causal sentences:", causalDF.shape[0])
causalDF.head()

causal sentences: 867


Unnamed: 0,text,causal_predictions,proba
0,Why did the doctor say they has type 2 diabete...,0,0.414361
1,When my kid ' s struggling for her insulin for...,1,0.704896
2,"@USER Thanks man , she ' s got this bad flu bu...",1,0.718448
3,If we can't laugh a little about the ups and d...,1,0.623301
4,That ' s the premise behind a new dark comedy ...,1,0.588045


In [None]:
causalDF.to_csv(save_path, sep=";")

### Sort subset of tweets by their prediction probabilities and print some examples

In [10]:
#causalDF = causalDF[0:100].sort_values(by=["proba"], ascending=[False])
#causalDF.head()

In [11]:
#for i, row in causalDF.iterrows():
#    print(row["text"], row["proba"])
#    print()

In [15]:
sent = pd.read_csv("/home/adrian/Downloads/causal_sentences_predictions_part_0.csv")
for i, row in sent[0:20].iterrows():
    print()
    print(row["text"])
    print(row["causal_predictions"])


Hypo in the Boticelli rooms at the @USER and one of the staff members offered me a sweet cause he knew my pain #type1diabetes
1

Life & diabetes in a jar altogether !
0

It 's always worrying as a diabetic when your foot starts to hurt .
0

Mmm I love bacon and hardly any carbs for breakfast :raising_hands: :light_skin_tone: :smiling_face_with_sunglasses: #breakfastoftheday #diabetes #diabetic ...
1

* frantically applies for jobs so that i can afford to take care of my diabetic cat *
0

I 'm type 1 diabetic not caused by eating poorly , I 'm 150 lbs wet .
0

@USER " That does n't mean we should take care of the person who sits at home , eats poorly and gets diabetes .
1

@USER Dear lord that rack is sepeeated by type 1 and type 2 diabetes .
1

i have some diabetes amputations but i need help could u please :revolving_hearts:
0

He goes into this story about how he beat diabetes and lost 36lbs from being " on the vinegar "
1

@USER Having to cut back on carbs , my eye exam found evide

In [17]:
causes = pd.read_csv("/home/adrian/Downloads/cause_effect_predictions_part_0.csv")
causes.head()
"""
for i, row in causes[0:20].iterrows():
    print()
    print(row["text"])
    print(row["causal_predictions"])"""

ParserError: Error tokenizing data. C error: Expected 47 fields in line 4, saw 49


### Check predictions of causal sentence classifier 

In [15]:
dataPath = "data/Causality_tweets_data.xlsx"
data = pd.read_excel(dataPath, sheet_name="round4").sample(n=1000)
print(data["Causal association"].value_counts())
data.head()

0.0    861
1.0    139
Name: Causal association, dtype: int64


Unnamed: 0.1,Unnamed: 0,sentence,Intent,Cause,Effect,Causal association,Unnamed: 6
648,134.0,"@USER @USER while this is very funny , insulin...",,diabetics,need private insurance,1.0,0.868545
810,2628.0,diabetic retinopathy in both eyes .,,,,0.0,0.036845
1546,2785.0,sitting at a fundraiser listening to kids with...,,,,0.0,0.222685
399,925.0,Maybe everyshow can also have a character scen...,joke,,,0.0,0.80546
1197,3521.0,I think I may have type 2 diabetes .,,,,0.0,0.094678


In [20]:
trainingData = data[data["sentence"].str.split(" ").str.len() > 5] # keep only sentence with more than 3 tokens
trainingData = trainingData[~trainingData["sentence"].str.endswith("?")]
print(trainingData.shape)
text = trainingData["sentence"].values.tolist()

test_dataset = TweetDataSet(text, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


(954, 7)


In [21]:
# set inference arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
#    per_device_train_batch_size=64,  # batch size per device during training
    per_device_eval_batch_size=10,   # batch size for evaluation
    logging_dir='./logs',            # directory for storing logs
    seed=0,
#    local_rank = rank
)

# we only use Trainer for inference
trainer = Trainer(model=model, args=training_args)
print("build trainer on device:", training_args.device, "with n gpus:", training_args.n_gpu)

logits = trainer.predict(test_dataset)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 954
  Batch size = 10


build trainer on device: cpu with n gpus: 0


In [24]:
predictions = torch.argmax(torch.Tensor(logits.predictions),dim=1).flatten()
probas = torch.softmax(torch.Tensor(logits.predictions), dim = -1)[...,-1:].to('cpu').numpy().squeeze()

probas = pd.Series(probas)
predictions = pd.Series(predictions)
print(predictions.shape)

(954,)


In [30]:
print(len(text))
print(len(predictions))
print(len(probas))
print(len(trainingData["Causal association"]))
print(trainingData["Causal association"].shape)

954
954
954
954
(954,)


In [43]:
causalDF = pd.DataFrame({"text":text, "causal_predictions": predictions, "proba":probas, "real":trainingData["Causal association"].values.tolist()})
causalDF['causal_predictions'] = causalDF['causal_predictions'].astype(int)
causalDF['real'] = causalDF['real'].astype(int)
for i,row in causalDF.iterrows():
    print(row["text"])
    print("true:", row["real"], "pred:", row["proba"])
    print()

@USER @USER while this is very funny , insulin is actually not covered by our public health care and diabetics like myself still need private insurance
true: 1 pred: 0.6497282981872559

diabetic retinopathy in both eyes .
true: 0 pred: 0.2772119641304016

sitting at a fundraiser listening to kids with juvenile diabetes basically beg for funding .
true: 0 pred: 0.43385839462280273

Maybe everyshow can also have a character scene dedicated to glucose alerts so that #type1diabetes can ruin every last thing : rolling_on_the_floor_laughing :
true: 0 pred: 0.6416450142860413

I think I may have type 2 diabetes .
true: 0 pred: 0.3295542895793915

That ' s why it ' s complications of covid 19 as it killed them because of the virus , it ' s like saying someone with cancer getting shot and dying should n't be labeled as being shot as a
true: 0 pred: 0.729767382144928

And napapabayaan ko sya ever since .
true: 0 pred: 0.33289599418640137

@USER Low carb is perfect for my diabetes but not for loo

true: 0 pred: 0.426230251789093

Sven here determined giving Nightingale constant attention and she can't cure that diabetes he poured .
true: 0 pred: 0.5454960465431213

5 Insulin pens costs 1800 to 2000 per month .
true: 0 pred: 0.2940419614315033

@USER I live with a bro , he has serious covid symptoms , he is also diabetic , I went to mbagathi for test , I was refad to Kenyata .
true: 0 pred: 0.7109927535057068

So many highs and lows today , I ' m definitely on the diabetic rollercoaster : roller_coaster : Hoping for a 7 mile run tomorrow as well : crossed_fingers : : light_skin_tone :
true: 1 pred: 0.7146748900413513

@USER That is some of the best diabetes reporting I have ever seen .
true: 0 pred: 0.403627872467041

@USER Like with insulin , on which there is just plain profit , since inventors gave it free to use , so zero R & D expenses , yet now costs a lot to buy .
true: 1 pred: 0.7046757936477661

American gets diabetes and will likely blame China .
true: 0 pred: 0.2671883

true: 0 pred: 0.4324653446674347

Off to an ok start this year .
true: 0 pred: 0.18769855797290802

And they have serious breakthrough symptoms .
true: 0 pred: 0.1810918003320694

hoooooooy over I just received stocks sa Insulin , 400 vials Isophane and 400 vials regular + isophane 30/70 .
true: 0 pred: 0.6815111637115479

@USER If it ' s candy & it does n't involve chocolate / caramel , for this diabetic , it ' s a waste of carbs .
true: 0 pred: 0.6554430723190308

Hmm a cpl wks ago I spoke to my GP wrt #insulin availability & he said things wd be fine , but gave me a script anyway , ( so I ' m covered until 2021 ) .
true: 0 pred: 0.6879940629005432

I ' m down to my last 2 prefilled injectors so I have about a month left .
true: 0 pred: 0.6108126640319824

This forces things like test strips to be purchased out of pocket .
true: 0 pred: 0.40433841943740845

It ' s a devastating disease affecting millions worldwide RIP Janice Nero : red_heart :
true: 0 pred: 0.4969315528869629

Never 

Now as far as diabetes I have two people in my family that are at least 20 years plus younger than me that have diabetes and this was due to their diet and the Libations that they were taking in their system
true: 1 pred: 0.7406167984008789

@USER They do this at diabetes camp .
true: 0 pred: 0.20907002687454224

The pharmacist said he put it on hold .
true: 0 pred: 0.26636525988578796

“ Got any superpowers ” “ I got type 1 and type 2 diabetes .
true: 0 pred: 0.5371349453926086

@USER Since beating my Type 2 Diabetes with exercise and using the 5 Star rating on foods to ensure I am making better choices it has been a good thing .
true: 1 pred: 0.6092765927314758

And cut it ' s leg off " Jordan
true: 0 pred: 0.27875810861587524

There ' s no way there ' s a single person with diabetes in leadership chiming in AT LEAST about the optics of this .
true: 0 pred: 0.6614813804626465

Pretty sick of insurance and diabetes , havent checked my blood sugar in 6 months , basically waiting for th

true: 0 pred: 0.7141219973564148

Regálame unas tenis de esas de diabético I ' m ur biggest fan man :p leading_face :
true: 0 pred: 0.6301742792129517

@USER You are a good dad , looking out for their best interest , saving them from cavities and diabetes ...
true: 0 pred: 0.6071726083755493

@USER My insulin pump doesnt allow me to calculate using 333g of carbohydrates , but with 225g , it tells me to give 18,5 units of insulin which it doesnt allow me to give in one sitting .
true: 0 pred: 0.6582938432693481

Shits expensive and i like to live : grinning_face_with_sweat : : grinning_face_with_sweat :
true: 0 pred: 0.4803200960159302

Our house is worthless , our pension worth less due to falling pound .
true: 0 pred: 0.37275806069374084

Three months since getting my Libre , and my HBA 1c is down from 70 to 56 !
true: 1 pred: 0.5825958847999573

nice way to fool world .. we know the quality of chinese products ...
true: 0 pred: 0.351299911737442

all the good intentions are there but