In [1]:
import pandas as pd
import numpy as np
import spacy 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, matthews_corrcoef
from transformers import BertForSequenceClassification, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import random
import os
import torch.nn.functional as F
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
import transformers
from tqdm import tqdm, trange
from utils import normalizeTweet, split_into_sentences, bio_tagging, create_training_data



data = pd.read_excel("/home/adrian/workspace/causality/Causal-associations-diabetes-twitter/data/Causality + hypoglycemia.xlsx", sheet_name=">5000_samples_")
#data = pd.read_excel("/home/adrian/Downloads/Causality + hypoglycemia.xlsx", sheet_name=">5000_samples_")
print("Total count:", data.shape[0])
data = data[data["Causal association"].notnull()]
print("Labeled count:", data.shape[0])

data.head()

Total count: 5456
Labeled count: 5000


Unnamed: 0,id,text,full_text,Intent,Cause,Effect,Causal association,Charline association0=no;1=yes,Remarks
0,908171203029868545,"tonight , I learned my older girl will back he...","tonight , I learned my older girl will back he...",,,,0.0,,
1,1203645589214367745,USER USER I knew diabetes and fibromyalgia wer...,USER USER I knew diabetes and fibromyalgia wer...,joke,,,0.0,,
2,1310596731063525376,‚¨á Ô∏è ‚¨á Ô∏è ‚¨á Ô∏è THIS ‚¨á Ô∏è ‚¨á Ô∏è ‚¨á Ô∏è My wife has type ...,‚¨á Ô∏è ‚¨á Ô∏è ‚¨á Ô∏è THIS ‚¨á Ô∏è ‚¨á Ô∏è ‚¨á Ô∏è My wife has type ...,mS,,,0.0,,
3,1125198453167022085,USER Cheers ! Have one for this diabetic too !,USER Cheers ! Have one for this diabetic too !,mS,,,0.0,,
4,1248600944138268673,USER Additionally the medicines are being char...,USER Additionally the medicines are being char...,,medicines are being charged at MRP,costing much higher,1.0,,


### Interrater-reliabilty measure

In [2]:
from sklearn.metrics import cohen_kappa_score

charline = data[data["Charline association0=no;1=yes"].notnull()]
coder1 = charline["Causal association"].values
coder2 = charline["Charline association0=no;1=yes"]
score = cohen_kappa_score(coder1,coder2)
#print('Cohen\'s Kappa:',score)

### Data Preprocessing

In [3]:
data["Causal association"].value_counts()

0.0    3720
1.0    1280
Name: Causal association, dtype: int64

In [6]:
trainingData = create_training_data(data, min_words_in_sentences=3)
trainingData.head()

Unnamed: 0,tweet,Causal association,BIOtags
0,"tonight , I learned my older girl will back he...",0.0,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,‚¨á Ô∏è ‚¨á Ô∏è ‚¨á Ô∏è THIS ‚¨á Ô∏è ‚¨á Ô∏è ‚¨á Ô∏è My wife has type ...,0.0,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
2,I'm a trans woman .,0.0,"[O, O, O, O, O, O]"
3,"Both of us could use a world where "" brave and...",0.0,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"Make a world where people can just be , withou...",0.0,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [77]:
for i,row in trainingData.sample(n=20).iterrows():
    print("\n")
    print(row["tweet"])
    print(row["BIOtags"])



Good news .
['O', 'O', 'O']


Think about it : they have no interest in whether or not you can get your insulin , but they're gonna sure as hell make certain you can't get your contraceptives .
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


‚Äú I'm not a doctor but you have scoliosis or diabetes ‚Äù - USER okay thank you üòÇ
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


USER USER USER If he cured cancer with his own blood they'd march out every diabetic they could find and scream " why do you hate diabetics !
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


Not the way pharma plays .
['O', 'O', 'O', 'O', 'O', 'O']


‚Äù Maybe mine Is invisible ü§∑ ü§∑ ‚Äç ‚ôÄ Ô∏è üò∂
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [7]:
trainingData["Causal association"].value_counts()

0.0    7607
1.0    1019
Name: Causal association, dtype: int64

### Training

In [8]:
text = trainingData["tweet"].map(normalizeTweet).values.tolist()
labels = trainingData["Causal association"].values.tolist()
train_texts, test_texts, train_labels, test_labels = train_test_split(text, labels, test_size=0.2)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.2)
print("Train: {}".format(len(train_texts)))
print("Val: {}".format(len(val_texts)))
print("Test: {}".format(len(test_texts)))


Train: 5520
Val: 1380
Test: 1726


In [54]:
#train, validate, test = np.split(trainingData.sample(frac=1, random_state=42)
#                                ,[int(.65*len(trainingData)), int(.8*len(trainingData))])
train, validate, test = np.split(trainingData.sample(n=10).sample(frac=1, random_state=42)
                                ,[int(.65*len(trainingData)), int(.8*len(trainingData))])
print("Train:", train.shape)
print("Validate:", validate.shape)
print("Test:", test.shape)

Train: (10, 3)
Validate: (0, 3)
Test: (0, 3)


In [58]:
# Transform labels + encodings into Pytorch DataSet object (including __len__, __getitem__)
class TweetDataSet(torch.utils.data.Dataset):
    def __init__(self, text, labels, bio_tags, tokenizer):
        self.text = text
        self.labels = labels
        self.tokenizer = tokenizer
        self.bio_tags = bio_tags
        self.tag2id = {label: idx for idx, label in enumerate(["0", "B-C", "I-C", "B-E", "I-E"])}
        self.id2tag = {id:tag for tag,id in tag2id.items()}

    def __getitem__(self, idx):
        inputs = self.tokenizer(self.text, padding=True, truncation=True, return_token_type_ids=True)
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]
        return {
                "input_ids" : torch.tensor(ids[idx], dtype=torch.long)
              , "mask" : torch.tensor(mask[idx], dtype=torch.long)
              , "token_type_ids" : torch.tensor(token_type_ids[idx], dtype=torch.long)
              , "labels" : torch.tensor(self.labels[idx], dtype=torch.float)
              , "bio_tags" : torch.tensor(list(map(lambda bioTags: bioTags_to_ids[bioTags], self.bio_tags[idx])), dtype=torch.int)
        }

    def __len__(self):
        return len(self.labels)

    def extend_tags(tokens_old, tags_old):
        tags = ["0"] # add for start token <CLS>
        for token_old, tag in zip(tokens_old, labels_old):
            for i, sub_token in enumerate(self.tokenizer.tokenize(token_old)):
                if i == 0: # first sub token
                    tags.append(tag2id[tag])
                else:
                
        tags.append("0") # 0 for 
    
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")

train_dataset = TweetDataSet(train["tweet"].map(normalizeTweet).values.tolist()
                           , train["Causal association"].values.tolist()
                           , train["BIOtags"].values.tolist()
                           , tokenizer)
val_dataset = TweetDataSet(validate["tweet"].map(normalizeTweet).values.tolist()
                           , validate["Causal association"].values.tolist()
                           , validate["BIOtags"].values.tolist()
                           , tokenizer)
test_dataset = TweetDataSet(test["tweet"].map(normalizeTweet).values.tolist()
                           , test["Causal association"].values.tolist()
                           , test["BIOtags"].values.tolist()
                           , tokenizer)
print(len(train_dataset))
print(len(val_dataset))
print(len(test_dataset))

# put data to batches
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
validation_loader = DataLoader(val_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


10
0
0


ValueError: num_samples should be a positive integer value, but got num_samples=0

In [89]:
#tags = train_dataset.bio_tags
tag = ['B-C', 'B-C', 'B-E', 'O', "O"]
text = "#t1diabetic #diabetes #lowbloodsugars HTTPURL gunships"
#text = train_dataset.text 
inputs = tokenizer(text, padding=True, truncation=True, return_token_type_ids=True)
ids = inputs["input_ids"]
mask = inputs["attention_mask"]
token_type_ids = inputs["token_type_ids"]
print(text)
print(tag)
print(ids)
print(token_type_ids)
print(mask)
print(tokenizer.decode(ids))
print(tokenizer.convert_ids_to_tokens(ids))
print(tokenizer.special_tokens_map)
print(tokenizer.special_tokens_map_extended)
dir(tokenizer)

#t1diabetic #diabetes #lowbloodsugars HTTPURL gunships
['B-C', 'B-C', 'B-E', 'O', 'O']
[0, 2733, 1032, 49488, 85, 15015, 55048, 17535, 23746, 256, 10, 6302, 4828, 2]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
<s> #t1diabetic #diabetes #lowbloodsugars HTTPURL gunships </s>
['<s>', '#t@@', '1@@', 'diabetic', '#@@', 'diabetes', '#low@@', 'blood@@', 'sugar@@', 's', 'HTTPURL', 'gun@@', 'ships', '</s>']
{'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}
{'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}


['SPECIAL_TOKENS_ATTRIBUTES',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_add_tokens',
 '_additional_special_tokens',
 '_batch_encode_plus',
 '_batch_prepare_for_model',
 '_bos_token',
 '_cls_token',
 '_convert_id_to_token',
 '_convert_token_to_id',
 '_convert_token_to_id_with_added_voc',
 '_decode',
 '_decode_use_source_tokenizer',
 '_encode_plus',
 '_eos_token',
 '_eventual_warn_about_too_long_sequence',
 '_from_pretrained',
 '_get_padding_truncation_strategies',
 '_mask_token',
 '_pad',
 '_pad_token',
 '_pad_token_type_id',
 '_push_to_hub',
 '_save_pretrained',
 '_sep_token',
 '_tokenize',
 '_unk_token',
 'add_from_

In [None]:
# 1) Trainer 
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred, labels):
    """
        Dataset is unbalanced -> measure weighted metrics
        Calculate metrics for each label, and find their average wieghted by support (Number of true instances for each label)
        This alters 'macro' to account for label imbalance;
        it can result in an F-Score taht is not between precision and recall
    """
    precision, recall, f1, _ = precision_recall_fscore_support(labels, pred, average='weighted') #binary
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }



class CausalNER(torch.nn.Module):
    """ Model Bert"""
    def __init__(self):
        super(CausalNER, self).__init__()
        self.num_labels = 5 # B-C, I-C, B-E, I-E, O
        self.bert = transformers.BertModel.from_pretrained("vinai/bertweet-base")
        self.dropout = torch.nn.Dropout(0.3)
        self.linear1 = torch.nn.Linear(768, 256)
        self.linear2 = torch.nn.Linear(256, self.num_labels)
        self.softmax = torch.nn.Softmax(-1)
        
    def forward(self, input_ids, attention_mask, token_type_ids):
#        _, output_1 = self.bert(input_ids, attention_mask = attention_mask, token_type_ids=token_type_ids, return_dict=False) # if output 1 is our cls token
        output = self.bert(input_ids, attention_mask = attention_mask, token_type_ids=token_type_ids, return_dict=False) # if output 1 is our cls token
        output_2 = self.dropout(output)
        output_3 = self.linear(output_2)
        output_4 = self.dropout(output_3)
        output_5 = self.linear2(output_4)
        logit = self.softmax(output_5)
        return logit


## Model parameters
batchsize_train = 16
lr = 5e-5
adam_eps = 1e-8
epochs = 3
num_warmup_steps = 0
num_training_steps = len(train_loader)*epochs

# Store our loss and learning rate for plotting
train_loss_set = []
learning_rate = []


device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = CausalNER()
model.to(device)

# fine-tune only the task-specific parameters -> Vivek? 
for param in model.bert.parameters():
    param.requires_grad = False
    



optim = AdamW(model.parameters(), lr=lr, eps=adam_eps)
scheduler = get_linear_schedule_with_warmup(optim, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)
loss_fn = CrossEntropyLoss()


In [57]:
for epoch in trange(1, epochs+1, desc='Epoch'):
    print("<" + "="*22 + F" Epoch {epoch} "+ "="*22 + ">")

    
    ############ training eval metrics ######################
    tr_accuracy, tr_mcc_accuracy, nb_tr_steps = 0, 0, 0 # Tracking variables
    train_loss = []
    train_acc = []
    train_prec = []
    train_rec = []
    train_f1 = []
    
    #########################################################
    
    
    for batch in tqdm(train_loader):
        optim.zero_grad() # gradients get accumulated by default -> clear previous accumulated gradients
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        #labels = batch['labels'].to(device)
        bio_tags = batch['bio_tags'].to(device)
        
        ################################################
        model.train() # set model to training mode
        logits = model(**{"input_ids":input_ids, "attention_mask":attention_mask, "token_type_ids":token_type_ids}) # forward pass

        ################################################                  
        loss = loss_fn(logits, bio_tags)
        print("loss:", loss)
        loss.backward() # backward pass
        optim.step()    # update parameters and take a steup using the computed gradient
        scheduler.step()# update learning rate scheduler
        train_loss.append(loss.item())
        
                          
        ################## Training Performance Measures ##########
        # move logits and labels to CPU
        logits = logits.detach().to('cpu').numpy()
#        label_ids = labels.to('cpu').numpy()
        label_ids = bio_tags.to('cpu').numpy()
        
        pred_flat = np.argmax(logits, axis=1).flatten() # convert logits to list of predicted labels
        labels_flat = label_ids.flatten()
                          
        tr_accuracy += accuracy_score(labels_flat, pred_flat)
        tr_mcc_accuracy += matthews_corrcoef(labels_flat, pred_flat)                          
                
        metrics = compute_metrics(pred_flat, labels_flat)
        train_acc.append(metrics["accuracy"])
        train_prec.append(metrics["precision"])
        train_rec.append(metrics["recall"])
        train_f1.append(metrics["f1"])
                          
        nb_tr_steps += 1
           
    print(F'\n\tTraining Loss: {np.mean(train_loss)}')
    print(F'\n\tTraining acc: {np.mean(train_acc)}')
    print(F'\n\tTraining MCC acc: {tr_mcc_accuracy / nb_tr_steps}')
    print(F'\n\tTraining prec: {np.mean(train_prec)}')
    print(F'\n\tTraining rec: {np.mean(train_rec)}')
    print(F'\n\tTraining f1: {np.mean(train_f1)}')
                          
                          
    # store the current learning rate
    for param_group in optim.param_groups:
        print("\n\tCurrent Learning rate: ", param_group['lr'])
        learning_rate.append(param_group['lr'])
    

    ############# Validation ################
    
    eval_accuracy, eval_mcc_accuracy, nb_eval_steps = 0, 0, 0 # Tracking variables
    val_accuracy = []
    val_loss = []
    val_acc = []
    val_prec = []
    val_rec = []
    val_f1 = []

    # Evaluate data for one epoch
    for batch in tqdm(validation_loader):
        batch = tuple(batch[t].to(device) for t in batch)      # batch to GPU
        b_input_ids, b_input_mask, b_token_type_ids, b_labels = batch     # unpack inputs from dataloader
        
        with torch.no_grad(): # tell model not to compute or store gradients -> saves memory + speeds up validation
            model.eval() # put model in evaluation mode for validation set
            logits = model(**{"inputs_ids":b_inputs_ids, "attention_mask":b_input_mask, "token_type_ids":b_token_type_ids}) # forward pass, calculates logit predictions

        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())
                              
        # move logits and labels to CPU
        logits = logits.detach().to('cpu').numpy()
        label_ids = b_labels.to('cpu').numpy()
        

        pred_flat = np.argmax(logits, axis=1).flatten() # convert logits to list of predicted labels
        labels_flat = label_ids.flatten()
        
        eval_accuracy += accuracy_score(labels_flat, pred_flat)
        eval_mcc_accuracy += matthews_corrcoef(labels_flat, pred_flat)
        
        metrics = compute_metrics(pred_flat, labels_flat)
        val_acc.append(metrics["accuracy"])
        val_prec.append(metrics["precision"])
        val_rec.append(metrics["recall"])
        val_f1.append(metrics["f1"])
                              
        nb_eval_steps += 1
        
    print(F'\n\tValidation Loss: {np.mean(val_loss)}')
    print(F'\n\tValidation acc: {np.mean(val_acc)}')
    print(F'\n\tValidation MCC acc: {eval_mcc_accuracy / nb_eval_steps}')
    print(F'\n\tValidation prec: {np.mean(val_prec)}')
    print(F'\n\tValidation rec: {np.mean(val_rec)}')
    print(F'\n\tValidation f1: {np.mean(val_f1)}')
    


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s][A
Epoch:   0%|          | 0/3 [00:00<?, ?it/s]






RuntimeError: stack expects each tensor to be equal size, but got [16] at entry 0 and [14] at entry 1

### Evaluation on the test dataset

In [None]:

############ test eval metrics ######################
test_mcc_accuracy, nb_test_steps = 0, 0, 0 # Tracking variables
test_loss = []
test_acc = []
test_prec = []
test_rec = []
test_f1 = []

########################################################
for batch in tqdm(test_loader):
    batch = tuple(batch[t].to(device) for t in batch)      # batch to GPU
    b_input_ids, b_input_mask, b_token_type_ids, b_labels = batch     # unpack inputs from dataloader

    with torch.no_grad(): # tell model not to compute or store gradients -> saves memory + speeds up validation
        model.eval() # put model in evaluation mode for validation set
        logits = model(**{"inputs_ids"=b_inputs_ids, "attention_mask"=b_input_mask, "token_type_ids"=b_token_type_ids) # forward pass, calculates logit predictions

    loss = loss_fn(logits, b_labels)
    test_loss.append(loss.item())

    # move logits and labels to CPU
    logits = logits.detach().to('cpu').numpy()
    label_ids = b_labels.to('cpu').numpy()


    pred_flat = np.argmax(logits, axis=1).flatten() # convert logits to list of predicted labels
    labels_flat = label_ids.flatten()

    test_mcc_accuracy += matthews_corrcoef(labels_flat, pred_flat)
    metrics = compute_metrics(pred_flat, labels_flat)
    test_acc.append(metrics["accuracy"])
    test_prec.append(metrics["precision"])
    test_rec.append(metrics["recall"])
    test_f1.append(metrics["f1"])

    nb_test_steps += 1

print(F'\n\tTest Loss: {np.mean(test_loss)}')
print(F'\n\tTest acc: {np.mean(test_acc)}')
print(F'\n\tTest MCC acc: {eval_mcc_accuracy / nb_eval_steps}')
print(F'\n\tTest prec: {np.mean(test_prec)}')
print(F'\n\tTest rec: {np.mean(test_rec)}')
print(F'\n\tTest f1: {np.mean(test_f1)}')


In [None]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=10,              # total number of training epochs
    per_device_train_batch_size=64,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

# AutoModelForSequenceClassification adds a fully connected layer after BERT
model = AutoModelForSequenceClassification.from_pretrained("vinai/bertweet-base")


trainer = Trainer(
    model=model,                         # the instantiated ü§ó Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()



In [37]:
import transformers

In [41]:
model.parameters

<bound method Module.parameters of CausalityBERT(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,),

### Save model

In [None]:
torch.save(model.state_dict(), "finetuned-35-epochs.pth")

### Load model locally

In [None]:
device = torch.device("cuda", if torch.cuda.is_available() else "cpu")
model = CausalityBERT()
model.load_state_dict(torch.load("finetuned-35-epochs.pth"))
model.to(device)
model.eval()

# Questions to Vivek?


In [None]:
If there is only one cause and no effect ; or only one effect and no cause => ignore ? 

In [9]:
data.head()

Unnamed: 0,id,text,full_text,Intent,Cause,Effect,Causal association,Charline association0=no;1=yes,Remarks
0,908171203029868545,"tonight , I learned my older girl will back he...","tonight , I learned my older girl will back he...",,,,0.0,,
1,1203645589214367745,USER USER I knew diabetes and fibromyalgia wer...,USER USER I knew diabetes and fibromyalgia wer...,joke,,,0.0,,
2,1310596731063525376,‚¨á Ô∏è ‚¨á Ô∏è ‚¨á Ô∏è THIS ‚¨á Ô∏è ‚¨á Ô∏è ‚¨á Ô∏è My wife has type ...,‚¨á Ô∏è ‚¨á Ô∏è ‚¨á Ô∏è THIS ‚¨á Ô∏è ‚¨á Ô∏è ‚¨á Ô∏è My wife has type ...,mS,,,0.0,,
3,1125198453167022085,USER Cheers ! Have one for this diabetic too !,USER Cheers ! Have one for this diabetic too !,mS,,,0.0,,
4,1248600944138268673,USER Additionally the medicines are being char...,USER Additionally the medicines are being char...,,medicines are being charged at MRP,costing much higher,1.0,,


In [11]:
data[0:3].to_csv("/home/adrian/Downloads/tesst.csv")

In [18]:
data = pd.read_csv("/home/adrian/Downloads/Adrian.csv", sep=";")
data.head()

Unnamed: 0,Date,User ID,User Name,User Role,Node Id,Node Title,Channel,Source,Time spent\
0,"jeu, 02/25/2021 - 09:03",2416\'a0\'bb;\'a0\'bbalbert@hotmail.it,Standard,1388,Passione / Romeo Castellucci,ClassicAll,channel,1780\,
1,"jeu, 02/25/2021 - 00:34",2394\'a0\'bb;\'a0\'bbjohn@gmx.fr,Member,1094,Trag\'e9die / Olivier Dubois,ClassicAll,channel,560\,
2,"mer, 02/24/2021 - 21:19",0,78.233.236.180,Anonyme,1374,Tchaikovsky / La Dame de Pique,ClassicAll,live,70\
3,"mer, 02/24/2021 - 21:18",0,78.233.236.180,Anonyme,1374,Tchaikovsky / La Dame de Pique,ClassicAll,live,50\
4,"mer, 02/24/2021 - 16:20",2231,didier@skynet.be,Standard,1530,Le Livre de Dina / Herbj\'f8rg Wassmo,Dramateek,channel,2390}


In [29]:
!python --version

Python 3.8.8
