# Environment Setup

In [1]:
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings("ignore")

import torch
import os, random
import numpy as np
from tqdm import tqdm

# Reproducibility setup: one shared seed for every random number generator
# used in this notebook, plus deterministic cuDNN behaviour.
seed = 10

# NOTE(review): PYTHONHASHSEED is set after the interpreter has started, so it
# presumably only influences child processes -- confirm this is the intent.
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)

# Seed the torch generators (CPU, current CUDA device, all CUDA devices).
for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed)

# Disable cuDNN auto-tuning and request deterministic kernels.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

import pandas as pd

from transformers import AutoTokenizer

In [2]:
# Pick the first GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# 1a. ADE Detection

### Load ADE detection model

In [3]:
# Load the pickled ADE classifier onto the device selected earlier in the
# notebook. Without map_location, a checkpoint saved on a GPU cannot be
# restored on a CPU-only machine.
# NOTE(review): this unpickles a whole model object; only load checkpoints
# from a trusted source. Newer torch releases may also require an explicit
# weights_only=False here -- confirm against the installed version.
ade_model = torch.load("Adaptive_Learner/whole_model.pt", map_location=device)
# Inference only: switch off dropout / batch-norm updates so predictions are
# deterministic.
ade_model.eval()
ade_tokenizer = AutoTokenizer.from_pretrained("Adaptive_Learner")

### Load Validation Data

In [4]:
# Task 1a validation split: both files are header-less, tab-separated tables.
tweets = pd.read_table("data/smm4h_22/Task1-DataForParticipants/Task1a/valid/tweets.tsv", header=None)
labels = pd.read_table("data/smm4h_22/Task1-DataForParticipants/Task1a/valid/class.tsv", header=None)

In [5]:
# Column 0 holds the tweet id and column 1 the tweet text (files were read
# without a header row).
valid_tweets = list(tweets[1])
valid_tweet_ids = list(tweets[0])

# Ids listed in the class file are the ADE-positive tweets.
ade_tweet_ids = list(labels[0])

# Use a set for membership tests so labelling is linear rather than
# O(len(tweets) * len(ade ids)).
ade_id_lookup = set(ade_tweet_ids)

# NOTE: `labels` is rebound here from the class DataFrame to the list of
# per-tweet target strings, so this cell is not safe to re-run on its own.
labels = [
    "ADE" if tweet_id in ade_id_lookup else "NoADE"
    for tweet_id in valid_tweet_ids
]

tweets["target"] = labels

In [6]:
# Give the positional columns readable names.
task1a_columns = ["tweet_id", "tweet", "target"]
tweets.columns = task1a_columns

In [7]:
# Show the labelled validation frame (tweet_id, tweet, target) as a visual check.
tweets

Unnamed: 0,tweet_id,tweet,target
0,SMM4H2022ykI8vN7jZYnV57AM,@USER_________ i found the humira to fix all m...,ADE
1,SMM4H2022uCZV2SRsCe4vzjFm,@USER__________ have to go to a doc now to see...,ADE
2,SMM4H20229Aha6m4XERqYdFWf,06.30 day 14 Rivaroxaban diary. Thanks to para...,ADE
3,SMM4H2022UAvDTQWOIacvBkzp,rt @USER_______: my philly dr prescribed me tr...,ADE
4,SMM4H2022qNHntuJnkevkahGr,ciprofloxacin: how do you expect to sleep when...,ADE
...,...,...,...
904,SMM4H2022CgATBD494ehjw7Aj,They've used sildenafil to treat hypertension....,NoADE
905,SMM4H2022bRdYe0JKrY3chH6g,@USER______ thx! the lamictal was a lifesaver ...,NoADE
906,SMM4H2022BNvpfxDmxQy7BPsu,Researchers to study effectiveness of atorvast...,NoADE
907,SMM4H20224d7PvtBV42vXzUgw,Oral #bupe Buprenorphine Hemiadipate rapid dos...,NoADE


### Prediction

In [8]:
valid_tweets = tweets["tweet"]

# Order matters: the position of the highest score is used as an index into
# this list.
label_list = ["ADE", "NoADE"]

predicted_labels = list()

for valid_tweet in valid_tweets:
    inputs = ade_tokenizer(valid_tweet, return_tensors="pt")
    # Use the notebook-wide `device` instead of a hard-coded "cuda" so the
    # loop also runs on CPU-only machines.
    inputs = inputs.to(device)
    with torch.no_grad():
        score = ade_model(**inputs).to("cpu").tolist()[0]
    # First index of the maximum score (ties resolve to the earliest label).
    index = score.index(max(score))
    final_label = label_list[index]
    predicted_labels.append(final_label)
        

### Performance Report

In [9]:
from sklearn.metrics import classification_report

In [10]:
# Pass `labels` explicitly so each row of the report is tied to the matching
# entry of `target_names`, instead of relying on the implicit sorted order of
# whatever labels happen to occur in the data.
print(classification_report(list(tweets["target"]), predicted_labels, labels=label_list, target_names=label_list))

              precision    recall  f1-score   support

         ADE       0.71      0.72      0.72        65
       NoADE       0.98      0.98      0.98       844

    accuracy                           0.96       909
   macro avg       0.85      0.85      0.85       909
weighted avg       0.96      0.96      0.96       909



# 1b. Span Extraction

### Load Validation Data

In [11]:
# Task 1b validation split: header-less, tab-separated tweets and gold spans.
tweets = pd.read_table('data/smm4h_22/Task1-DataForParticipants/Task1b/valid/tweets.tsv', header=None)
labels = pd.read_table('data/smm4h_22/Task1-DataForParticipants/Task1b/valid/spans.tsv', header=None)

In [12]:
# Join tweets with their span annotations on the first column (column 0).
span_df = tweets.merge(labels, on=0)

In [13]:
# Readable names for the merged tweet/span columns.
span_columns = ["tweet_id", "tweet", "label", "start", "end", "span"]
span_df.columns = span_columns

In [14]:
# Show the merged tweet/span frame as a visual check.
span_df

Unnamed: 0,tweet_id,tweet,label,start,end,span
0,SMM4H2022ykI8vN7jZYnV57AM,@USER_________ i found the humira to fix all m...,ADE,119,125,nerves
1,SMM4H2022ykI8vN7jZYnV57AM,@USER_________ i found the humira to fix all m...,ADE,126,139,muscle spasms
2,SMM4H2022uCZV2SRsCe4vzjFm,@USER__________ have to go to a doc now to see...,ADE,61,68,gaining
3,SMM4H2022uCZV2SRsCe4vzjFm,@USER__________ have to go to a doc now to see...,ADE,91,110,gain like 50 pounds
4,SMM4H20229Aha6m4XERqYdFWf,06.30 day 14 Rivaroxaban diary. Thanks to para...,ADE,118,134,frontal headache
...,...,...,...,...,...,...
82,SMM4H2022cjwbGQbnkpVjjJzR,"Fucking Vyvanse, giving me cotton mouth. UGH.",ADE,27,39,cotton mouth
83,SMM4H2022lX1A8RVuySkDzAB2,between the fucking redbull and vyvanse i popp...,ADE,84,104,couldn't fall asleep
84,SMM4H2022qZC2BPG2BW7UC175,"rt @USER_______: vyvanse, commonly known as oc...",ADE,45,48,OCD
85,SMM4H20226b5WyZPTAJ7qL3dE,rt @USER_______: @USE when are you going to do...,ADE,63,71,addicted


In [15]:
# One tweet text per row of span_df, used as QA context below.
ade_tweets = span_df["tweet"].tolist()

### Load QA model

In [16]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
model_name = "deepset/roberta-base-squad2"
# Pipeline device index: 0 is the first GPU, -1 is the CPU. Fall back to the
# CPU when CUDA is unavailable instead of failing on a hard-coded GPU index.
qa_device_index = 0 if torch.cuda.is_available() else -1
nlp = pipeline('question-answering', model=model_name, tokenizer=model_name, device=qa_device_index)

2022-07-30 08:22:24.202708: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-07-30 08:22:26.397437: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-30 08:22:26.404709: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-30 08:22:26.406246: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-30 08:22:26.408172: I tensorflow/core/

### Prediction

In [17]:
# Ask the QA pipeline the same question for every tweet and collect the
# answer text together with its character offsets.
question = 'What are the side effects?'

qa_spans, span_starts, span_ends = [], [], []
for context_text in ade_tweets:
    answer = nlp({'question': question, 'context': context_text})
    qa_spans.append(answer["answer"])
    span_starts.append(answer["start"])
    span_ends.append(answer["end"])

In [18]:
# Take an explicit copy: new columns are assigned to this frame afterwards,
# and assigning to a slice of span_df would be chained assignment
# (SettingWithCopyWarning, hidden here because warnings are silenced).
task_1b_df = span_df[["tweet_id", "tweet", "label"]].copy()

In [19]:
# Attach the QA predictions (answer text and character offsets) as columns.
for column_name, column_values in (
    ("span", qa_spans),
    ("start", span_starts),
    ("end", span_ends),
):
    task_1b_df[column_name] = column_values

In [20]:
# Show the predicted spans next to their tweets as a visual check.
task_1b_df

Unnamed: 0,tweet_id,tweet,label,span,start,end
0,SMM4H2022ykI8vN7jZYnV57AM,@USER_________ i found the humira to fix all m...,ADE,nerves/muscle spasms,118,138
1,SMM4H2022ykI8vN7jZYnV57AM,@USER_________ i found the humira to fix all m...,ADE,nerves/muscle spasms,118,138
2,SMM4H2022uCZV2SRsCe4vzjFm,@USER__________ have to go to a doc now to see...,ADE,now i have to lose it,118,139
3,SMM4H2022uCZV2SRsCe4vzjFm,@USER__________ have to go to a doc now to see...,ADE,now i have to lose it,118,139
4,SMM4H20229Aha6m4XERqYdFWf,06.30 day 14 Rivaroxaban diary. Thanks to para...,ADE,"frontal headache, 1/2",118,139
...,...,...,...,...,...,...
82,SMM4H2022cjwbGQbnkpVjjJzR,"Fucking Vyvanse, giving me cotton mouth. UGH.",ADE,cotton mouth,27,39
83,SMM4H2022lX1A8RVuySkDzAB2,between the fucking redbull and vyvanse i popp...,ADE,couldn't fall asleep for the life of me.,84,124
84,SMM4H2022qZC2BPG2BW7UC175,"rt @USER_______: vyvanse, commonly known as oc...",ADE,"vyvanse, commonly known as ocd in a pill.",17,58
85,SMM4H20226b5WyZPTAJ7qL3dE,rt @USER_______: @USE when are you going to do...,ADE,addicted,63,71


### Performance Report

In [21]:
# Predicted spans (QA answers) and gold spans, row-aligned.
predicted_label = task_1b_df["span"].tolist()
original_label = span_df["span"].tolist()

In [22]:
# A prediction counts as a hit ('1') when the gold span occurs verbatim
# (case-sensitive substring) inside the predicted span, otherwise '0'.
labels = [
    str(1) if gold_span in predicted_span else str(0)
    for gold_span, predicted_span in zip(original_label, predicted_label)
]

In [23]:
# Every row has a gold span, so the reference label is always '1'.
gold_labels = ['1' for _ in labels]

In [24]:

# Pass `labels` explicitly: gold_labels only ever contains '1', so without it
# the two-entry target_names list stops matching the observed classes as soon
# as every prediction is a hit.
label_list = ['0', '1']
print(classification_report(gold_labels, labels, labels=label_list, target_names=label_list))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.63      0.77        87

    accuracy                           0.63        87
   macro avg       0.50      0.32      0.39        87
weighted avg       1.00      0.63      0.77        87



# 1c. Normalization

In [25]:
from fuzzywuzzy import process
import pickle
import re

### Load Validation Data

In [26]:
# Task 1c validation split: header-less, tab-separated tweets and normalized spans.
tweets = pd.read_table('data/smm4h_22/Task1-DataForParticipants/Task1c/valid/tweets.tsv', header=None)
labels = pd.read_table('data/smm4h_22/Task1-DataForParticipants/Task1c/valid/spans_norm.tsv', header=None)

In [27]:
# Join tweets with their normalized span annotations on column 0.
norm_df = tweets.merge(labels, on=0)

In [28]:
# Readable names for the merged tweet/span/code columns.
norm_columns = ["tweet_id", "tweets", "label", "start", "end", "span", "code"]
norm_df.columns = norm_columns

In [29]:
# Show the merged normalization frame as a visual check.
norm_df

Unnamed: 0,tweet_id,tweets,label,start,end,span,code
0,SMM4H2022ykI8vN7jZYnV57AM,@USER_________ i found the humira to fix all m...,ADE,119,125,nerves,10029177
1,SMM4H2022ykI8vN7jZYnV57AM,@USER_________ i found the humira to fix all m...,ADE,126,139,muscle spasms,10028334
2,SMM4H2022uCZV2SRsCe4vzjFm,@USER__________ have to go to a doc now to see...,ADE,61,68,gaining,10047896
3,SMM4H2022uCZV2SRsCe4vzjFm,@USER__________ have to go to a doc now to see...,ADE,91,110,gain like 50 pounds,10047896
4,SMM4H20229Aha6m4XERqYdFWf,06.30 day 14 Rivaroxaban diary. Thanks to para...,ADE,118,134,frontal headache,10019211
...,...,...,...,...,...,...,...
82,SMM4H2022cjwbGQbnkpVjjJzR,"Fucking Vyvanse, giving me cotton mouth. UGH.",ADE,27,39,cotton mouth,10013781
83,SMM4H2022lX1A8RVuySkDzAB2,between the fucking redbull and vyvanse i popp...,ADE,84,104,couldn't fall asleep,10044698
84,SMM4H2022qZC2BPG2BW7UC175,"rt @USER_______: vyvanse, commonly known as oc...",ADE,45,48,OCD,10029898
85,SMM4H20226b5WyZPTAJ7qL3dE,rt @USER_______: @USE when are you going to do...,ADE,63,71,addicted,10012336


### MedDRA dictionary

In [30]:

# Load the pickled MedDRA dictionary. The context manager closes the file
# even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code; only use a
# 'VectorizedMedDRA' file from a trusted source.
with open('VectorizedMedDRA', 'rb') as read_file:
    object_file = pickle.load(read_file)

# Term strings; the matching ids are read from object_file["med_ids"] later.
meddra_terms = object_file["med_terms"]

### Corpus dictionary

In [31]:
# Training annotations (header-less TSV) used as a second term dictionary.
corpus_df = pd.read_table("data/smm4h_22/Task1-DataForParticipants/Task1c/train/train_spans_norm.tsv", header=None)

In [32]:
# Parallel lists taken from the training annotations: column 4 supplies the
# terms and column 5 the codes looked up for a matched term.
med_terms, med_codes = list(corpus_df[4]), list(corpus_df[5])

### Mapping to llt.asc

In [33]:
# For every row of norm_df: clean the tweet, extract a span with the QA
# pipeline, fuzzy-match that span against both term dictionaries, and keep
# the code of the better-scoring match.

# Loop-invariant values, hoisted out of the loop.
normalization_question = 'What are the side effects?'
meddra_ids = object_file["med_ids"]

final_med_codes = list()

for tweet in norm_df["tweets"]:

    # Drop user mentions, then replace everything except ASCII letters,
    # digits and spaces. Underscores are covered by the regex, so no separate
    # replacement is needed.
    tweet = tweet.replace("@USER", " ")
    tweet = re.sub(r"[^a-zA-Z0-9 ]", " ", tweet)

    # Collapse runs of spaces into a single space.
    tweet = re.sub(r" {2,}", " ", tweet)

    result = nlp({'question': normalization_question, 'context': tweet})
    span = result["answer"]

    # Each extractOne result is indexed as [0] = matched term, [1] = score.
    meddra_highest = process.extractOne(span, meddra_terms)
    sm_highest = process.extractOne(span, med_terms)

    # Prefer the MedDRA dictionary on ties; list.index returns the first
    # occurrence of the matched term.
    if meddra_highest[1] >= sm_highest[1]:
        med_code = meddra_ids[meddra_terms.index(meddra_highest[0])]
    else:
        med_code = med_codes[med_terms.index(sm_highest[0])]

    final_med_codes.append(med_code)

### Performance Report

In [34]:
# Compare codes as strings so both sides share one type.
gold_codes = list(map(str, norm_df["code"]))
predicted_codes = list(map(str, final_med_codes))

In [35]:
# Per-code precision/recall/F1 for the normalization step.
normalization_report = classification_report(gold_codes, predicted_codes)
print(normalization_report)

              precision    recall  f1-score   support

    10000033       0.00      0.00      0.00         0
    10000140       0.00      0.00      0.00         0
    10000194       0.00      0.00      0.00         0
    10000283       0.00      0.00      0.00         0
    10000685       0.00      0.00      0.00         0
    10001125       0.00      0.00      0.00         1
    10001154       0.00      0.00      0.00         0
    10001803       0.00      0.00      0.00         0
    10002855       0.50      1.00      0.67         1
    10003028       0.00      0.00      0.00         0
    10003288       0.00      0.00      0.00         0
    10003543       0.00      0.00      0.00         0
    10003731       0.00      0.00      0.00         0
    10003988       1.00      1.00      1.00         1
    10004063       0.00      0.00      0.00         1
    10004716       1.00      1.00      1.00         1
    10004969       0.00      0.00      0.00         0
    10005613       0.00    