In [3]:
import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
import pandas as pd
import numpy as np 

In [None]:
train_data=pd.read_csv('task3_training.tsv',sep='\t')
valid_data=pd.read_csv('task3_validation.tsv',sep='\t')
train_data = train_data.loc[:, ~train_data.columns.str.contains('^Unnamed')]
train_data = train_data.dropna(subset=['type'])
train_data = train_data.groupby('tweet_id',sort=False).agg(lambda x: x.tolist())
train_data.reset_index(inplace=True)


In [None]:
train_data

Unnamed: 0,tweet_id,begin,end,type,extraction,drug,tweet,meddra_code,meddra_term
0,342314998904786945,"[42.0, 11.0]","[53.0, 21.0]","[ADR, ADR]","[lost vision, withdrawal]","[cymbalta, cymbalta]","[#cymbalta withdrawal has reached a peak, lost...","[10047522.0, 10048010.0]","[vision loss, withdrawal syndrome]"
1,342322703556038657,[27.0],[35.0],[ADR],[nauseous],[cipro],[i hate cipro! #antibiotic #nauseous #cf #hosp...,[10028823.0],[nauseous]
2,342349802601844737,"[109.0, 101.0, 71.0]","[118.0, 104.0, 84.0]","[ADR, ADR, ADR]","[can't cum, fat, gainin weight]","[seroquel, seroquel, seroquel]",[@luckystubbs reppin zoloft&amp;seroquel since...,"[10021574.0, 10047896.0, 10047896.0]","[inability to orgasm, weight gain, weight gain]"
3,342355917045645313,[21.0],[29.0],[ADR],[headache],[trazodone],[i've had a trazodone headache all day. i hear...,[10019211.0],[headache]
4,342359425400582144,[72.0],[83.0],[ADR],[exacerbates],[seroquel],[i've had ocd symptoms like that for a while b...,[10076326.0],[condition worsened]
...,...,...,...,...,...,...,...,...,...
1075,333697653349179393,[91.0],[116.0],[ADR],[never have another orgasm],[ziprasidone],[@verlieren thank you! it didn't even fucking ...,[10021574.0],[inability to orgasm]
1076,342344675572727811,[65.0],[69.0],[ADR],[coma],[zyprexa],[i just found out that some dude from my frien...,[10041349.0],[somnolence]
1077,348383854089871360,[72.0],[91.0],[ADR],[gain so much weight],[zyprexa],[that zyprexa really makes your vocal chords m...,[10047896.0],[weight gain]
1078,349181683293102080,[99.0],[117.0],[ADR],[increase my weight],[zyprexa],"[i'm so fine today. increasing zyprexa,my cond...",[10047898.0],[weight increase]


In [None]:
def merge(intervals):
  if len(intervals)==0:
      return []
  intervals = sorted(intervals,key = lambda x : x[0])
  current = intervals[0]
  result = []
  for i in range(len(intervals)):
      if current[0]<=intervals[i][0]<=current[1]:
          current[1] = max(current[1],intervals[i][1])
          continue
      else:
          result.append(current)
          current = intervals[i]
  result.append(current)
  return result

In [None]:
import math

TRAIN_DATA = []
for index,row in train_data.iterrows():
  entityArray = []
  for i in range(len(row.begin)):
      entityArray.append([int(row.begin[i]),int(row.end[i])])
  entityArray = merge(entityArray)
  new = []
  for item in entityArray:
    new.append((item[0],item[1],"ADR"))
  TRAIN_DATA.append((row.tweet[0],{"entities": new}))

TRAIN_DATA[0]

('#cymbalta withdrawal has reached a peak, lost vision and almost crashed my car from a brain zap. thanks a zillion #elililly #bigpharma',
 {'entities': [(11, 21, 'ADR'), (42, 53, 'ADR')]})

In [6]:
def main(model=None, n_iter=100):
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

   
        
    return nlp
if __name__ == "__main__":
    nlp=main()

Created blank 'en' model
Losses {'ner': 2551.423882494275}
Losses {'ner': 1976.5295309210333}
Losses {'ner': 1916.167938397748}
Losses {'ner': 1878.6940810338715}
Losses {'ner': 1821.0647962605224}
Losses {'ner': 1815.0428501890372}
Losses {'ner': 1700.5576551601873}
Losses {'ner': 1612.264107033633}
Losses {'ner': 1478.530904763019}
Losses {'ner': 1384.2627729590351}
Losses {'ner': 1468.8899633968103}
Losses {'ner': 1345.0058643549837}
Losses {'ner': 1218.282638692754}
Losses {'ner': 1144.3202748154424}
Losses {'ner': 1247.4327336143324}
Losses {'ner': 1231.2312990445541}
Losses {'ner': 1242.2455998179666}
Losses {'ner': 1126.2987117443918}
Losses {'ner': 1150.8053288090366}
Losses {'ner': 1097.077123929836}
Losses {'ner': 1086.3744002196033}
Losses {'ner': 889.936767143298}
Losses {'ner': 884.2940022669319}
Losses {'ner': 913.1665790804162}
Losses {'ner': 881.1483728857053}
Losses {'ner': 913.8251840897135}
Losses {'ner': 925.3003273118054}
Losses {'ner': 850.7962559248349}
Losses {'

In [4]:
train_data=pd.read_csv('task3_training.tsv',sep='\t')
train_data = train_data.loc[:, ~train_data.columns.str.contains('^Unnamed')]
train_data = train_data.dropna(subset=['type'])
train_data.reset_index(inplace=True)




In [5]:
valid_data=pd.read_csv('task3_validation.tsv',sep='\t')
valid_data = valid_data.dropna(subset=['type'])

valid_data.reset_index(inplace=True)

In [6]:

def extracted_ADR(valid_data):
  valid_data['ADR_value']=None
  for i in range(0,len(valid_data)):
    doc=nlp(valid_data['tweet'][i])

    for ent in doc.ents:
      if ent.label_=='ADR':
        valid_data['ADR_value'][i]=ent.text
      


In [7]:
extracted_ADR(valid_data)
extracted_ADR(train_data)
train_data = train_data.dropna(subset=['meddra_term'])
train_data = train_data.dropna(subset=['ADR_value'])
valid_data = valid_data.dropna(subset=['meddra_term'])
valid_data = valid_data.dropna(subset=['ADR_value'])

NameError: ignored

In [11]:

X_train = pd.DataFrame(train_data['ADR_value'])
Y_train = pd.DataFrame(train_data['meddra_term'])
X_test = pd.DataFrame(valid_data['ADR_value'])
Y_test = pd.DataFrame(valid_data['meddra_term'])

In [12]:
Y_train

Unnamed: 0,meddra_term
0,vision loss
1,withdrawal syndrome
2,nauseous
3,inability to orgasm
4,weight gain
...,...
1459,flat affect
1460,weight gain
1461,weight gain
1462,weight increase


In [13]:
print(Y_test.meddra_term.unique)

<bound method Series.unique of 4                         death
5                         death
6                 tendon injury
7                bizarre dreams
8                tendon rupture
                 ...           
360             hypochondriasis
361         withdrawal syndrome
362                    low mood
363                 weight gain
364    electric shock sensation
Name: meddra_term, Length: 295, dtype: object>


In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

logreg = Pipeline([('vect', CountVectorizer()),
               ('clf', LogisticRegression(n_jobs=1, C=1e5, max_iter=10000)),
              ])
logreg.fit(X_train["ADR_value"], Y_train["meddra_term"])

Pipeline(steps=[('vect', CountVectorizer()),
                ('clf',
                 LogisticRegression(C=100000.0, max_iter=10000, n_jobs=1))])

In [15]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
y_pred = logreg.predict(X_test["ADR_value"])
print(classification_report(Y_test["meddra_term"], y_pred))
print (accuracy_score(Y_test['meddra_term'], y_pred))

                                  precision    recall  f1-score   support

                   Schizophrenia       0.00      0.00      0.00         1
                 abnormal dreams       0.00      0.00      0.00         0
                            ache       0.00      0.00      0.00         2
                       addiction       0.00      0.00      0.00         2
               allergic reaction       0.33      0.33      0.33         3
              anaphylactic shock       0.00      0.00      0.00         0
                         anxiety       0.00      0.00      0.00         1
                 appetite absent       0.00      0.00      0.00         0
                   appetite lost       0.00      0.00      0.00         0
             appetite suppressed       0.00      0.00      0.00         2
              attention impaired       0.00      0.00      0.00         1
         auditory hallucinations       0.00      0.00      0.00         2
                       back ache     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
