In [1]:
#librairies needed for Step1
import pandas as pd
import glob

#librairies needed for step 3
import spacy

import nltk

In [2]:
from google.colab import  drive
drive.mount('/drive')

Mounted at /drive


## Step 1

During this step, I'm going to read the .ann files and then to create a dataframe which will contain two columns :

- The ID that appears first on each line ( T1, T2, R1, R2, etc. )
- The rest of the annotation text which is separated from the ID by a single TAB character

In [3]:
path = '/drive/My Drive/corpus' # path of annotated files
all_files1 = glob.glob(path + "/*.ann")

In [4]:
# create empty list to hold dataframes from files found
dfs = []
# for each file in the path above ending .ann
for file in all_files1:
    #open the file
    df = pd.read_csv(file, sep='^([^\s]*)\s', engine='python', header=None).drop(0, axis=1)
    #add this new (temp during the looping) frame to the end of the list
    dfs.append(df)

At this point we have a list of frames with each list item as one .ann file.  

For example : [annFile1, annFile2, etc.]


In [5]:
df

Unnamed: 0,1,2
0,T1,RAREDISEASE 0 35\tSubacute sclerosing panencep...
1,T2,RAREDISEASE 37 41\tSSPE
2,T3,DISEASE 60 81\tneurological disorder
3,T4,RAREDISEASE 126 138\tencephalitis
4,T5,SIGN 126 138\tencephalitis
5,T6,ANAPHOR 141 152\tThe disease
6,T7,DISEASE 192 199\tmeasles
7,T8,DISEASE 249 256\tmeasles
8,T9,RAREDISEASE 264 268\tSSPE
9,T10,SIGN 362 373\tmemory loss


In [6]:
#handle a list that is empty
if len(dfs) == 0:
    print('No files found.')
    #create a dummy frame
    df = pd.DataFrame()
#or have only one item/frame and get it out
elif len(dfs) == 1:
    df = dfs[0]
#or concatenate more than one frame together
else: 
    df = pd.concat(dfs, ignore_index=True)
    df = df.reset_index(drop=True)


In [7]:
#check what we've got
print(df.head(2))

    1                                               2
0  T1  RAREDISEASE 4 32\tautoimmune hemolytic anemias
1  T2                          SIGN 96 105\themolysis


In [8]:
i=1 # index of file
df['Text File']=''
for r in range (df.shape[0]):
    if df[1][r]=='T1':
        file="File "+str(i)
        df['Text File'][r]=file
        i=i+1
    else:
        df['Text File'][r]=file 

In [9]:
df.head(50)

Unnamed: 0,1,2,Text File
0,T1,RAREDISEASE 4 32\tautoimmune hemolytic anemias,File 1
1,T2,SIGN 96 105\themolysis,File 1
2,T3,RAREDISEASE 170 196\tAcquired hemolytic anemias,File 1
3,T4,RAREDISEASE 224 263\tIdiopathic acquired autoi...,File 1
4,T5,ANAPHOR 555 574\tthis type of anemia,File 1
5,T6,DISEASE 568 574\tanemia,File 1
6,T7,RAREDISEASE 712 739\tautoimmune hemolytic anemia,File 1
7,T8,ANAPHOR 768 770\tit,File 1
8,T9,RAREDISEASE 846 876\tCold antibody hemolytic a...,File 1
9,T10,RAREDISEASE 920 950\twarm antibody hemolytic a...,File 1


In [13]:
df[df['Text File']=='File 3'][1].tolist()

['T1',
 'T2',
 'T3',
 'T4',
 'T5',
 'T6',
 'T7',
 'T8',
 'T9',
 'T10',
 'T11',
 'T12',
 'T13',
 'T14',
 'T15',
 'T16',
 'T17',
 'T18',
 'T19',
 'R1',
 'R2',
 'R3',
 'R4',
 'R5',
 'R6',
 'R7',
 'R8',
 'R9',
 'R10',
 'R11',
 'R12',
 'R13',
 'R14']

## Step 2

During the second step, I'm going to keep the relation labels and get rid of other types of annotation as Attributes (A1, A2..) and Events (E1, E2 ..) if they exist. 

In [10]:
R=[] #list of labels
entity1=[] # the start offsets which identifies each word/text in the document
entity2=[] # the end offsets which identifies each word/text in the document # the specific texts connected to each label
ids=[] #the ids of each annotation
txt=[]

In [14]:
for i in range (df.shape[0]):
    if df[1][i].startswith('R'):
      en1=df[2][i].split(' ')[1].split(':')[1]
      en2=df[2][i].split(' ')[2].split(':')[1]
      file=df['Text File'][i]
      if (en1 in df[df['Text File']==file][1].tolist()) and (en2 in df[df['Text File']==file][1].tolist()):
        ids.append(df[1][i])
        R.append(df[2][i].split(' ')[0])
        entity1.append(df[2][i].split(' ')[1].split(':')[1])
        entity2.append(df[2][i].split(' ')[2].split(':')[1])
        txt.append(df['Text File'][i])

        #start_offset.append(df[2][i].split(' ')[1])
       # end_offset.append(df[2][i].split(' ')[2].split('\t')[0])

In [15]:
data={'ID':ids,'R':R, 'E1':entity1, 'E2':entity2, 'Text File':txt}
annotation=pd.DataFrame(data)

In [16]:
annotation.head(20)

Unnamed: 0,ID,R,E1,E2,Text File
0,R1,Produces,T1,T2,File 1
1,R2,Is_a,T5,T6,File 1
2,R3,Anaphora,T7,T5,File 1
3,R4,Anaphora,T7,T8,File 1
4,R5,Produces,T14,T13,File 1
5,R1,Is_synon,T1,T2,File 2
6,R2,Produces,T1,T3,File 2
7,R3,Produces,T1,T4,File 2
8,R4,Produces,T1,T5,File 2
9,R5,Produces,T1,T6,File 2


## Step 3

In [17]:
annR=annotation.copy()

During the second step, I'm going to keep the entities and get rid of other types of annotation as Relations (R1 , R2 ...), Attributes (A1, A2..) and Events (E1, E2 ..) if they exist. Then, I'm going to split each line to its corresponding Label , offset and keyword. 

In [18]:
df

Unnamed: 0,1,2,Text File
0,T1,RAREDISEASE 4 32\tautoimmune hemolytic anemias,File 1
1,T2,SIGN 96 105\themolysis,File 1
2,T3,RAREDISEASE 170 196\tAcquired hemolytic anemias,File 1
3,T4,RAREDISEASE 224 263\tIdiopathic acquired autoi...,File 1
4,T5,ANAPHOR 555 574\tthis type of anemia,File 1
...,...,...,...
19093,R9,Produces Arg1:T9 Arg2:T12,File 833
19094,R10,Produces Arg1:T9 Arg2:T13,File 833
19095,R11,Produces Arg1:T9 Arg2:T14,File 833
19096,R12,Produces Arg1:T9 Arg2:T150,File 833


In [19]:
labels=[] #list of labels
start_offset=[] # the start offsets which identifies each word/text in the document
end_offset=[] # the end offsets which identifies each word/text in the document
words=[] # the specific texts connected to each label
ids=[] #the ids of each annotation
txt=[]

In [20]:
for i in range (df.shape[0]):
    if df[1][i].startswith('T'):
        ids.append(df[1][i])
        labels.append(df[2][i].split(' ')[0])
        words.append(df[2][i].split('\t')[1])
        start_offset.append(df[2][i].split(' ')[1])
        end_offset.append(df[2][i].split(' ')[2].split('\t')[0])
        txt.append(df['Text File'][i])

In [21]:
data={'ID':ids,'Label':labels, 'word':words, 'Start offset':start_offset, 'End offset':end_offset, 'Text File':txt}
annotation=pd.DataFrame(data)

In [22]:
annotation.head(10)

Unnamed: 0,ID,Label,word,Start offset,End offset,Text File
0,T1,RAREDISEASE,autoimmune hemolytic anemias,4,32,File 1
1,T2,SIGN,hemolysis,96,105,File 1
2,T3,RAREDISEASE,Acquired hemolytic anemias,170,196,File 1
3,T4,RAREDISEASE,Idiopathic acquired autoimmune diseases,224,263,File 1
4,T5,ANAPHOR,this type of anemia,555,574,File 1
5,T6,DISEASE,anemia,568,574,File 1
6,T7,RAREDISEASE,autoimmune hemolytic anemia,712,739,File 1
7,T8,ANAPHOR,it,768,770,File 1
8,T9,RAREDISEASE,Cold antibody hemolytic anemia,846,876,File 1
9,T10,RAREDISEASE,warm antibody hemolytic anemia,920,950,File 1


In [23]:
annotation.head()

Unnamed: 0,ID,Label,word,Start offset,End offset,Text File
0,T1,RAREDISEASE,autoimmune hemolytic anemias,4,32,File 1
1,T2,SIGN,hemolysis,96,105,File 1
2,T3,RAREDISEASE,Acquired hemolytic anemias,170,196,File 1
3,T4,RAREDISEASE,Idiopathic acquired autoimmune diseases,224,263,File 1
4,T5,ANAPHOR,this type of anemia,555,574,File 1


In [24]:
# remove extra spaces and ending space if any
spaces = ['\u200b', '\u200e', '\u202a', '\u202c', '\ufeff', '\uf0d8', '\u2061', '\x10', '\x7f', '\x9d', '\xad', '\xa0']
def remove_space(text):
    for space in spaces:
        text = text.replace(space, ' ')
    text = text.strip()
    text = re.sub('\s+', ' ', text)
    return text

In [25]:
all_files=[]
for i in all_files1:
  f=i.split('.')
  y=f[0].split('/')
  p='/drive/My Drive/corpus/'+y[4]+'.txt'
  all_files.append(p)

In [26]:
annotation['Sentence']=''
j=0
i=0
for file in all_files:
    j=j+1
    with open(file, 'r', encoding = 'utf-8') as textFile:
        text=textFile.read()
    text= text.replace("\n"," ")
    text= text.replace("("," ")
    text= text.replace(")"," ")
    text= text.replace('“',' ')
    text= text.replace('”',' ')
    text= text.replace('"',' ')
    text = text.replace(". "," .<stop>") 
    text = text.replace("? "," ?<stop>")
    text = text.replace("!"," !<stop>")
    sentences = text.split("<stop>")
    while annotation['Text File'][i]=='File '+str(j):
        b=0
        m=sentences[b]
        while len(m)<int(annotation['Start offset'][i]):
                b=b+1
                m=m+''+sentences[b]
        annotation['Sentence'][i]=sentences[b]
        i=i+1
        if i==annotation.shape[0]:
            break

In [27]:
annotation.head()

Unnamed: 0,ID,Label,word,Start offset,End offset,Text File,Sentence
0,T1,RAREDISEASE,autoimmune hemolytic anemias,4,32,File 1,The autoimmune hemolytic anemias are rare diso...
1,T2,SIGN,hemolysis,96,105,File 1,The autoimmune hemolytic anemias are rare diso...
2,T3,RAREDISEASE,Acquired hemolytic anemias,170,196,File 1,The autoimmune hemolytic anemias are rare diso...
3,T4,RAREDISEASE,Idiopathic acquired autoimmune diseases,224,263,File 1,Acquired hemolytic anemias are non-genetic in ...
4,T5,ANAPHOR,this type of anemia,555,574,File 1,The severity of this type of anemia is determi...


In [28]:
import re
annotation['SentencePre']=''
for i in range (annotation.shape[0]):
    m=annotation['Sentence'][i]
    m= m.replace(","," , ")
    m= m.replace(": "," : ")
    m = m.replace(";"," ;")
    m=remove_space(m)
    annotation['SentencePre'][i]=m
annotation.head()

Unnamed: 0,ID,Label,word,Start offset,End offset,Text File,Sentence,SentencePre
0,T1,RAREDISEASE,autoimmune hemolytic anemias,4,32,File 1,The autoimmune hemolytic anemias are rare diso...,The autoimmune hemolytic anemias are rare diso...
1,T2,SIGN,hemolysis,96,105,File 1,The autoimmune hemolytic anemias are rare diso...,The autoimmune hemolytic anemias are rare diso...
2,T3,RAREDISEASE,Acquired hemolytic anemias,170,196,File 1,The autoimmune hemolytic anemias are rare diso...,The autoimmune hemolytic anemias are rare diso...
3,T4,RAREDISEASE,Idiopathic acquired autoimmune diseases,224,263,File 1,Acquired hemolytic anemias are non-genetic in ...,Acquired hemolytic anemias are non-genetic in ...
4,T5,ANAPHOR,this type of anemia,555,574,File 1,The severity of this type of anemia is determi...,The severity of this type of anemia is determi...


In [29]:
annR.head()

Unnamed: 0,ID,R,E1,E2,Text File
0,R1,Produces,T1,T2,File 1
1,R2,Is_a,T5,T6,File 1
2,R3,Anaphora,T7,T5,File 1
3,R4,Anaphora,T7,T8,File 1
4,R5,Produces,T14,T13,File 1


In [30]:
annR['EN1']=''
annR['EN2']=''
annR['offset1']=''
annR['offset2']=''
for i in range (annR.shape[0]):
  id1=annotation[(annotation['ID']==annR['E1'][i]) & (annotation['Text File']==annR['Text File'][i])].index.values[0]
  id2=annotation[(annotation['ID']==annR['E2'][i]) & (annotation['Text File']==annR['Text File'][i])].index.values[0]
  en1=annotation[(annotation['ID']==annR['E1'][i]) & (annotation['Text File']==annR['Text File'][i])]['word'][id1]
  en2=annotation[(annotation['ID']==annR['E2'][i]) & (annotation['Text File']==annR['Text File'][i])]['word'][id2]
  off1=annotation[(annotation['ID']==annR['E1'][i]) & (annotation['Text File']==annR['Text File'][i])]['Start offset'][id1]
  off2=annotation[(annotation['ID']==annR['E2'][i]) & (annotation['Text File']==annR['Text File'][i])]['Start offset'][id2]
  annR['EN1'][i]=en1
  annR['EN2'][i]=en2
  annR['offset1'][i]=off1
  annR['offset2'][i]=off2

In [31]:
annR.head(20)

Unnamed: 0,ID,R,E1,E2,Text File,EN1,EN2,offset1,offset2
0,R1,Produces,T1,T2,File 1,autoimmune hemolytic anemias,hemolysis,4,96
1,R2,Is_a,T5,T6,File 1,this type of anemia,anemia,555,568
2,R3,Anaphora,T7,T5,File 1,autoimmune hemolytic anemia,this type of anemia,712,555
3,R4,Anaphora,T7,T8,File 1,autoimmune hemolytic anemia,it,712,768
4,R5,Produces,T14,T13,File 1,autoimmune hemolytic anemia,certain antibodies is higher than normal,1306,1236
5,R1,Is_synon,T1,T2,File 2,acoustic neuroma,vestibular schwannoma,3,37
6,R2,Produces,T1,T3,File 2,acoustic neuroma,on the eighth cranial nerve benign (non-cancer...,3,114
7,R3,Produces,T1,T4,File 2,acoustic neuroma,hearing loss in one ear,3,322
8,R4,Produces,T1,T5,File 2,acoustic neuroma,tinnitus,3,491
9,R5,Produces,T1,T6,File 2,acoustic neuroma,dizziness,3,505


In [32]:
annR['contexte']=''
j=0
i=0
for file in all_files:
    j=j+1
    with open(file, 'r', encoding = 'utf-8') as textFile:
        text=textFile.read()
    text= text.replace("\n"," ")
    text= text.replace("("," ")
    text= text.replace(")"," ")
    text= text.replace('“',' ')
    text= text.replace('”',' ')
    text= text.replace('"',' ')
    text = text.replace(". "," .<stop>") 
    text = text.replace("? "," ?<stop>")
    text = text.replace("!"," !<stop>")
    sentences = text.split("<stop>")
    while annR['Text File'][i]=='File '+str(j):
        b=0
        m=sentences[b]
        mi=min(int(annR['offset1'][i]),int(annR['offset2'][i]))
        ma=max(int(annR['offset1'][i]),int(annR['offset2'][i]))
        while len(m)<mi:
                b=b+1
                m=m+''+sentences[b]
        s=sentences[b]
        while len(m)<ma:
                b=b+1
                m=m+''+sentences[b]
                s=s+' '+sentences[b]
        annR['contexte'][i]=s
        i=i+1
        if i==annR.shape[0]:
            break

In [33]:
# remove extra spaces and ending space if any
spaces = ['\u200b', '\u200e', '\u202a', '\u202c', '\ufeff', '\uf0d8', '\u2061', '\x10', '\x7f', '\x9d', '\xad', '\xa0']
def remove_space(text):
    for space in spaces:
        text = text.replace(space, ' ')
    text = text.strip()
    text = re.sub('\s+', ' ', text)
    return text

In [34]:
annR.head(20)

Unnamed: 0,ID,R,E1,E2,Text File,EN1,EN2,offset1,offset2,contexte
0,R1,Produces,T1,T2,File 1,autoimmune hemolytic anemias,hemolysis,4,96,The autoimmune hemolytic anemias are rare diso...
1,R2,Is_a,T5,T6,File 1,this type of anemia,anemia,555,568,The severity of this type of anemia is determi...
2,R3,Anaphora,T7,T5,File 1,autoimmune hemolytic anemia,this type of anemia,712,555,The severity of this type of anemia is determi...
3,R4,Anaphora,T7,T8,File 1,autoimmune hemolytic anemia,it,712,768,When acquired autoimmune hemolytic anemia occu...
4,R5,Produces,T14,T13,File 1,autoimmune hemolytic anemia,certain antibodies is higher than normal,1306,1236,Another blood test Coombs test is used to de...
5,R1,Is_synon,T1,T2,File 2,acoustic neuroma,vestibular schwannoma,3,37,"An acoustic neuroma, also known as a vestibula..."
6,R2,Produces,T1,T3,File 2,acoustic neuroma,on the eighth cranial nerve benign (non-cancer...,3,114,"An acoustic neuroma, also known as a vestibula..."
7,R3,Produces,T1,T4,File 2,acoustic neuroma,hearing loss in one ear,3,322,"An acoustic neuroma, also known as a vestibula..."
8,R4,Produces,T1,T5,File 2,acoustic neuroma,tinnitus,3,491,"An acoustic neuroma, also known as a vestibula..."
9,R5,Produces,T1,T6,File 2,acoustic neuroma,dizziness,3,505,"An acoustic neuroma, also known as a vestibula..."


In [35]:
import re
annR['context']=''
for i in range (annR.shape[0]):
    m=annR['contexte'][i]
    m= m.replace(","," , ")
    m= m.replace(": "," : ")
    m = m.replace(";"," ;")
    m=remove_space(m)
    annR['context'][i]=m

In [36]:
annR['context'].head()

0    The autoimmune hemolytic anemias are rare diso...
1    The severity of this type of anemia is determi...
2    The severity of this type of anemia is determi...
3    When acquired autoimmune hemolytic anemia occu...
4    Another blood test Coombs test is used to dete...
Name: context, dtype: object

In [37]:
text=[]
relation=[]
for i in range(annR.shape[0]):
  result1 = annR['context'][i].find(annR['EN1'][i])
  result2 = annR['context'][i].find(annR['EN2'][i])
  if result1<result2 :
    hash1 = annR['context'][i][:result1] + '<e1>' + annR['context'][i][result1:result1+len(annR['EN1'][i])] + '</e1>' + annR['context'][i][result1+len(annR['EN1'][i]): result2] + '<e2>' + annR['context'][i][result2:result2+len(annR['EN2'][i])]+'</e2>'+ annR['context'][i][result2+len(annR['EN2'][i]):]
    r=annR['R'][i]+'(e1,e2)'
  else:
    hash1 = annR['context'][i][:result2] + '<e1>' + annR['context'][i][result2:result2+len(annR['EN2'][i])] + '</e1>' + annR['context'][i][result2+len(annR['EN2'][i]): result1]  + '<e2>' + annR['context'][i][result1:result1+len(annR['EN1'][i])]+'</e2>'+ annR['context'][i][result1+len(annR['EN1'][i]):]
    r=annR['R'][i]+'(e2,e1)'
  text.append(hash1)
  relation.append(r)

In [38]:
re=pd.DataFrame({
    'text':text,
    'relation':relation
})

In [39]:
re.head(20)

Unnamed: 0,text,relation
0,The <e1>autoimmune hemolytic anemias</e1> are ...,"Produces(e1,e2)"
1,The severity of <e1>this type of anemia</e1><e...,"Is_a(e1,e2)"
2,The severity of <e1>this type of anemia</e1> i...,"Anaphora(e2,e1)"
3,When acquired <e1>autoimmune hemolytic anemia<...,"Anaphora(e1,e2)"
4,Another blood test Coombs test is used to dete...,"Produces(e2,e1)"
5,"An <e1>acoustic neuroma</e1> , also known as a...","Is_synon(e1,e2)"
6,"An acoustic neuroma , also known as a vestibul...","Produces(e2,e1)"
7,"An <e1>acoustic neuroma</e1> , also known as a...","Produces(e1,e2)"
8,"An <e1>acoustic neuroma</e1> , also known as a...","Produces(e1,e2)"
9,"An <e1>acoustic neuroma</e1> , also known as a...","Produces(e1,e2)"


In [40]:
df=re.copy()
df['relation'].unique()

array(['Produces(e1,e2)', 'Is_a(e1,e2)', 'Anaphora(e2,e1)',
       'Anaphora(e1,e2)', 'Produces(e2,e1)', 'Is_synon(e1,e2)',
       'Increases_risk_of(e1,e2)', 'Is_synon(e2,e1)', 'Is_acron(e2,e1)',
       'Is_a(e2,e1)', 'Is_acron(e1,e2)', 'Increases_risk_of(e2,e1)'],
      dtype=object)

In [41]:
df['relation']=df['relation'].str.lower()

In [42]:
df.head()

Unnamed: 0,text,relation
0,The <e1>autoimmune hemolytic anemias</e1> are ...,"produces(e1,e2)"
1,The severity of <e1>this type of anemia</e1><e...,"is_a(e1,e2)"
2,The severity of <e1>this type of anemia</e1> i...,"anaphora(e2,e1)"
3,When acquired <e1>autoimmune hemolytic anemia<...,"anaphora(e1,e2)"
4,Another blood test Coombs test is used to dete...,"produces(e2,e1)"


In [45]:
df.to_csv('re_data.csv')

# Create train and test files before text augmentation

### Split Train / test

In [46]:
from sklearn.model_selection import train_test_split
import numpy as np
X=df['text'].tolist()
y=df['relation'].tolist()
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                stratify=y, 
                                                test_size=0.25)

In [43]:
df['relation'].value_counts()

produces(e1,e2)             3039
produces(e2,e1)             1177
anaphora(e1,e2)              722
is_a(e1,e2)                  584
anaphora(e2,e1)              415
is_acron(e2,e1)              202
is_a(e2,e1)                  154
increases_risk_of(e1,e2)      88
increases_risk_of(e2,e1)      87
is_synon(e2,e1)               71
is_synon(e1,e2)               21
is_acron(e1,e2)               13
Name: relation, dtype: int64

### Turn train and test set into train and test text files 

In [47]:
import re
# remove extra spaces and ending space if any
spaces = ['\u200b', '\u200e', '\u202a', '\u202c', '\ufeff', '\uf0d8', '\u2061', '\x10', '\x7f', '\x9d', '\xad', '\xa0']
def remove_space(text):
    for space in spaces:
        text = text.replace(space, ' ')
    text = text.strip()
    text = re.sub('\s+', ' ', text)
    return text

In [48]:
pre_train = open("pre_train.txt", "w")
pre_test=open('pre_test.txt',"w")
for i in range (len(X_train)):
  num_sent=i+1
  row=str(num_sent)+"   "+'"'+ (remove_space(X_train[i]).lower()).capitalize() +'"'
  pre_train.write(row)
  pre_train.write("\n")
  pre_train.write(y_train[i])
  pre_train.write("\n")
  pre_train.write("\n")
pre_train.close()
for j in range (len(X_test)):
  number=num_sent+j+1
  row=str(number)+"   "+'"'+(remove_space(X_test[j]).lower()).capitalize() +'"'
  pre_test.write(row)
  pre_test.write("\n")
  pre_test.write(y_test[j])
  pre_test.write("\n")
  pre_test.write("\n")
pre_test.close()

create train dataframe on which we are going to apply text augmentation

In [57]:
df=pd.DataFrame({
    'text': X_train,
    'relation': y_train
})

In [58]:
df.head()

Unnamed: 0,text,relation
0,"Individuals of any age may be affected , but t...","anaphora(e1,e2)"
1,"<e1>Antisynthetase syndrome</e1> is a rare , c...","produces(e1,e2)"
2,"Approximately , 5-10% of individuals who suffe...","increases_risk_of(e1,e2)"
3,Most ENSs occur randomly for no apparent reaso...,"produces(e2,e1)"
4,Subserosal involvement predominantly causes as...,"produces(e1,e2)"


In [59]:
#let's apply text augmentation on the train set (df), to do so I will start a new notebook .
df.to_csv("df.csv")