In [1]:
import re
import pandas as pd
import numpy as np
import PyPDF2

### NaturDoc - TL BL WT 22-23

# Data enrichment:

## Imports:

### Duke:

In [2]:
# Load the pivoted Duke ethnobotany table. Its TAXON column is the key that the
# WHO monograph data is matched against further down in this notebook.
mod_duke_df = pd.read_csv("../data/DUKE/PIVOT_ETHNOBOT.csv")

### KEW:

##### _Note: reduced and pivoted WCVP datasets:_

As mentioned in the data_transformation notebook, some of the CSV files were too large to be uploaded to GitHub. We therefore created copies of wcvp_distribution.csv and wcvp_names.csv that contain only the columns useful to us and pivoted the tables, which significantly reduced the number of rows and columns in each file.

In [3]:
# Load the reduced (and, for the distribution table, pivoted) WCVP files described above.
# NOTE(review): neither frame is referenced again in the part of the notebook visible here.
dis_df = pd.read_csv("../data/WCVP/STR_REDUCED_PIVOT_wcvp_distribution.csv", sep=",")
names_df = pd.read_csv("../data/WCVP/REDUCED_wcvp_names.csv", sep=",")

### WHO Monographs:

In [4]:
# Clean-up patterns applied to the text of every extracted page.
# Raw strings are used wherever the regex contains a backslash so that "\(", "\s"
# and "\." are not parsed as (invalid) Python string escapes.
pattern_fi = re.compile("ﬁ")
pattern_fl = re.compile("ﬂ")
pattern_minus = re.compile("–")
pattern_brackets = re.compile(r"\([\s0-9,-]+\)")
pattern_spacedot = re.compile(r" \.")
pattern_spacecomma = re.compile(" ,")
pattern_spacequotesingle = re.compile(" ’")
pattern_spacequotedouble = re.compile(" ”")
pattern_doublespace = re.compile(r"\s\s+")
pattern_dashspace = re.compile(r"-\s+")

# (pattern, replacement) pairs in the exact order in which they are applied:
# ligatures and en dashes are normalised first, so that the bracket pattern
# (digits, commas, hyphens, whitespace) also catches ranges written with "–".
who_01_cleanup = [
    (pattern_fi, "fi"),
    (pattern_fl, "fl"),
    (pattern_minus, "-"),
    (pattern_brackets, ""),
    (pattern_spacedot, "."),
    (pattern_spacecomma, ","),
    (pattern_spacequotesingle, "’"),
    (pattern_spacequotedouble, "”"),
    (pattern_doublespace, " "),
    (pattern_dashspace, ""),
]

# One entry per PDF page; each entry is the list of cleaned text lines of that page.
who_01_pages = list()

# The context manager closes the file even if text extraction raises.
with open('../data/WHO/monograph_01.pdf', 'rb') as who_01_pdf:
    who_01_reader = PyPDF2.PdfReader(who_01_pdf)

    for page_obj in who_01_reader.pages:
        text = page_obj.extract_text()
        for pattern, replacement in who_01_cleanup:
            text = pattern.sub(replacement, text)
        who_01_pages.append(text.split("\n"))

In [5]:
# Re-assemble the extracted pages in a corrected order. Each slice is taken from
# who_01_pages by list index; the variable name gives the 1-based position the
# slice occupies in the corrected sequence (p_51_58 = corrected pages 51 to 58).
# The first seven extracted pages (indices 0-6) are not carried over.
# NOTE(review): the offsets are hard-coded for this particular PDF file; confirm
# them against the document if the file is ever replaced.
p_01_50 = who_01_pages[7 : 57]
p_51_58 = who_01_pages[75 : 83]
p_59_60 = who_01_pages[57 : 59]
p_61_66 = who_01_pages[69 : 75]
p_67_76 = who_01_pages[59 : 69]
p_77_end = who_01_pages[83 : ]

who_01_pages_corrected = p_01_50 + p_51_58 + p_59_60 + p_61_66 + p_67_76 + p_77_end

## Enriching the Duke dataset:

In [6]:
# Distinct taxon names of the Duke table as a plain list; used below to match
# monograph definitions and to strip taxon names from the vernacular-name text.
duke_names = mod_duke_df["TAXON"].unique().tolist()

In [7]:
# Matches a full stop that is directly followed by letters/digits, plus the rest of the text.
pattern_formula = re.compile(r"\.[A-Za-z0-9]+.*")
# Not used by any extraction below; kept so the name stays defined for other cells.
reg_pattern = re.compile(r"\.")
# A number followed by a full stop; "Adverse reactions" lines containing such a
# token are not treated as section headings (presumably numbered reference entries).
ref_pattern = re.compile(r"[0-9]+\.")


def drop_incomplete_sentence(text):
    """Cut ``text`` back to its last full stop.

    Text that already ends with "." is returned unchanged. Text without any
    full stop collapses to ".". Empty text is returned as is (instead of
    raising on ``text[-1]``).
    """
    if not text or text[-1] == ".":
        return text
    return ".".join(text.split(".")[:-1]) + "."


def collect_sections(pages, is_heading, is_end, skip=1, keep_last_line=True,
                     strip_formula=True, trim_sentence=True):
    """Collect the text that follows every matching heading line.

    For every line for which ``is_heading(line)`` is true, the lines starting
    ``skip`` lines below the heading are gathered until a line satisfies
    ``is_end`` or the last line of the page is reached. A section never
    continues onto the next page.

    Parameters
    ----------
    pages : list of list of str
        Pages, each given as its list of text lines.
    is_heading, is_end : callable(str) -> bool
        Tests for the heading line and for the line that ends the section.
    skip : int
        Offset of the first collected line relative to the heading
        (1 = the line right after the heading).
    keep_last_line : bool
        If the scan stops on the last line of the page, include that line in
        the section (this also happens when that line is the end marker).
    strip_formula : bool
        Replace everything from the first ``pattern_formula`` match onwards
        with a single ".".
    trim_sentence : bool
        Apply ``drop_incomplete_sentence`` to the result.

    Returns
    -------
    list of str
        One entry per heading found, in page/line order.
    """
    sections = list()
    for page in pages:
        last = len(page) - 1
        for i, line in enumerate(page):
            if not is_heading(line):
                continue
            stop = i + skip
            # The bounds check comes first so that a heading on the last line
            # of a page cannot index past the end of the page.
            while stop < last and not is_end(page[stop]):
                stop += 1
            if keep_last_line and stop == last:
                stop += 1
            text = " ".join(page[i + skip : stop])
            if strip_formula:
                text = re.sub(pattern_formula, ".", text)
            if trim_sentence:
                # in case the description gets cut off, drop the incomplete sentence
                text = drop_incomplete_sentence(text)
            sections.append(text)
    return sections


# The descriptive sections are searched from the fourth corrected page onwards,
# the safety-related sections from the fifth.
who_01_body = who_01_pages_corrected[3:]
who_01_safety = who_01_pages_corrected[4:]

# Definitions with in-keyword:

who_01_def = list()

for definition_text in collect_sections(
        who_01_body,
        lambda line: "Definition" in line,
        lambda line: line == "Synonyms",
        keep_last_line=False, strip_formula=False, trim_sentence=False):
    # adding Matricaria chamomilla to Chamomilla recutita's definition to create a match with Duke later
    if "Chamomilla recutita" in definition_text:
        definition_text = "Matricaria chamomilla" + definition_text
    who_01_def.append(definition_text)

# Vernacular names with in-keyword:

who_01_vernacular = list()

for vernacular_text in collect_sections(
        who_01_body,
        lambda line: "Selected vernacular names" in line,
        lambda line: line == "Description",
        strip_formula=False, trim_sentence=False):
    vernacular_text = vernacular_text.replace(".", ",")

    # Sometimes taxonomic names appear in the WHO document names as a header of sorts:
    for taxon in duke_names:
        vernacular_text = vernacular_text.replace(taxon, ",")

    who_01_vernacular.append(vernacular_text.replace(", ", ","))

# Clinical medicine with in-keyword:

who_01_clinical = collect_sections(
    who_01_body,
    lambda line: "Uses supported by clinical data" in line,
    lambda line: "Uses described in pharmacopoeias and in traditional" in line,
)

# Traditional medicine with in-keyword (skip=2: the line after the heading is not collected):

who_01_traditional = collect_sections(
    who_01_body,
    lambda line: "Uses described in pharmacopoeias and in traditional" in line,
    lambda line: "Uses described in folk medicine" in line,
    skip=2,
)

# Folk with in-keyword (skip=2: the line after the heading is not collected):

who_01_folk = collect_sections(
    who_01_body,
    lambda line: "Uses described in folk medicine" in line,
    lambda line: line == "Pharmacology",
    skip=2,
)

# Contraindications; lines containing "see" are not treated as headings:

who_01_con = collect_sections(
    who_01_safety,
    lambda line: "Contraindications" in line and "see" not in line.lower(),
    lambda line: line == "Warnings",
)

# Warnings; lines containing "see" are not treated as headings:

who_01_warn = collect_sections(
    who_01_safety,
    lambda line: "Warnings" in line and "see" not in line.lower(),
    lambda line: line == "Precautions",
)

# Adverse reactions; lines containing "see" or a ref_pattern match are not headings:

who_01_adv = collect_sections(
    who_01_safety,
    lambda line: ("Adverse reactions" in line
                  and "see" not in line.lower()
                  and not re.search(ref_pattern, line)),
    lambda line: line == "Posology",
)

# Posology; pattern_formula is deliberately not applied to this section:

who_01_pos = collect_sections(
    who_01_safety,
    lambda line: "Posology" in line and "and" not in line.lower(),
    lambda line: line == "References",
    strip_formula=False,
)

Replace WHO null values with empty string:

In [8]:
# Placeholder texts that stand for "no content" in an extracted section.
null_values = ["No information available.", "None.", "."]

def make_nan(target_list: list):
    """Blank out placeholder entries of ``target_list`` in place.

    Every element that equals one of ``null_values`` is overwritten with an
    empty string; all other elements are left as they are. The list object
    itself is modified and nothing is returned.
    """
    placeholder_positions = [
        position for position, entry in enumerate(target_list) if entry in null_values
    ]
    for position in placeholder_positions:
        target_list[position] = ""

In [9]:
# Blank out the placeholder entries in each of the monograph-1 section lists.
for extracted_01 in (
    who_01_clinical,
    who_01_traditional,
    who_01_folk,
    who_01_con,
    who_01_warn,
    who_01_adv,
    who_01_pos,
):
    make_nan(extracted_01)

In [10]:
# One dictionary per (monograph definition, Duke taxon) pair in which both the
# first and the second word of the taxon occur in the definition text.
who_01_remedies = list()

for i, definition in enumerate(who_01_def):

    for taxon in duke_names:

        name_parts = taxon.split(" ")

        # some taxonomic names have an "x", indicating hybrids;
        # the marker is ignored when looking for the name in the definition
        if "x" in name_parts:
            name_parts.remove("x")

        # a taxon without a second word cannot be matched on genus + species
        if len(name_parts) < 2:
            continue

        if name_parts[0] in definition and name_parts[1] in definition:
            who_01_remedies.append({
                # Keep the taxon exactly as it is spelled in the Duke table:
                # store_dict_values looks rows up by TAXON == NAME, so a name
                # rebuilt without the "x" would never match a hybrid's row.
                "NAME": taxon,
                "DEF": definition,
                "VERNAC": who_01_vernacular[i],
                "CLINICAL": who_01_clinical[i],
                "TRADITIONAL": who_01_traditional[i],
                "FOLK": who_01_folk[i],
                "CONTRAINDICATION": who_01_con[i],
                "WARNING": who_01_warn[i],
                "ADVERSE": who_01_adv[i],
                "POSOLOGY": who_01_pos[i],
            })

### Preparing the Duke dataframe:

When reading the modified ETHNOBOT dataset, a new serial index is created, so we need to drop the "Unnamed: 0" column.

In [11]:
# Inspect the column names; the output lists the extra "Unnamed: 0" column.
mod_duke_df.columns

Index(['Unnamed: 0', 'TAXON', 'ACTIVITY', 'CNAME', 'FAMILY', 'GENUS',
       'SPECIES'],
      dtype='object')

In [12]:
# Columns to carry over; this leaves out the "Unnamed: 0" column.
target_cols = ["TAXON", "ACTIVITY", "CNAME", "FAMILY", "GENUS", "SPECIES"]
# .copy() makes rich_duke_df an independent frame, so that the columns added and
# the cells written further down do not raise SettingWithCopyWarning.
rich_duke_df = mod_duke_df[target_cols].copy()
rich_duke_df.head()

Unnamed: 0,TAXON,ACTIVITY,CNAME,FAMILY,GENUS,SPECIES
0,Abelmoschus esculentus,"Abortifacient,Antidote,Boil,Burn,Catarrh,Coffe...","nan,Huang Shu K'Uei,Bamia,Molondron,Quiabeiro,...",Malvaceae,Abelmoschus,esculentus
1,Abelmoschus manihot,"Antitussive,Boil,Cancer,Cancer(Stomach),Catarr...","Tororo-Aoi,nan,Kastuli",Malvaceae,Abelmoschus,manihot
2,Abelmoschus moschatus,"Aphrodisiac,Asthma,Bite(Snake),Boil,Cancer,Car...","Musk Mallow,nan,Moskus,Muskus,Mushk Dana,Amber...",Malvaceae,Abelmoschus,moschatus
3,Abies alba,"Bronchitis,Bruise,Calculus,Catarrh,Cough,Diure...","Abeto,Edeltanne,Beyaz Koknar,Silver Fir,Abeto ...",Pinaceae,Abies,alba
4,Abies balsamea,"Burn,Cancer,Cold,Cough,Heart,Masticatory,Sore,...","nan,Balsam,Canada,Balsam Fir,Fir",Pinaceae,Abies,balsamea


### Adding data from WHO monographs to the reshaped dataset:

Preparing new columns in the enriched dataframe for the information stored in the remedy dictionary. Since we previously replaced all np.NaN values with a string "nan", the default value will be the same here:

In [13]:
# Columns that receive the WHO monograph data, in the order they are appended.
who_columns = [
    "VERNAC",
    "CLINICAL",
    "TRADITIONAL",
    "FOLK",
    "CONTRAINDICATION",
    "WARNING",
    "ADVERSE",
    "POSOLOGY",
]

# assign() builds a new frame instead of writing into the existing one, which was
# derived from a column selection of mod_duke_df. Every new column starts out as
# the string "nan", the placeholder store_dict_values checks for.
rich_duke_df = rich_duke_df.assign(**{column: "nan" for column in who_columns})

In [14]:
# Preview the frame with the newly added WHO columns.
rich_duke_df.head()

Unnamed: 0,TAXON,ACTIVITY,CNAME,FAMILY,GENUS,SPECIES,VERNAC,CLINICAL,TRADITIONAL,FOLK,CONTRAINDICATION,WARNING,ADVERSE,POSOLOGY
0,Abelmoschus esculentus,"Abortifacient,Antidote,Boil,Burn,Catarrh,Coffe...","nan,Huang Shu K'Uei,Bamia,Molondron,Quiabeiro,...",Malvaceae,Abelmoschus,esculentus,,,,,,,,
1,Abelmoschus manihot,"Antitussive,Boil,Cancer,Cancer(Stomach),Catarr...","Tororo-Aoi,nan,Kastuli",Malvaceae,Abelmoschus,manihot,,,,,,,,
2,Abelmoschus moschatus,"Aphrodisiac,Asthma,Bite(Snake),Boil,Cancer,Car...","Musk Mallow,nan,Moskus,Muskus,Mushk Dana,Amber...",Malvaceae,Abelmoschus,moschatus,,,,,,,,
3,Abies alba,"Bronchitis,Bruise,Calculus,Catarrh,Cough,Diure...","Abeto,Edeltanne,Beyaz Koknar,Silver Fir,Abeto ...",Pinaceae,Abies,alba,,,,,,,,
4,Abies balsamea,"Burn,Cancer,Cold,Cough,Heart,Masticatory,Sore,...","nan,Balsam,Canada,Balsam Fir,Fir",Pinaceae,Abies,balsamea,,,,,,,,


Concatenating the dictionary values to the correct dataframe column:

In [15]:
def store_dict_values(remedy: dict, key: str):
    """Write ``remedy[key]`` into column ``key`` of the matching taxon row(s).

    The rows of the global ``rich_duke_df`` whose TAXON equals
    ``remedy["NAME"]`` are selected. If any selected cell still holds the
    "nan" placeholder, the selected cells are overwritten with the new text;
    otherwise the new text is appended directly to what is already stored
    (no separator is inserted). ``rich_duke_df`` is modified in place and
    nothing is returned.
    """
    taxon_rows = rich_duke_df["TAXON"] == remedy["NAME"]
    stored = rich_duke_df.loc[taxon_rows, key]
    still_placeholder = "nan" in stored.values
    rich_duke_df.loc[taxon_rows, key] = (
        remedy[key] if still_placeholder else stored + remedy[key]
    )

In [16]:
# NAME and DEF only serve the matching and have no column of their own.
skip_keys = ["NAME", "DEF"]

# Copy every remaining field of every monograph-1 remedy into the dataframe.
for herb in who_01_remedies:
    for key in herb:
        if key not in skip_keys:
            store_dict_values(herb, key)

To better gauge the success of the previous code, we filter the dataframe to target one of the herbs that we know was in the monograph:

In [17]:
# Boolean mask for the row(s) whose TAXON is exactly "Allium sativum"; display them.
filt_alliumsativum = (rich_duke_df["TAXON"] == "Allium sativum")
rich_duke_df.loc[filt_alliumsativum]

Unnamed: 0,TAXON,ACTIVITY,CNAME,FAMILY,GENUS,SPECIES,VERNAC,CLINICAL,TRADITIONAL,FOLK,CONTRAINDICATION,WARNING,ADVERSE,POSOLOGY
473,Allium sativum,"Ache(Ear),Ache(Stomach),Alopecia,Antidote,Anti...","nan,Thum,Suan,Hsiao Suan,Sarimsak,Ail,Sir,Ajo,...",Liliaceae,Allium,sativum,"It is most commonly known as “garlic”,Ail,ail ...",As an adjuvant to dietetic management in the t...,The treatment of respiratory and urinary tract...,"As an aphrodisiac, antipyretic, diuretic, emme...",Bulbus Allii Sativi is contraindicated in pati...,Consumption of large amounts of garlic may inc...,Bulbus Allii Sativi has been reported to evoke...,"Unless otherwise prescribed, average daily dos..."


Verifying that chamomile has now also been added to the dataset:

In [18]:
# Chamomile is stored under the Duke taxon "Matricaria chamomilla", the name that
# was prepended to the monograph definition to make the match possible.
filt_chamomilla = (rich_duke_df["TAXON"] == "Matricaria chamomilla")
rich_duke_df.loc[filt_chamomilla]

Unnamed: 0,TAXON,ACTIVITY,CNAME,FAMILY,GENUS,SPECIES,VERNAC,CLINICAL,TRADITIONAL,FOLK,CONTRAINDICATION,WARNING,ADVERSE,POSOLOGY
7564,Matricaria chamomilla,"Ache(Ear),Ache(Stomach),Anodyne,Antiphlogistic...","nan,Papatya,Babunnej,Camomile,Manzanilla,Chamo...",Asteraceae,Matricaria,chamomilla,"Baboonig,babuna,babunah camomile,babunj,bunga ...",Internal use Symptomatic treatment of digestiv...,Adjuvant in the treatment of minor in flammato...,"As an antibacterial and antiviral agent, an em...",Camomile is contraindicated in patients with a...,,The presence of lactones in Flos Chamomillae-b...,Internal use Adult dose of flower head: averag...


As we can see, this code successfully added the dictionary values to the dataframe.

### Opening, loading and reading the remaining monographs:

Since every monograph requires substantial modifications to the extraction algorithm, we only extract the data from the first two of the four available monographs. We think this suffices as a proof of concept, and it already provides in-depth data for up to 70 herbs in the Duke dataset.

In [19]:
# Clean-up patterns applied to the text of every extracted page.
# Raw strings are used wherever the regex contains a backslash so that "\(", "\s"
# and "\." are not parsed as (invalid) Python string escapes.
pattern_fi = re.compile("ﬁ")
pattern_fl = re.compile("ﬂ")
pattern_minus = re.compile("–")
pattern_brackets = re.compile(r"\([\s0-9,-]+\)")
pattern_spacedot = re.compile(r" \.")
pattern_spacecomma = re.compile(" ,")
pattern_doublespace = re.compile(r"\s\s+")
pattern_dashspace = re.compile(r"-\s+")


def clean_page_text(text):
    """Normalise the raw text of one PDF page and return the cleaned text.

    The substitutions run in a fixed order: ligatures and en dashes are
    replaced first, then bracketed groups made of digits, commas, hyphens and
    whitespace are removed, then spaces before "." and "," are dropped,
    whitespace runs are collapsed to one space, and finally a hyphen followed
    by whitespace is removed.
    """
    text = re.sub(pattern_fi, "fi", text)
    text = re.sub(pattern_fl, "fl", text)
    text = re.sub(pattern_minus, "-", text)
    text = re.sub(pattern_brackets, "", text)
    text = re.sub(pattern_spacedot, ".", text)
    text = re.sub(pattern_spacecomma, ",", text)
    text = re.sub(pattern_doublespace, " ", text)
    text = re.sub(pattern_dashspace, "", text)
    return text


# Each who_0N_pages list holds one entry per PDF page: the list of cleaned text
# lines of that page. The context managers close the files even if extraction raises.

# 2:

with open('../data/WHO/monograph_02.pdf', 'rb') as who_02_pdf:
    who_02_reader = PyPDF2.PdfReader(who_02_pdf)
    who_02_pages = [
        clean_page_text(page_obj.extract_text()).split("\n")
        for page_obj in who_02_reader.pages
    ]

# 3:
# NOTE(review): who_03_pages is not used in the part of the notebook visible here.

with open('../data/WHO/monograph_03.pdf', 'rb') as who_03_pdf:
    who_03_reader = PyPDF2.PdfReader(who_03_pdf)
    who_03_pages = [
        clean_page_text(page_obj.extract_text()).split("\n")
        for page_obj in who_03_reader.pages
    ]

# 4:
# NOTE(review): who_04_pages is not used in the part of the notebook visible here.

with open('../data/WHO/monograph_04.pdf', 'rb') as who_04_pdf:
    who_04_reader = PyPDF2.PdfReader(who_04_pdf)
    who_04_pages = [
        clean_page_text(page_obj.extract_text()).split("\n")
        for page_obj in who_04_reader.pages
    ]

In [20]:
# Matches a full stop directly followed by letters/digits, plus the rest of the text.
# Same expression as the pattern_formula compiled earlier for the first monograph.
pattern_formula = re.compile(r"\.[A-Za-z0-9]+.*")

### Extracting data from the second WHO monograph:

In [21]:
# 2:

# Not used by any extraction below; kept so the name stays defined for other cells.
reg_pattern = re.compile(r"\.")
# A number followed by a full stop; "Adverse reactions" lines containing such a
# token are not treated as section headings (presumably numbered reference entries).
ref_pattern = re.compile(r"[0-9]+\.")
# In one monograph the "Warnings" heading is glued to the end of the preceding
# sentence; this exact line is accepted as a Warnings heading as well.
warning_exception = "androgen and estrogen metabolism. Warnings"


def drop_incomplete_sentence(text):
    """Cut ``text`` back to its last full stop.

    Text that already ends with "." is returned unchanged. Text without any
    full stop collapses to ".". Empty text is returned as is (instead of
    raising on ``text[-1]``).
    """
    if not text or text[-1] == ".":
        return text
    return ".".join(text.split(".")[:-1]) + "."


def collect_sections(pages, is_heading, is_end, skip=1, keep_last_line=True,
                     strip_formula=True, trim_sentence=True):
    """Collect the text that follows every matching heading line.

    For every line for which ``is_heading(line)`` is true, the lines starting
    ``skip`` lines below the heading are gathered until a line satisfies
    ``is_end`` or the last line of the page is reached. A section never
    continues onto the next page.

    Parameters
    ----------
    pages : list of list of str
        Pages, each given as its list of text lines.
    is_heading, is_end : callable(str) -> bool
        Tests for the heading line and for the line that ends the section.
    skip : int
        Offset of the first collected line relative to the heading
        (1 = the line right after the heading).
    keep_last_line : bool
        If the scan stops on the last line of the page, include that line in
        the section (this also happens when that line is the end marker).
    strip_formula : bool
        Replace everything from the first ``pattern_formula`` match onwards
        with a single ".".
    trim_sentence : bool
        Apply ``drop_incomplete_sentence`` to the result.

    Returns
    -------
    list of str
        One entry per heading found, in page/line order.
    """
    sections = list()
    for page in pages:
        last = len(page) - 1
        for i, line in enumerate(page):
            if not is_heading(line):
                continue
            stop = i + skip
            # The bounds check comes first so that a heading on the last line
            # of a page cannot index past the end of the page.
            while stop < last and not is_end(page[stop]):
                stop += 1
            if keep_last_line and stop == last:
                stop += 1
            text = " ".join(page[i + skip : stop])
            if strip_formula:
                text = re.sub(pattern_formula, ".", text)
            if trim_sentence:
                # in case the description gets cut off, drop the incomplete sentence
                text = drop_incomplete_sentence(text)
            sections.append(text)
    return sections


# All sections of the second monograph are searched from the eleventh extracted page onwards.
who_02_body = who_02_pages[10:]

# Definitions with in-keyword:

who_02_def = list()

for definition_text in collect_sections(
        who_02_body,
        lambda line: "Definition" in line,
        lambda line: line == "Synonyms",
        keep_last_line=False, strip_formula=False, trim_sentence=False):
    # adding Chrysanthemum parthenium to Tanacetum parthenium's definition to create a match with Duke later
    if "Tanacetum parthenium" in definition_text:
        definition_text = "Chrysanthemum parthenium" + definition_text
    who_02_def.append(definition_text)

# Vernacular names with in-keyword:

who_02_vernacular = list()

for vernacular_text in collect_sections(
        who_02_body,
        lambda line: "Selected vernacular names" in line,
        lambda line: line == "Description",
        strip_formula=False, trim_sentence=False):
    vernacular_text = vernacular_text.replace(".", "")

    # Sometimes taxonomic names appear in the WHO document names as a header of sorts:
    for taxon in duke_names:
        vernacular_text = vernacular_text.replace(taxon, ",")

    who_02_vernacular.append(vernacular_text)

# Clinical medicine with in-keyword:

who_02_clinical = collect_sections(
    who_02_body,
    lambda line: "Uses supported by clinical data" in line,
    lambda line: "Uses described in pharmacopoeias and in traditional" in line,
)

# Traditional medicine with in-keyword (two heading spellings occur in this monograph):

who_02_traditional = collect_sections(
    who_02_body,
    lambda line: ("Uses described in pharmacopoeias" in line
                  or "Uses reported in pharmacopoeias" in line),
    lambda line: "Uses described in folk medicine" in line,
)

# Folk with in-keyword:

who_02_folk = collect_sections(
    who_02_body,
    lambda line: "Uses described in folk medicine" in line,
    lambda line: line == "Pharmacology",
)

# Contraindications; lines containing ".)", "and" or "see" are not treated as headings:

who_02_con = list()

for con_text in collect_sections(
        who_02_body,
        lambda line: ("Contraindications" in line
                      and ".)" not in line.lower()
                      and "and" not in line.lower()
                      and "see" not in line.lower()),
        lambda line: line == "Warnings",
        trim_sentence=False):
    # A "Warnings" heading glued to the end of a sentence is not recognised as
    # end marker; cut the text off in front of it before trimming.
    if ". Warnings" in con_text:
        con_text = con_text.split(". Warnings")[0] + "."
    who_02_con.append(drop_incomplete_sentence(con_text))

# Warnings; a regular heading line, or the one glued line in warning_exception:

who_02_warn = collect_sections(
    who_02_body,
    lambda line: (("Warnings" in line
                   and "and" not in line.lower()
                   and "see" not in line.lower())
                  or warning_exception in line),
    lambda line: line == "Precautions",
)

# Adverse reactions; this monograph ends the section at "Dosage forms":

who_02_adv = collect_sections(
    who_02_body,
    lambda line: ("Adverse reactions" in line
                  and "in" not in line.lower()
                  and "and" not in line.lower()
                  and "see" not in line.lower()
                  and not re.search(ref_pattern, line)),
    lambda line: line == "Dosage forms",
)

# Posology; pattern_formula is deliberately not applied to this section:

who_02_pos = collect_sections(
    who_02_body,
    lambda line: ("Posology" in line
                  and "and" not in line.lower()
                  and "see" not in line.lower()),
    lambda line: line == "References",
    strip_formula=False,
)

#### Remove nan values:

In [22]:
# Blank out the placeholder entries in each of the monograph-2 section lists.
for extracted_02 in (
    who_02_clinical,
    who_02_traditional,
    who_02_folk,
    who_02_con,
    who_02_warn,
    who_02_adv,
    who_02_pos,
):
    make_nan(extracted_02)

#### Verify lengths:

In [23]:
# The section lists are combined by position later on, so each of them must have
# one entry per definition. Print the lengths, one per line, for comparison.
for section_list in (
    who_02_def,
    who_02_clinical,
    who_02_traditional,
    who_02_folk,
    who_02_con,
    who_02_warn,
    who_02_adv,
    who_02_pos,
):
    print(len(section_list))

30
30
30
30
30
30
30
30


### Generating the dictionary:

In [24]:
# One dictionary per (monograph definition, Duke taxon) pair in which both the
# first and the second word of the taxon occur in the definition text.
who_02_remedies = list()

for i, definition in enumerate(who_02_def):

    for taxon in duke_names:

        name_parts = taxon.split(" ")

        # some taxonomic names have an "x", indicating hybrids;
        # the marker is ignored when looking for the name in the definition
        if "x" in name_parts:
            name_parts.remove("x")

        # a taxon without a second word cannot be matched on genus + species
        if len(name_parts) < 2:
            continue

        if name_parts[0] in definition and name_parts[1] in definition:
            who_02_remedies.append({
                # Keep the taxon exactly as it is spelled in the Duke table:
                # store_dict_values looks rows up by TAXON == NAME, so a name
                # rebuilt without the "x" would never match a hybrid's row.
                "NAME": taxon,
                "DEF": definition,
                "VERNAC": who_02_vernacular[i],
                "CLINICAL": who_02_clinical[i],
                "TRADITIONAL": who_02_traditional[i],
                "FOLK": who_02_folk[i],
                "CONTRAINDICATION": who_02_con[i],
                "WARNING": who_02_warn[i],
                "ADVERSE": who_02_adv[i],
                "POSOLOGY": who_02_pos[i],
            })

#### Checking for missing herbs:

In [25]:
# A definition is listed once for every remedy whose DEF contains it, so
# definitions matched by several taxa appear several times in `matches`.
matches = [
    definition
    for definition in who_02_def
    for herb in who_02_remedies
    if definition in herb["DEF"]
]

# Definitions for which no remedy was generated at all.
rest = [definition for definition in who_02_def if definition not in matches]

print("Length of matches:", len(matches), "\nLength of rest:", len(rest), end="\n\n")
print(*rest, sep="\n")

Length of matches: 34 
Length of rest: 2

Radix Eleutherococci consists of the dried roots and rhizomes o f Eleutherococcus senticosus (Rupr. and Maxim.) Maxim. (Araliaceae).1
Cortex Pruni Africanae consists of the dried bark of the trunk of Prunus africana (Hook. f.) Kalkman (Rosaceae).


Checking the Duke dataset for matches of other synonyms of the plants, we have found _Tanacetum parthenium_ listed under its synonym _Chrysanthemum parthenium_:

In [26]:
# Substring search on TAXON: selects every taxon whose name contains "parthenium".
filt_tanacetumparthenium = (rich_duke_df["TAXON"].str.contains("parthenium"))
rich_duke_df.loc[filt_tanacetumparthenium]

Unnamed: 0,TAXON,ACTIVITY,CNAME,FAMILY,GENUS,SPECIES,VERNAC,CLINICAL,TRADITIONAL,FOLK,CONTRAINDICATION,WARNING,ADVERSE,POSOLOGY
2564,Chrysanthemum parthenium,"Abortifacient,Ache(Ear),Anemia,Aperient,Cancer...","nan,Varadika,Amargosa,Manzanilla,Matricaria",Asteraceae,Chrysanthemum,parthenium,,,,,,,,


Code similar to the snippet that ensured a match for chamomile was already included in the definition extraction above.

### Adding data from WHO monographs to the reshaped dataset:

In [27]:
# NAME and DEF only serve the matching and have no column of their own.
skip_keys = ["NAME", "DEF"]

# Copy every remaining field of every monograph-2 remedy into the dataframe.
for herb in who_02_remedies:
    for key in herb:
        if key not in skip_keys:
            store_dict_values(herb, key)

Verifying one of the added herbs:

In [28]:
# Select the row stored under the synonym to inspect the values that were added.
filt_chrysanthemumparthenium = rich_duke_df["TAXON"].eq("Chrysanthemum parthenium")
rich_duke_df[filt_chrysanthemumparthenium]

Unnamed: 0,TAXON,ACTIVITY,CNAME,FAMILY,GENUS,SPECIES,VERNAC,CLINICAL,TRADITIONAL,FOLK,CONTRAINDICATION,WARNING,ADVERSE,POSOLOGY
2564,Chrysanthemum parthenium,"Abortifacient,Ache(Ear),Anemia,Aperient,Cancer...","nan,Varadika,Amargosa,Manzanilla,Matricaria",Asteraceae,Chrysanthemum,parthenium,"Acetilla, âghovân, alfinetes de senhora, altam...",Prevention of migraine. Although Herba Tanacet...,,"Treatment of anaemia, arthritis, asthma, commo...",Herba Tanaceti Parthenii is contraindicated in...,,"Dizziness, heartburn, indigestion, inflammatio...",(Unless otherwise indicated) Daily dosage: enc...


#### NAN values:

Replace cells that contain only the string "nan" or an empty string with a proper `np.nan`:

In [33]:
# Placeholder strings that are turned into real missing values.
nan_toreplace = ["nan", ""]
rich_duke_df = rich_duke_df.replace(to_replace=nan_toreplace, value=np.nan)

# Show the Chrysanthemum parthenium row again to inspect the result.
filt_chrysanthemumparthenium = rich_duke_df["TAXON"].eq("Chrysanthemum parthenium")
rich_duke_df[filt_chrysanthemumparthenium]

Unnamed: 0,TAXON,ACTIVITY,CNAME,FAMILY,GENUS,SPECIES,VERNAC,CLINICAL,TRADITIONAL,FOLK,CONTRAINDICATION,WARNING,ADVERSE,POSOLOGY
2564,Chrysanthemum parthenium,"Abortifacient,Ache(Ear),Anemia,Aperient,Cancer...","nan,Varadika,Amargosa,Manzanilla,Matricaria",Asteraceae,Chrysanthemum,parthenium,"Acetilla, âghovân, alfinetes de senhora, altam...",Prevention of migraine. Although Herba Tanacet...,,"Treatment of anaemia, arthritis, asthma, commo...",Herba Tanaceti Parthenii is contraindicated in...,,"Dizziness, heartburn, indigestion, inflammatio...",(Unless otherwise indicated) Daily dosage: enc...


Remove the "nan" string fragments that were created when the table was pivoted with an aggregate function:

In [56]:
def _drop_nan_tokens(cell):
    """Remove literal "nan" entries from a comma-separated string.

    Non-string cells (e.g. a real ``np.nan``) are returned unchanged. If
    nothing is left once the "nan" entries are removed, ``np.nan`` is
    returned so that the cell counts as missing.
    """
    if not isinstance(cell, str):
        return cell
    # Compare whole entries only, so names that merely contain "nan" survive.
    kept = [token for token in cell.split(",") if token.strip() != "nan"]
    return ",".join(kept) if kept else np.nan


# ACTIVITY and CNAME get the same treatment: placeholders are removed whether
# they are the first, a middle or the last entry of the list. Running this
# cell again leaves the data unchanged.
for column in ("ACTIVITY", "CNAME"):
    rich_duke_df[column] = rich_duke_df[column].map(_drop_nan_tokens)

In [58]:
# Inspect the ACTIVITY column.
rich_duke_df["ACTIVITY"]

0        Abortifacient,Antidote,Boil,Burn,Catarrh,Coffe...
1        Antitussive,Boil,Cancer,Cancer(Stomach),Catarr...
2        Aphrodisiac,Asthma,Bite(Snake),Boil,Cancer,Car...
3        Bronchitis,Bruise,Calculus,Catarrh,Cough,Diure...
4        Burn,Cancer,Cold,Cough,Heart,Masticatory,Sore,...
                               ...                        
13074                                             Diarrhea
13075                                             Diuretic
13076                                               Emetic
13077                    Spasm,Vermifuge,Depurative,Poison
13078                         Leucoma,Ophthalmia,Vermifuge
Name: ACTIVITY, Length: 13079, dtype: object

In [59]:
# Display the frame as a visual check; the notebook rendering abbreviates it
# to the first and last rows.
rich_duke_df

Unnamed: 0,TAXON,ACTIVITY,CNAME,FAMILY,GENUS,SPECIES,VERNAC,CLINICAL,TRADITIONAL,FOLK,CONTRAINDICATION,WARNING,ADVERSE,POSOLOGY
0,Abelmoschus esculentus,"Abortifacient,Antidote,Boil,Burn,Catarrh,Coffe...","Huang Shu K'Uei,Bamia,Molondron,Quiabeiro,Gomb...",Malvaceae,Abelmoschus,esculentus,,,,,,,,
1,Abelmoschus manihot,"Antitussive,Boil,Cancer,Cancer(Stomach),Catarr...","Tororo-Aoi,Kastuli",Malvaceae,Abelmoschus,manihot,,,,,,,,
2,Abelmoschus moschatus,"Aphrodisiac,Asthma,Bite(Snake),Boil,Cancer,Car...","Musk Mallow,Moskus,Muskus,Mushk Dana,Ambercice...",Malvaceae,Abelmoschus,moschatus,,,,,,,,
3,Abies alba,"Bronchitis,Bruise,Calculus,Catarrh,Cough,Diure...","Abeto,Edeltanne,Beyaz Koknar,Silver Fir,Abeto ...",Pinaceae,Abies,alba,,,,,,,,
4,Abies balsamea,"Burn,Cancer,Cold,Cough,Heart,Masticatory,Sore,...","Balsam,Canada,Balsam Fir,Fir",Pinaceae,Abies,balsamea,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13074,Zostera marina,Diarrhea,,Zosteraceae,Zostera,marina,,,,,,,,
13075,Zuelania guidonia,Diuretic,Guuaguasi,Flacourtiaceae,Zuelania,guidonia,,,,,,,,
13076,Zuelania roussoviana,Emetic,,Flacourtiaceae,Zuelania,roussoviana,,,,,,,,
13077,Zygophyllum fabago,"Spasm,Vermifuge,Depurative,Poison","Khennaiq Ad Dijaj,Morsana",Zygophyllaceae,Zygophyllum,fabago,,,,,,,,


#### Checking the enriched entries:

In [60]:
# Keep only the rows that have a VERNAC value (used here as the marker for
# enriched entries) and show the columns relevant for the check.
temp_df = rich_duke_df[rich_duke_df["VERNAC"].notna()]
enriched_view_columns = [
    "TAXON",
    "ACTIVITY",
    "CNAME",
    "VERNAC",
    "CLINICAL",
    "TRADITIONAL",
    "FOLK",
    "CONTRAINDICATION",
    "WARNING",
    "ADVERSE",
    "POSOLOGY",
]
temp_df[enriched_view_columns]

Unnamed: 0,TAXON,ACTIVITY,CNAME,VERNAC,CLINICAL,TRADITIONAL,FOLK,CONTRAINDICATION,WARNING,ADVERSE,POSOLOGY
307,Aesculus hippocastanum,"Analgesic,Anodyne,Astringent,Back,Fever,Narcot...","Paarde Kastanje,Ippocastano,Chataigne De Cheva...","Abu farwat el hhussan, castagna amare, castagn...","Internally, for treatment of symptoms of chron...",Treatment of coronary heart disease.,Treatment of bacillary dysentery and fevers. A...,Semen Hippocastani is contraindicated in cases...,,Case reports have indicated gastrointestinal s...,(Unless otherwise indicated) Daily dosage: 250...
459,Allium cepa,"Abscess,Ache(Ear),Ache(Stomach),Albuminuria,Al...","Hu Ts'Ung,Oignon,Cebolla,Sogan,Basal,Cabolla,O...","It is most commonly known as “onion”,Basal,bas...",The principal use of Bulbus Allii Cepae today ...,Treatment of bacterial infections such as dyse...,"As an anthelminthic, aphrodisiac, carminative,...",Allergies to the plant. The level of safety of...,No warnings have been reported.,Allergic reactions such as rhinoconjunctivitis...,Unless otherwise prescribed: a daily dosage is...
473,Allium sativum,"Ache(Ear),Ache(Stomach),Alopecia,Antidote,Anti...","Thum,Suan,Hsiao Suan,Sarimsak,Ail,Sir,Ajo,Garl...","It is most commonly known as “garlic”,Ail,ail ...",As an adjuvant to dietetic management in the t...,The treatment of respiratory and urinary tract...,"As an aphrodisiac, antipyretic, diuretic, emme...",Bulbus Allii Sativi is contraindicated in pati...,Consumption of large amounts of garlic may inc...,Bulbus Allii Sativi has been reported to evoke...,"Unless otherwise prescribed, average daily dos..."
508,Aloe africana,Tumor,Aloes,"Aloe capensis,aloe curacao,aloe vera,aloes,alo...",Short-term treatment of occasional constipation.,,"Treatment of seborrhoeic dermatitis, peptic ul...","As with other stimulant laxatives, products co...",Aloe-containing products should be used only i...,Abdominal spasms and pain may occur after even...,The correct individual dose is the smallest am...
515,Aloe ferox,"Aperient,Tumor,Medicine",Aloes Do Cabo,"Aloe capensis,aloe curacao,aloe vera,aloes,alo...",Short-term treatment of occasional constipation.,,"Treatment of seborrhoeic dermatitis, peptic ul...","As with other stimulant laxatives, products co...",Aloe-containing products should be used only i...,Abdominal spasms and pain may occur after even...,The correct individual dose is the smallest am...
...,...,...,...,...,...,...,...,...,...,...,...
12460,Urtica dioica,"Ache,Ache(Back),Alopecia,Anodyne,Asthma,Ataxia...","Isirgan,Bigstring Nettle,Stinging Nettle,Gerra...",", L Brennesselwurzel, common nettle, csalángyö...",Symptomatic treatment of lower urinary tract d...,As a diuretic and for the treatment of rheumat...,"Treatment of asthma, coughs, dandruff, diabete...",Radix Urticae is contraindicated in cases of k...,Radix Urticae relieves the symptoms associated...,Clinical studies have shown that extracts of R...,(Unless otherwise indicated) Daily dosage: 4-6...
12470,Urtica urens,"Cancer,Lactogogue,Tumor,Diuretic","Ortiga Commun,Ortiga Menor,Dog Nettle,Ortiga",", L Brennesselwurzel, common nettle, csalángyö...",Symptomatic treatment of lower urinary tract d...,As a diuretic and for the treatment of rheumat...,"Treatment of asthma, coughs, dandruff, diabete...",Radix Urticae is contraindicated in cases of k...,Radix Urticae relieves the symptoms associated...,Clinical studies have shown that extracts of R...,(Unless otherwise indicated) Daily dosage: 4-6...
12510,Valeriana officinalis,"Ache(Back),Bruise,Carminative,Fever,Flu,Hypoch...","Valerian,Kediotu,Seiyo-Kanoko-So,Valeriana","All heal,akar pulepandak,amantilla,balderbrack...",As a mild sedative and sleep-promoting agent. ...,"As a digestive aid, and an adjuvant in spasmol...","To treat epilepsy, gum sores, headaches, nause...",Radix Valerianae should not be used during pre...,,Minor side-effects have been associated with c...,"Dried root and rhizome, 2 -3g drug per cup by ..."
13028,Zingiber officinale,"Ache(Stomach),Alopecia,Amenorrhea,Anodyne,Anti...","Chiang,Gingembre,Sheng Chiang,Ginger,Gan Jiang...","Ada,adrak,adu,African ginger,ajenjibre,ale,ale...",The prophylaxis of nausea and vomiting associa...,"The treatment of dyspepsia, flatulence, colic,...","To treat cataracts, toothache, insomnia, baldn...",,,Contact dermatitis of the finger tips has been...,For motion sickness in adults and children mor...


## Export dataframe to json:

Exporting to .json file (different formats, see https://datatofish.com/export-pandas-dataframe-json/):

In [61]:
# Write the enriched frame as a JSON array with one object per row.
rich_duke_df.to_json(path_or_buf=r"../output/remedies.json", orient="records")