In [2]:
import csv
import pandas as pd
import numpy as np
from owlready2 import *

In [3]:
# desired header order:
 # iri	iri label	pathogen	pathogen label	host	host label	plantstructure	plantstructure label

 # iri  (OOPS:xxx)
 # iri_label (disease common name)
 # pathogen (NCBITaxon:xxxxxx)
 # pathogen_label ( pathogen name )
 # host (NCBITaxon:xxxxxx)
 # host_label (host common name)
 # plantstructure (PO:xxx)
 # plantstructure_label (common name)


In [4]:
raw_scrape_file = 'aps_with_po_mappings.tsv'

In [5]:
twelve = 0
fourteen = 0
other = 0

# Name formatting function

It will be important to have a function that can dynamically name the diseases without too much repetition.
The function will be fed a name (full of weird parenthesis and references), a pathogen (from NCBI), a host (also from NCBI), and a plant part (whole plant is used as a default).

if a plant part is included, that plant part should be used as part of the name.  But if the 'part' is the whole plant, that part is not to be included in the name.

The important part of this function is that it doesn't do any of the column parsing or anything like dataframe stuff.  All reading of OG files and splitting of them into dataframes should be done outside of this function, so this function can be reused, no mater how the original data is parsed.

In [6]:
def disease_name_formatter(rawname,host,pathogen,part):
    """
    function to format the names of diseases.  
    Requires:
    rawname - the name that was given in the APS scrape
    host - obvious
    pathogen - obvious
    part - if not mentioned explicitly in 'aps_with_po_mappings' (from MAL), should defalt to "whole plant"
    
    """
    # start by striping any bracketed reference, and whitespace.
    short_name = rawname.split('(')[0].strip()
    pathogenname = pathogen.split('(')[0].strip()
    hostname = host.split('()')[0].strip()
    new_name = None
    
    # check that the pieces are all strings
    
    
    if pathogenname in short_name and host in short_name:
        new_name = short_name
        print('A -',new_name)
        # return('A')
    elif pathogenname in short_name:
        new_name = '{aps_name} of {host}'.format(aps_name=short_name, host=host)
        print('B -', new_name)
#         return('B')
    elif host in short_name:
        new_name = '{pathogen} {aps_name}'.format(pathogen=pathogenname,aps_name=short_name)
        print('C -', new_name)
#         return('C')
    else:
        new_name = '{aps_name} {pathogen} of {host}'.format(aps_name=short_name, pathogen=pathogenname, host=host)
        print('D -', new_name )
#         return('D')
    return(new_name)


##### ORIGINAL RESULTS #####
# A - 33
# B - 2252
# C - 511
# D - 8956
# 11752

In [7]:
# print the first line of the file:  Shows there is no header.
with open(raw_scrape_file,'r') as infile:
    first_line = infile.readline()
    line = first_line.split('\t')
    print(line)
    print(len(line))

['Lenticel spotting', 'Heavy rains or prolonged post-harvest dips', 'Heavy rains or prolonged post-harvest dips', 'no:match', 'pathogen of', 'http://purl.obolibrary.org/obo/RO_0002556', 'Diseases of Mango (Mangifera indica L.)', 'Mangifera indica L.', 'NCBITaxon:29780', 'K. Pernezny and G. W. Simone, primary collators (last update 11/09/00). Diseases of Mango (Mangifera indica L.). The American Phytopathological Society. Accessed on 2016-09-07 at http://www.apsnet.org/publications/commonnames/Pages/Mango.aspx', 'http://www.apsnet.org/publications/commonnames/Pages/Mango.aspx', '09/07/2016', 'http://purl.obolibrary.org/obo/PO_0000031', 'lenticel\n']
14


In [8]:
names = None

with open(raw_scrape_file,'r') as infile:
    counts = []
    for line in infile:
        # print(len(line.split('\t')))
        l = line.split('\t')
        # print(l[0])
        name = disease_name_formatter(l[0],l[7],l[2],"leaf")  #passing in 'leaf' as the default.
        counts.append(name)
        if len(line.split('\t')) == 12:
            twelve += 1
        elif len(line.split('\t')) == 14:
            fourteen += 1
        else:
            other += 1
    names = counts

#     print(names)

D - Lenticel spotting Heavy rains or prolonged post-harvest dips of Mangifera indica L.
D - Lenticel proliferation Extended period of soil flooding of Ipomoea batatas [L.] Lam.
D - Infectious bud failure Prunus necrotic ring spot virus of Mill.) Webb
D - Peach yellow bud mosaic Tomato ringspot virus of Mill.) Webb
D - Noninfectious bud failure = Crazy top Genetic of Mill.) Webb
D - Bud and twig blight Pycnostysanus azalea of Rhododendron spp.
D - Bud and twig blight Briosia azaleae Peck of Rhododendron spp.
D - Fusarium bud rot Fusarium tricinctum of Dianthus caryophylium L.
D - Bud union crease Virus for some combinations of Citrus spp.
B - Bud union crease of Citrus spp.
B - Bud union crease of Citrus spp.
D - Bacterial bud rot Erwinia spp. of Cocos nucifera L.
D - Bud rot Phytophthora palmivora of Cocos nucifera L.
D - Bud rot Phytophthora heveae A. Thompson of Cocos nucifera L.
D - Bud rot Phytophthora katsurae Ko & Chang of Cocos nucifera L.
D - Bud rot Phytophthora nicotianae Bre

D - Rusty root Olpidium brassicae of Hoffm.) Arcang.
B - Rusty root of Hoffm.) Arcang.
B - Rusty root of Hoffm.) Arcang.
D - Violet root rot Helicobasidium brebissonii of Hoffm.) Arcang.
D - Violet root rot Rhizoctonia crocorum of Hoffm.) Arcang.
B - Violet root rot of Hoffm.) Arcang.
D - Root knot Meloidogyne hapla Chitwood of Hoffm.) Arcang.
D - Root scab Physiological of Hoffm.) Arcang.
D - Armillaria root rot Armillaria mellea of Manihot esculenta Crantz
B - Armillaria root rot of Manihot esculenta Crantz
D - Armillaria root rot Rhizomorpha subcorticalis Pers. of Manihot esculenta Crantz
D - Black root and stem rot Scytalidium sp. of Manihot esculenta Crantz
D - Black root and stem rot Hendersonula toruloidea Nattras of Manihot esculenta Crantz
D - Black root and stem rot Hendersonula toruloidea Nattras of Manihot esculenta Crantz
D - Dematophora root rot Dematophora necatrix R. Hartig of Manihot esculenta Crantz
D - Dematophora root rot Rosellinia necatrix Prill. of Manihot escule

C - Cylindrocarpon tenue Bugnicourt Root rot
C - Cylindrocladiella camelliae Root rot
C - Cylindrocladium camelliae Venkataramani & Venkata Ram Root rot
C - Cylindrocladium clavatum C. S. Hodges & L.C. May Root rot
C - Fomes lamaoensis Root rot
C - Ganoderma applanatum Root rot
C - Ganoderma lucidum Root rot
A - Root rot
C - Hypoxylon asarcodes Tarry root rot
C - Sphaerostilbe repens Berk. & Broome Violet root rot
C - Rigidoporus microporus White root rot
A - White root rot
C - Xylaria sp. Xylaria root rot
D - Black root rot Thielaviopsis basicola of Nicotiana tabacum L.
D - Brown root rot Pratylenchus spp. of Nicotiana tabacum L.
D - Root-knot Meloidogyne arenaria of Nicotiana tabacum L.
D - Root-knot Meloidogyne hapla Chitwood of Nicotiana tabacum L.
D - Root-knot Meloidogyne incognita of Nicotiana tabacum L.
D - Root-knot Meloidogyne javanica of Nicotiana tabacum L.
D - Stubby-root Paratrichodorus spp. of Nicotiana tabacum L.
D - Stubby-root Trichodorus spp. of Nicotiana tabacum L.


D - Stigmina leaf spot Stigmina palmivora of Cocos nucifera L.
D - Leaf scorch decline Not known of Cocos nucifera L.
D - Algal Cephaleuros virescens Kunze of Coffea arabica
D - Algal Cephaleuros virescens Kunze of Coffea canephora
D - Brown leaf spot Phoma costarricensis Ech. of Coffea arabica
D - Brown leaf spot Phoma costarricensis Ech. of Coffea canephora
D - Leaf blight Ascochyta tarda Stewart of Coffea arabica
D - Leaf blight Ascochyta tarda Stewart of Coffea canephora
D - Leaf spot Phyllosticta coffeicola Speg. of Coffea arabica
D - Leaf spot Phyllosticta coffeicola Speg. of Coffea canephora
D - Rust Hemileia vastatrix Berk. & Br. of Coffea arabica
D - Rust Hemileia vastatrix Berk. & Br. of Coffea canephora
D - South America leaf spot Mycena citricola of Coffea arabica
B - South America leaf spot of Coffea arabica
D - South America leaf spot Mycena citricola of Coffea canephora
B - South America leaf spot of Coffea canephora
D - Bacterial leaf spot Pseudomonas cichorii of Thunb.

C - Cochliobolus carbonum Nelson Leaf spot
C - Hendersonia theicola Cooke Leaf spot
C - Pestalotiopsis adusta Ellis & Everh. Leaf spot
C - Phaeosphaerella theae Petch Leaf spot
C - Pleospora theae Speschnew Leaf spot
C - Elsinoe theae Bitancourt & Jenkins Leaf scab
C - Phyllosticta erratica Ellis & Everh. Phyllosticta leaf spot
C - Phyllosticta theae Speschnew Phyllosticta leaf spot
C - Phoma theicola Petch Red leaf spot
D - Angular leaf spot Pseudomonas syringae pv. tabaci of Nicotiana tabacum L.
D - Leaf gall Rhodococcus fascians of Nicotiana tabacum L.
D - Frogeye leaf spot Cercospora nicotianae Ellis & Everh. of Nicotiana tabacum L.
D - Phyllosticta leaf spot Phyllosticta nicotiana Ellis & Everh. of Nicotiana tabacum L.
D - Ragged leaf spot Phoma exigua Desmaz. var. exigua of Nicotiana tabacum L.
D - Ragged leaf spot Ascochyta phaseolorum Sacc. of Nicotiana tabacum L.
D - Tobacco leaf curl Tobacco leaf curl virus of Nicotiana tabacum L.
D - Ascochyta leaf blight : C3,C4 Ascochyta a

D - Downy mildew Peronospora farinosa of Beta vulgaris L.
B - Downy mildew of Beta vulgaris L.
B - Downy mildew of Beta vulgaris L.
B - Downy mildew of Beta vulgaris L.
B - Downy mildew of Beta vulgaris L.
D - Fusarium yellows Fusarium oxysporum Schlechtend. of Beta vulgaris L.
B - Fusarium yellows of Beta vulgaris L.
D - Phytophthora wet rot Phytophthora drechsleri Tucker of Beta vulgaris L.
D - Powdery mildew Erysiphe polygoni DC. of Beta vulgaris L.
D - Powdery mildew Erysiphe betae Weltzien of Beta vulgaris L.
D - Rust Uromycesbetae J. Kickx fil. of Beta vulgaris L.
D - Slime molds* Physarum cinereum of Beta vulgaris L.
D - Storage rots Botrytis cinerea Pers. of Beta vulgaris L.
B - Storage rots of Beta vulgaris L.
D - Storage rots Botryotinia fuckeliana of Beta vulgaris L.
D - Storage rots Penicillium spp. of Beta vulgaris L.
D - Storage rots Phoma betae A. B. Frank of Beta vulgaris L.
D - Verticillium wilt Verticillium albo-atrum Reinke & Berthier of Beta vulgaris L.
D - Beet cys

D - Dodder Cuscuta compacta Juss. of Ait.) Pers.
D - Dodder Cuscuta gronovii Willd. of Ait.) Pers.
D - Flooding injury Oxygen deficiency of Ait.) Pers.
D - Purple berry Physiological of Ait.) Pers.
D - Sterile breakdown Physiological of Ait.) Pers.
D - Sun scald Overheating of fruit from direct exposure to sun of Ait.) Pers.
D - Bacterial soft rot Erwinia Erwinia carotovora of Brassica
D - Bacterial soft rot Erwinia Erwinia carotovora of Raphanus
D - Bacterial soft rot Pseudomonas Pseudomonas marginalis pv. marginalis of Brassica
D - Bacterial soft rot Pseudomonas Pseudomonas marginalis pv. marginalis of Raphanus
D - Black rot Xanthomonas campestris pv. campestris of Brassica
D - Black rot Xanthomonas campestris pv. campestris of Raphanus
D - Crown gall Agrobacterium tumefaciens of Brassica
D - Crown gall Agrobacterium tumefaciens of Raphanus
D - Anthracnose Colletotrichum higginsianum Sacc. in Higgins of Brassica
D - Anthracnose Colletotrichum higginsianum Sacc. in Higgins of Raphanus

D - Downy mildew Peronospora chlorae deBary of Raf.) Shinn
D - Fusarium wilt Fusarium oxysporum of Raf.) Shinn
B - Fusarium wilt of Raf.) Shinn
D - Bean yellow mosaic Bean yellow mosaic virus of Raf.) Shinn
D - Cucumber mosaic Cucumber mosaic virus of Raf.) Shinn
D - Impatiens necrotic spot Impatiens necrotic spot virus of Raf.) Shinn
D - Lisianthus necrosis Lisianthus necrosis virus of Raf.) Shinn
D - Tobacco mosaic Tobacco mosaic virus of Raf.) Shinn
D - Bacterial black spot = bacterial canker Xanthomonas campestris pv. mangiferaeindicae of Mangifera indica L.
D - Bacterial black spot = bacterial canker Moniz of Mangifera indica L.
B - Bacterial black spot = bacterial canker of Mangifera indica L.
D - Bacterial black spot = bacterial canker Ribeiro and Kimura 1974 of Mangifera indica L.
D - Crown gall Agrobacterium tumefaciens of Mangifera indica L.
D - Anthracnose Colletotrichum gloeosporioides of Mangifera indica L.
D - Anthracnose Glomerella cingulata of Mangifera indica L.
D - An

D - Pear scab Venturia pirina Aderh. of Pyrus communis L.
D - Pear scab Fusicladium pyrorum of Pyrus communis L.
D - Perennial canker Neofabrae perennans Kienholz of Pyrus communis L.
D - Perennial canker Cryptosporiopsis perennans of Pyrus communis L.
D - Pink mold rot Trichothecium roseum of Pyrus communis L.
B - Pink mold rot of Pyrus communis L.
D - Pink mold rot Cephalothecium roseum Corda of Pyrus communis L.
D - Powdery mildew Podosphaera leucotricha of Pyrus communis L.
D - Pythium dieback Pythium spp. of Pyrus communis L.
D - Rhizopus rot Rhizopus stolonifer of Pyrus communis L.
B - Rhizopus rot of Pyrus communis L.
D - Rust, Kern's pear Gymnosporangium kernianum Bethel of Pyrus communis L.
D - Rust, Pacific Coast pear Gymnosporangium libocedri of Pyrus communis L.
D - Rust, pear trellis Gymnosporangium fuscum R. Hedw. in DC. of Pyrus communis L.
D - Rust, Rocky Mountain pear Gymnosporangium nelsonii Arth. of Pyrus communis L.
D - Side rot Phialophora malorum of Pyrus communis

C - Rhizoctonia solani KÌ_hn Damping-off
C - Thanatephorus cucumeris Damping-off
C - Pythium aphanidermatum Damping-off
C - Pythium debaryanum Auct. non R. Hesse Damping-off
C - Pythium irregulare Buisman Damping-off
C - Pythium myriotylum Drechs. Damping-off
C - Pythium ultimum Trow Damping-off
C - Peronospora manshurica Downy mildew
C - Drechslera glycines Narayanasamy & Durairj Drechslera blight
C - Microsphaera diffusa Cooke & Peck Powdery mildew
C - Pythium aphanidermatum Pythium rot
C - Pythium debaryanum Auct. non R. Hesse Pythium rot
C - Pythium irregulare Buisman Pythium rot
C - Pythium myriotylum Drechs. Pythium rot
C - Pythium ultimum Trow Pythium rot
C - Cylindrocladium crotalariae Red crown rot
C - Calonectria crotalariae Red crown rot
A - Red crown rot
C - Rhizoctonia solani KÌ_hn Rhizoctonia aerial blight
C - Thanatephorus cucumeris Rhizoctonia aerial blight
C - Phakopsora pachyrhizi Syd. Rust*
C - Spaceloma glycines Kurata & Kuribayashi Scab*
C - Fusarium solani Sudden 

B - Tar spot : C3,C4 of Plantae
B - Tar spot : C3,C4 of Plantae
B - Tar spot : C3,C4 of Plantae
B - Tar spot : C3,C4 of Plantae
D - Typhula blight : C3 Typhula incarnata Fr. of Plantae
D - Typhula blight : C3 Typhula ishikariensis Imai var. canadensis J. D. Smith & ÌÉrsvoll of Plantae
D - Typhula blight : C3 Typhula ishikariensis Imai var. idahoensis ÌÉrsvoll & J. D. Smith of Plantae
D - Typhula blight : C3 Typhula ishikariensis Imai var. ishikariensis ÌÉrsvoll & J. D. Smith of Plantae
D - White blight : C3 Melanotus phillipsii of Plantae
D - Yellow patch : C3 Rhizoctonia cerealis Van der Hoeven of Plantae
D - Yellow patch : C3 Ceratobasidium cereale D. Murray & L. L. Burpee [teleomorph] of Plantae
D - Yellow tuft Sclerophthora macrospora of Plantae
B - Yellow tuft of Plantae
B - Yellow tuft of Plantae
D - Yellow ring : C3 Trechispora alnicola of Plantae
D - Yellow ring : C3 Yellow tuft of Plantae
D - Yellow ring : C3 See downy mildew of Plantae
D - Centipedegrass mosaic : E Panicum mo

In [9]:
####  THIS WON"T WORK ANYMORE AS I"VE CHANGED THE FUNCION
    
# print('A - {}\nB - {}\nC - {}\nD - {}'.format(names.count('A'),names.count('B'),names.count('C'),names.count('D')))
# print(len(counts))
# print(df.index.values)

In [10]:
print('twelve = {}\nfourteen = {}\nother = {}'.format(twelve,fourteen, other))

twelve = 6478
fourteen = 5274
other = 0


In [11]:
column_headers = ["disease_name", "source_taxon_verbatim_name", "source_taxon_name" "source_taxon_id", "interaction_type_label", "interaction_type_id", "target_taxon_verbatim_name", "target_taxon_name", "target_taxon_id", "source_citation", "source_url", "source_accessed_at", "plant_part_id", "plant_part_name"]
print(type(column_headers),len(column_headers))


<class 'list'> 13


In [12]:
# df = pd.read_csv(raw_scrape_file, sep='\t', header=None, names=column_headers)
# df1 = pd.read_csv(raw_scrape_file, sep='\t', header=None)
column_headers_plus_index = column_headers = ["index", "disease_name", "source_taxon_verbatim_name", "source_taxon_name" "source_taxon_id", "interaction_type_label", "interaction_type_id", "target_taxon_verbatim_name", "target_taxon_name", "target_taxon_id", "source_citation", "source_url", "source_accessed_at", "plant_part_id", "plant_part_name"]
df = pd.read_csv(raw_scrape_file, sep='\t', header=None, names=column_headers_plus_index)

# df1.columns = column_headers
# print(len(list(df)),'\n\n\n', len(list(df1)))
# pandas uses the first column as an index, so the first one is indexed on "disease_name"
# print(list(df),'\n\n\n', list(df1))


For some reason, the headers are missing on this new dataframe.

Old headers:  ["disease_name", "source_taxon_verbatim_name", "source_taxon_name"	"source_taxon_id", "interaction_type_label", "interaction_type_id", "target_taxon_verbatim_name", "target_taxon_name", "target_taxon_id", "source_citation", "source_url", "source_accessed_at", "plant_part_id", "plant_part_name"]

In [13]:
list(df)
# 
# print(df['disease_name'])

['index',
 'disease_name',
 'source_taxon_verbatim_name',
 'source_taxon_namesource_taxon_id',
 'interaction_type_label',
 'interaction_type_id',
 'target_taxon_verbatim_name',
 'target_taxon_name',
 'target_taxon_id',
 'source_citation',
 'source_url',
 'source_accessed_at',
 'plant_part_id',
 'plant_part_name']

In [14]:
print(df['source_taxon_verbatim_name'])

0              Heavy rains or prolonged post-harvest dips
1                        Extended period of soil flooding
2                         Prunus necrotic ring spot virus
3                                   Tomato ringspot virus
4                                                 Genetic
5                 Pycnostysanus azalea (Peck) E. W. Mason
6                                    Briosia azaleae Peck
7                       Fusarium tricinctum (Corda) Sacc.
8                             Virus for some combinations
9                                                     NaN
10                                                    NaN
11                                           Erwinia spp.
12       Phytophthora palmivora (E.J. Butler) E.J. Butler
13                        Phytophthora heveae A. Thompson
14                       Phytophthora katsurae Ko & Chang
15                  Phytophthora nicotianae Breda de Haan
16                          Fusarium moniliforme J. Sheld
17            

In [15]:
# to check what each of these is:
#'index' 
# 'disease_name'
# 'source_taxon_verbatim_name'
# 'source_taxon_namesource_taxon_id'   --- looks to be pathogen ID
# 'interaction_type_label'
# 'interaction_type_id'
# 'target_taxon_verbatim_name'
# 'target_taxon_name'
# 'target_taxon_id'                    --- looks to be host ID
# 'source_citation'
# 'source_url'
# 'source_accessed_at'
# 'plant_part_id'
# 'plant_part_name'


# print(df["source_taxon_namesource_taxon_id"])
# print(df["target_taxon_id"])
print(df["disease_name"])
# print(df["plant_part_name"])
# print(df["plant_part_name"])

0               Heavy rains or prolonged post-harvest dips
1                         Extended period of soil flooding
2        genus Ilarvirus, Prunus necrotic ring spot vir...
3           genus Nepovirus, Tomato ringspot virus (ToRSV)
4                                                  Genetic
5                  Pycnostysanus azalea (Peck) E. W. Mason
6                              (syn. Briosia azaleae Peck)
7                        Fusarium tricinctum (Corda) Sacc.
8        Virus for some combinations; otherwise, geneti...
9        Virus for some combinations; otherwise, geneti...
10       Virus for some combinations; otherwise, geneti...
11                                            Erwinia spp.
12        Phytophthora palmivora (E.J. Butler) E.J. Butler
13                                   P. heveae A. Thompson
14                                  P. katsurae Ko & Chang
15                             P. nicotianae Breda de Haan
16                           Fusarium moniliforme J. She

# fill in blank plant parts with "whole plant"

Marie's script only filled in 'plant_part_id' and 'plant_part_name' for rows where a plant part could be found in the name/row.  This left a lot of blanks (NaN) in the pandas dataframe.  These need to be filled to prevent errors down the road.
# RESOLVED
## issue:  The plant parts that MAL filled in the 'plant_part_name' column came back as the synonym:  That's why you see the parenthesis after the actual name (eg: 'climbing root (narrow)')

In [16]:
df.plant_part_id.fillna('http://purl.obolibrary.org/obo/PO_0000003', inplace=True)
df.plant_part_name.fillna('whole plant', inplace=True)

#check that it worked:
len(df.columns)

14

In [17]:
# print the plant part, and it's ID to make sure the NaNs got filled and the synonym issue was resolved.  (it was.)
for index, row in df.iterrows():
#     print(row['plant_part_id'])    
    print(row['plant_part_name'], row['plant_part_id'])

lenticel http://purl.obolibrary.org/obo/PO_0000031
lenticel http://purl.obolibrary.org/obo/PO_0000031
bud http://purl.obolibrary.org/obo/PO_0000055
bud http://purl.obolibrary.org/obo/PO_0000055
bud http://purl.obolibrary.org/obo/PO_0000055
branch http://purl.obolibrary.org/obo/PO_0025073
branch http://purl.obolibrary.org/obo/PO_0025073
bud http://purl.obolibrary.org/obo/PO_0000055
bud http://purl.obolibrary.org/obo/PO_0000055
bud http://purl.obolibrary.org/obo/PO_0000055
bud http://purl.obolibrary.org/obo/PO_0000055
bud http://purl.obolibrary.org/obo/PO_0000055
bud http://purl.obolibrary.org/obo/PO_0000055
bud http://purl.obolibrary.org/obo/PO_0000055
bud http://purl.obolibrary.org/obo/PO_0000055
bud http://purl.obolibrary.org/obo/PO_0000055
bud http://purl.obolibrary.org/obo/PO_0000055
bud http://purl.obolibrary.org/obo/PO_0000055
bud http://purl.obolibrary.org/obo/PO_0000055
bud http://purl.obolibrary.org/obo/PO_0000055
bud http://purl.obolibrary.org/obo/PO_0000055
bud http://purl.ob

root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://p

root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://p

root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://purl.obolibrary.org/obo/PO_0009005
root http://p

leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://p

leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://purl.obolibrary.org/obo/PO_0025034
leaf http://p

whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
capitulum inflorescence http://purl.obolibrary.org/obo/PO_0030121
capitulum inflorescence http://purl.obolibrary.org/obo/PO_0030121
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whol

whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
follicle fruit http://purl.obolibrary.org/obo/PO_0030105
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
trichome http://purl.obolibrary.org/obo/PO_0000282
follicle fruit http://purl.obolibrary.org/obo/PO_0030105
follicle fruit http://purl.obolibrary.org/obo/PO_0030105
follicle fruit http://purl.obolibrary.org/obo/PO_0030105
follicle fruit http://purl.obolibrary.org/obo/PO_0030105
follicle fruit http://purl.obolibrary.org/obo/PO_0030105
follicle frui

whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
ear inflorescence http://purl.obolibrary.org/obo/PO_0020136
ear inflorescence http://purl.obolibrary.org/obo/PO_0020136
flower fascicle http://purl.obolibrary.org/obo/PO_0025323
flower fascicle http://purl.obolibrary.org/obo/PO_0025323
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
ear inflorescence http://purl.obolibrary.org/obo/PO_0020136
ear inflorescence http://purl.obolibrary.org/obo/PO_0020

whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obol

whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obol

whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obol

whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obol

whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obol

whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obolibrary.org/obo/PO_0000003
whole plant http://purl.obol

# Creating a new DataFrame using only the columns I need for patternapply.py



The data must be in a specific format for digestion using patternapply.py.  So the data must be massaged using python's pandas.

## Steps:

1. Make a a copy of the original dataframe to avoid confusion

2. Remove all columns not needed (keep disease_name to feed disease_name_formatter())

3. Add additional columns needed by 'pattern_apply.py'  (iri, and iri label)

4. Rename all column headers to comply to pattern

5. Remove all rows with missing data (maybe move them to new 'rejects.tsv' for later use)

6. write remaining df to file

7. run pattern_apply.py on it.


## desired header order:
iri, iri label, pathogen, pathogen label, host, host label, plantstructure, plantstructure label

 [iri (OOPS:xxx), iri_label (disease common name), pathogen (NCBITaxon:xxxxxx), pathogen_label ( pathogen name ), host (NCBITaxon:xxxxxx), host_label (host common name), plantstructure (PO:xxx), plantstructure_label (common name)]


In [18]:
print(list(df))


['index', 'disease_name', 'source_taxon_verbatim_name', 'source_taxon_namesource_taxon_id', 'interaction_type_label', 'interaction_type_id', 'target_taxon_verbatim_name', 'target_taxon_name', 'target_taxon_id', 'source_citation', 'source_url', 'source_accessed_at', 'plant_part_id', 'plant_part_name']


In [19]:
# start with a copy of the original dataframe.
patterndf = df.copy()

# remove unused columns
patterndf.drop("interaction_type_label", axis=1, inplace=True)
patterndf.drop("interaction_type_id", axis=1, inplace=True)
# patterndf.drop("disease_name", axis=1, inplace=True)
patterndf.drop("source_citation", axis=1, inplace=True)
patterndf.drop("source_url", axis=1, inplace=True)
patterndf.drop("source_accessed_at", axis=1, inplace=True)
patterndf.drop("target_taxon_verbatim_name", axis=1, inplace=True)

print(list(patterndf))

['index', 'disease_name', 'source_taxon_verbatim_name', 'source_taxon_namesource_taxon_id', 'target_taxon_name', 'target_taxon_id', 'plant_part_id', 'plant_part_name']


In [20]:
# add iri and iri_name columns
patterndf['iri'] = 'OOPS:xxxxxxx'  # in need of a system to assign ID numbers...
# patterndf['iri_label'] = ''  # we will keep the original disease name here for now.
patterndf.rename(columns = {'disease_name':'iri_label'}, inplace = True)


# rename column headers to accurately reflect the patternapply needs
patterndf.rename(columns = {'source_taxon_verbatim_name':'pathogen_label'}, inplace = True)
patterndf.rename(columns = {'source_taxon_namesource_taxon_id':'pathogen'}, inplace = True)
patterndf.rename(columns = {'target_taxon_id':'host'}, inplace = True)
patterndf.rename(columns = {'target_taxon_name':'host_label'}, inplace = True)
patterndf.rename(columns = {'plant_part_id':'plantstructure'}, inplace = True)
patterndf.rename(columns = {'plant_part_name':'plantstructure_label'}, inplace = True)






# print(patterndf['target_taxon_verbatim_name'])

print(list(patterndf))


['index', 'iri_label', 'pathogen_label', 'pathogen', 'host_label', 'host', 'plantstructure', 'plantstructure_label', 'iri']


In [21]:
## reorder the columns
# desired order iri	iri label	pathogen	pathogen label	host	host label	plantstructure	plantstructure label
desired_order = ['iri', 'iri_label','pathogen','pathogen_label','host','host_label','plantstructure','plantstructure_label']

patterndf = patterndf[desired_order]

print(list(patterndf))

['iri', 'iri_label', 'pathogen', 'pathogen_label', 'host', 'host_label', 'plantstructure', 'plantstructure_label']


# to do: Tuesday

1. Move all rows with a "no:match" in either pathogen or host column into a "reject.df"
2. add the raw name column back
2a. run the df through the renamer, and replace bad names
2b. re-write the .tsv file
3. run patternapply.py
4. migrate the new df to the Planteome inca-form (https://github.com/Planteome/planteome-inca-form)
5. run all apply pattern scripts, and assemble OOPS.
6. show bossman, ask for fixes



100. Write a paper about the new stress ontology!

In [22]:
# this is a reset-  if you accidently modify the dataframe in place, and need it back, you can revert to the OG_df.  
# patterndf = OG_df.copy()

In [23]:
# 1. Move all rows with a "no:match" in either pathogen or host column into a "reject.df"
#OG df = patterndf
# There are two ways to do this:  we will time them, just for fun. # the query one didn't work.

OG_df = patterndf.copy()

# patterndf.query('pathogen_label != no:match')   #this didn't work, but seems powerful.
no_pathogen_df = patterndf[patterndf['pathogen'] == 'no:match']
patterndf = patterndf[patterndf['pathogen'] != 'no:match']

print('Rows with pathogen: {}\nRows without: {}'.format(patterndf.shape[0],no_pathogen_df.shape[0]))

Rows with pathogen: 7030
Rows without: 4722


In [24]:
# for some reason, if the pathogen was not found, pathogen_label got the "no:match",
# BUUUUUUT... if the host wasn't found, it was 'host' that got "no:match", not 'host_label'

############# THE ABOVE IS INCORRECT NOW.  I MADE A MISTAKE IN NAMING THE COLUMNS!!! ###

no_host_df = patterndf[patterndf['host'] == 'no:match']
patterndf = patterndf[patterndf['host'] != 'no:match']

print('Rows with host: {}\nRows without: {}'.format(patterndf.shape[0],no_host_df.shape[0]))
print('origianl number of rows: {}'.format(OG_df.shape[0]))

Rows with host: 6101
Rows without: 929
origianl number of rows: 11752


## Results:

The original data set was 11752 rows (diseases)
4722 of them did not have a pathogen match

# 6101 diseases!

Write the resulting dataframe as a TSV.

In [None]:
# write a .tsv with the new dataframe:  Don't need to do this here anymore.  See next cell
# patterndf.to_csv(path_or_buf='APS_Scrape_to_DP.tsv', sep='\t', na_rep='', float_format=None, columns=None, header=True, index=True, index_label=None, mode='w', encoding=None, compression=None, quoting=None, quotechar='"', line_terminator='\n', chunksize=None, tupleize_cols=False, date_format=None, doublequote=True, escapechar=None, decimal='.')

In [25]:
## 2. add the raw name column back -  Done.  Just kept original raw name as 'iri_label'
#  2a. run the df through the renamer, and replace bad names 

for index, row in patterndf.iterrows():
    row['iri_label'] = disease_name_formatter(row['iri_label'],row['host_label'], row['pathogen_label'],row['plantstructure_label'])

#  2b. re-write the .tsv file
patterndf.to_csv(path_or_buf='APS_Scrape_to_DP.tsv', sep='\t', na_rep='', float_format=None, columns=None, header=True, index=True, index_label=None, mode='w', encoding=None, compression=None, quoting=None, quotechar='"', line_terminator='\n', chunksize=None, tupleize_cols=False, date_format=None, doublequote=True, escapechar=None, decimal='.')

B - Fusarium tricinctum of Dianthus caryophylium L.
B - Erwinia spp. of Cocos nucifera L.
B - Phytophthora palmivora of Cocos nucifera L.
D - P. heveae A. Thompson Phytophthora heveae A. Thompson of Cocos nucifera L.
D - P. katsurae Ko & Chang Phytophthora katsurae Ko & Chang of Cocos nucifera L.
D - P. nicotianae Breda de Haan Phytophthora nicotianae Breda de Haan of Cocos nucifera L.
D - F. solani Fusarium solani of Cocos nucifera L.
B - Graphium sp. of Cocos nucifera L.
B - Not known, but possibly vectored by the insects Sogatella kolophon and S. yubana of Cocos nucifera L.
B - genus Nepovirus, Tomato ringspot virus of Prunus persica
B - genus Nepovirus, Tomato ringspot virus of Prunus persica var. nucipersica
B - Tomato spotted wilt virus of Arachis hypogaea L.
B - Pseudomonas syringae pv. syringae van Hall of Prunus avium
B - Pseudomonas syringae pv. syringae van Hall of Prunus cerasus
B - Pleurotus ostreatus of Pistacia vera L.
B - Schizophyllum commune Fr.:Fr. of Pistacia vera L

B - Meloidogyne arenaria of Prunus armeniaca L.
D - M. incognita Meloidogyne incognita of Prunus armeniaca L.
D - M. javanica Meloidogyne javanica of Prunus armeniaca L.
B - Fusarium oxysporum of Asparagus officinalis
D - F. redolens Wollenw. Fusarium redolens Wollenw. of Asparagus officinalis
B - Gibberella fujikuroi of Asparagus officinalis
B - Gibberella fujikuroi of Asparagus officinalis
B - Zopfia rhizophila Rabenhorst of Asparagus officinalis
B - Armillaria mellea of Persea americana Miller
B - Armillaria tabescens of Persea americana Miller
B - Rhizoctonia solani KÌ_hn of Persea americana Miller
B - Thanatephorus cucumeris of Persea americana Miller
B - Fusarium sp. of Persea americana Miller
B - Pythium sp. of Persea americana Miller
B - Rosellinia bunodes of Persea americana Miller
B - Armillaria mellea of Rhododendron spp.
B - Armillaria tabescens of Rhododendron spp.
B - Cylindrocladium floridanum Sobers & C. P. Seym. of Rhododendron spp.
D -  Calonectria kyotensis Terash. o

B - Pythium spp. of Helianthus annuus
B - Pythium spp. of Helianthus tuberosus
D - P. aphanidermatum Pythium aphanidermatum of Helianthus annuus
D - P. aphanidermatum Pythium aphanidermatum of Helianthus tuberosus
D - P. debaryanum Auct. non R. Hesse Pythium debaryanum Auct. non R. Hesse of Helianthus annuus
D - P. debaryanum Auct. non R. Hesse Pythium debaryanum Auct. non R. Hesse of Helianthus tuberosus
D - P. irregulare Buissman Pythium irregulare Buissman of Helianthus annuus
D - P. irregulare Buissman Pythium irregulare Buissman of Helianthus tuberosus
D -  Athelia rolfsii of Helianthus annuus
D -  Athelia rolfsii of Helianthus tuberosus
B - Meloidogyne arenaria of Helianthus annuus
B - Meloidogyne arenaria of Helianthus tuberosus
D - M. incognita Meloidogyne incognita of Helianthus annuus
D - M. incognita Meloidogyne incognita of Helianthus tuberosus
D - M. javanica Meloidogyne javanica of Helianthus annuus
D - M. javanica Meloidogyne javanica of Helianthus tuberosus
B - Armillar

AttributeError: 'float' object has no attribute 'split'

In [None]:
# elif host in short_name:
#      24         new_name = '{pathogen} {aps_name}'.format(pathogen=pathogenname,aps_name=short_name)
#      25         print('C -', new_name)

# TypeError: 'in <string>' requires string as left operand, not float

# got the above errror-  this means some of the host names are type:float... this is bad, so I need to do more filtering on the dataframe
# filter the *_label columns for non-strings.
# df3 = patterndf[isinstance(patterndf['host_label'], str)]
for index, row in patterndf.iterrows():
    x = isinstance(row['host_label'], str)
    if not x:
        print(index, row['host_label'], row['host'])
#     print( x)
#     print(type(row['host_label']))
# print(df3.shape,patterndf.shape)
print(type(patterndf['host_label']))
# isinstance(obj_to_test, str)

In [None]:
patterndf.isnull().sum()

In [None]:
print(patterndf['iri_label'])

In [None]:
patterndf.to_csv(path_or_buf='APS_Scrape_to_DP.tsv', sep='\t', na_rep='', float_format=None, columns=None, header=True, index=True, index_label=None, mode='w', encoding=None, compression=None, quoting=None, quotechar='"', line_terminator='\n', chunksize=None, tupleize_cols=False, date_format=None, doublequote=True, escapechar=None, decimal='.')