## Packages

In [2]:
# Import all necesssary modules
import pandas as pd
import swifter
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
import re
import os
from ete3 import NCBITaxa
from pprint import pprint
from tqdm.notebook import tqdm_notebook, tqdm
# import warnings
# warnings.filterwarnings("ignore", category=UserWarning)
from zoonosis_helper_functions import *

### Configure Progress bar and Modin Pandas Engine

In [3]:
tqdm.pandas(desc='Processing')
os.environ["MODIN_ENGINE"] = "ray"

## Data Processing

In [4]:
# Load dataset downloaded from Uniprot
df = pd.read_table('../data/uniprot-keyword Virus+entry+into+host+cell+[KW-1160] +fragment no.tab.gz')

In [5]:
df.shape

(358333, 9)

In [6]:
df.sample(3)

Unnamed: 0,Entry,Entry name,Status,Protein names,Organism,Length,Taxonomic lineage IDs,Taxonomic lineage (SPECIES),Virus hosts
187842,A0A172S793,A0A172S793_9INFA,unreviewed,Nucleoprotein (Nucleocapsid protein) (Protein N),Influenza A virus (A/Alaska/96/2015(H3N2)),498,1823807,Influenza A virus,
306927,A0A3G5PFR5,A0A3G5PFR5_9HIV1,unreviewed,Envelope glycoprotein gp160 (Env polyprotein) ...,Human immunodeficiency virus 1,867,11676,Human immunodeficiency virus 1,Homo sapiens (Human) [TaxID: 9606]
319781,Q3ZJ94,Q3ZJ94_9HIV1,unreviewed,Envelope glycoprotein gp160 (Env polyprotein) ...,Human immunodeficiency virus 1,861,11676,Human immunodeficiency virus 1,Homo sapiens (Human) [TaxID: 9606]


In [7]:
# Check for missing host names
print(df[df['Virus hosts'].isnull()].shape)
df[df['Virus hosts'].isnull()].sample(3)

(237573, 9)


Unnamed: 0,Entry,Entry name,Status,Protein names,Organism,Length,Taxonomic lineage IDs,Taxonomic lineage (SPECIES),Virus hosts
214496,A0A0U4TXV9,A0A0U4TXV9_9INFA,unreviewed,Nucleoprotein (Nucleocapsid protein) (Protein N),Influenza A virus (A/swine/Manitoba/D0094/2012...,498,1761567,Influenza A virus,
118468,C6KMU6,C6KMU6_9INFA,unreviewed,Hemagglutinin [Cleaved into: Hemagglutinin HA1...,Influenza A virus (A/Shiga/2/2009(H1N1)),566,653950,Influenza A virus,
53664,A0A514CZT4,A0A514CZT4_9VIRU,unreviewed,Uncharacterized protein,Leviviridae sp.,422,2027243,Leviviridae sp.,


In [8]:
df[df['Status'] == 'reviewed']['Organism'].nunique()

1518

In [9]:
df[df['Status'] == 'reviewed']['Virus hosts'].nunique()

321

In [10]:
df['Organism'].nunique()

100216

In [11]:
df['Virus hosts'].nunique()

373

In [12]:
df['Taxonomic lineage IDs'].nunique()

100216

In [13]:
df[df['Status'] == 'unreviewed']['Taxonomic lineage IDs'].nunique()

99095

In [14]:
df[df['Status'] == 'unreviewed']['Organism'].nunique()

99095

In [15]:
df[df['Status'] == 'unreviewed']['Virus hosts'].nunique()

200

In [16]:
## Replace N/A columns with an empty string... prevents errors with column wide string operations

df['Virus hosts'] = np.where(df['Virus hosts'].isnull(), '',df['Virus hosts'])

In [17]:
df.sample(5)

Unnamed: 0,Entry,Entry name,Status,Protein names,Organism,Length,Taxonomic lineage IDs,Taxonomic lineage (SPECIES),Virus hosts
79672,A0A2K9VMX7,A0A2K9VMX7_9CAUD,unreviewed,Tail sheath protein,Shigella phage Sf21,659,2024308,Shigella virus Sf21,
115923,A0A097PGT4,A0A097PGT4_9INFA,unreviewed,Hemagglutinin [Cleaved into: Hemagglutinin HA1...,Influenza A virus (A/Virginia/13/2014(H3)),566,1561675,Influenza A virus,
91741,C7DN73,C7DN73_HBV,unreviewed,Capsid protein (Core antigen) (Core protein) (...,Hepatitis B virus (HBV),212,10407,Hepatitis B virus (HBV),Homo sapiens (Human) [TaxID: 9606]; Pan troglo...
223671,Q0A2A6,Q0A2A6_9INFA,unreviewed,Nucleoprotein (Nucleocapsid protein) (Protein N),Influenza A virus (A/ruddy turnstone/Delaware/...,498,402531,Influenza A virus,
327064,A0A410SW33,A0A410SW33_9HIV1,unreviewed,Envelope glycoprotein gp160 (Env polyprotein) ...,Human immunodeficiency virus 1,863,11676,Human immunodeficiency virus 1,Homo sapiens (Human) [TaxID: 9606]


In [18]:
def join_names(df, col_name: str):
    df[col_name] = df[col_name].str.split('; ').apply(set).apply('; '.join)
    return df

In [19]:
# df['Virus hosts'] = df['Virus hosts'].str.split('; ')
# df['Virus hosts'] = df['Virus hosts'].swifter.progress_bar(enable=True, desc='Removing duplicate host names').apply(set)
# df['Virus hosts'] = df['Virus hosts'].swifter.progress_bar(enable=True, desc='Joining host names list').apply('; '.join)

# df['Protein names'] = df['Protein names'].str.split('; ')
# df['Protein names'] = df['Protein names'].swifter.progress_bar(enable=True, desc='Removing duplicate protein names').apply(set)
# df['Protein names'] = df['Protein names'].swifter.progress_bar(enable=True, desc='Joining protein names list').apply('; '.join)

# df['Organism'] = df['Organism'].str.split('; ')
# df['Organism'] = df['Organism'].swifter.progress_bar(enable=True, desc='Removing duplicate organism names').apply(set)
# df['Organism'] = df['Organism'].swifter.progress_bar(enable=True, desc='Joining organism names list').apply('; '.join)

In [20]:
df = join_names(df, 'Virus hosts')
df = join_names(df, 'Protein names')
df = join_names(df, 'Organism')

In [21]:
df.sample(3)

Unnamed: 0,Entry,Entry name,Status,Protein names,Organism,Length,Taxonomic lineage IDs,Taxonomic lineage (SPECIES),Virus hosts
163789,X2EPX7,X2EPX7_9INFA,unreviewed,Hemagglutinin HA2 chain]; Hemagglutinin [Cleav...,Influenza A virus (A/Chicago/YGA_04110/2012(H3...,566,1427810,Influenza A virus,
190295,Q10TT6,Q10TT6_9INFA,unreviewed,Nucleoprotein (Nucleocapsid protein) (Protein N),Influenza A virus (A/Wellington/80/2002(H3N2)),498,383179,Influenza A virus,
162924,F8IX17,F8IX17_9INFA,unreviewed,Hemagglutinin HA2 chain]; Hemagglutinin [Cleav...,Influenza A virus (A/peaceful dove/Denpasar/BB...,568,1042013,Influenza A virus,


In [22]:
df[~df['Virus hosts'].isnull()].sample(3)

Unnamed: 0,Entry,Entry name,Status,Protein names,Organism,Length,Taxonomic lineage IDs,Taxonomic lineage (SPECIES),Virus hosts
258631,A0A2H4ZAX9,A0A2H4ZAX9_9VIRU,unreviewed,Pre-glycoprotein polyprotein GP complex (Pre-G...,Wenzhou mammarenavirus,492,1588685,Wenzhou mammarenavirus,
236,A0A1D7SGR6,A0A1D7SGR6_9CAUD,unreviewed,Portal protein (gp20),Cyanophage S-RIM44,559,1278485,Synechococcus virus SRIM44,
267479,A0A0G2T8B4,A0A0G2T8B4_HCMV,unreviewed,Envelope glycoprotein H (gH),Human cytomegalovirus (HHV-5) (Human herpesvir...,743,10359,Human cytomegalovirus (HHV-5) (Human herpesvir...,Homo sapiens (Human) [TaxID: 9606]


In [23]:
# Species ID from organism ID
df['Species taxonomic ID'] = (df['Taxonomic lineage IDs']
                              .swifter.progress_bar(enable=True, desc='Getting Viruses taxonomic IDs')
                              .apply(getRankID, rank='species'))

Getting Viruses taxonomic IDs:   0%|          | 0/16 [00:00<?, ?it/s]

In [24]:
dff = df[['Entry', 'Species taxonomic ID']].copy()

In [25]:
df.sample(3)

Unnamed: 0,Entry,Entry name,Status,Protein names,Organism,Length,Taxonomic lineage IDs,Taxonomic lineage (SPECIES),Virus hosts,Species taxonomic ID
349607,A0A366YIF7,A0A366YIF7_ECOLX,unreviewed,DUF4102 domain-containing protein (Tyrosine-ty...,Escherichia coli,400,562,Escherichia coli,,562.0
239608,A0A0C4ZQY7,A0A0C4ZQY7_9INFA,unreviewed,Hemagglutinin HA2 chain]; Hemagglutinin [Cleav...,Influenza A virus (A/chicken/Dongguan/3917/201...,560,1593618,Influenza A virus,,11320.0
261057,F0U0G1,F0U0G1_9INFA,unreviewed,Hemagglutinin HA2 chain]; Hemagglutinin [Cleav...,Influenza A virus (A/swine/Hong Kong/248/2009(...,566,991632,Influenza A virus,,11320.0


In [26]:
df[df['Species taxonomic ID'].isnull()]

Unnamed: 0,Entry,Entry name,Status,Protein names,Organism,Length,Taxonomic lineage IDs,Taxonomic lineage (SPECIES),Virus hosts,Species taxonomic ID
11119,O41892,O41892_PEGIA,unreviewed,Genome polyprotein (EC 3.6.4.13),Pegivirus A (isolate Saguinus labiatus/-/GBV-A...,2967,1554494,Pegivirus A,Leontocebus nigricollis (Black-mantled tamarin...,
18609,Q0NCQ7,Q0NCQ7_VAR65,unreviewed,Protein G3,Variola virus (isolate Human/South Africa/102/...,111,587201,Variola virus,Homo sapiens (Human) [TaxID: 9606],
30670,Q0NG66,Q0NG66_VAR46,unreviewed,Myristylprotein,Variola virus (isolate Human/Japan/Yamada MS-2...,340,587202,Variola virus,Homo sapiens (Human) [TaxID: 9606],
84779,Q0NCI5,Q0NCI5_VAR65,unreviewed,Fusion protein,Variola virus (isolate Human/South Africa/102/...,146,587201,Variola virus,Homo sapiens (Human) [TaxID: 9606],
84815,Q0NCJ6,Q0NCJ6_VAR65,unreviewed,Virion membrane protein A21,Variola virus (isolate Human/South Africa/102/...,117,587201,Variola virus,Homo sapiens (Human) [TaxID: 9606],
87500,B2BX77,B2BX77_ASPVR,unreviewed,Hemagglutinin-neuraminidase protein,Atlantic salmon paramyxovirus (isolate -/Norwa...,576,1283346,Salmon aquaparamyxovirus,Salmo salar (Atlantic salmon) [TaxID: 8030],
98638,Q0NCN6,Q0NCN6_VAR65,unreviewed,IMV membrane protein,Variola virus (isolate Human/South Africa/102/...,189,587201,Variola virus,Homo sapiens (Human) [TaxID: 9606],
99867,Q0NCP9,Q0NCP9_VAR65,unreviewed,Myristylprotein,Variola virus (isolate Human/South Africa/102/...,340,587201,Variola virus,Homo sapiens (Human) [TaxID: 9606],
115389,Q0NG61,Q0NG61_VAR46,unreviewed,Protein L5,Variola virus (isolate Human/Japan/Yamada MS-2...,128,587202,Variola virus,Homo sapiens (Human) [TaxID: 9606],
116649,Q0NG40,Q0NG40_VAR46,unreviewed,Carbonic anhydrase homolog (Cell surface-bindi...,Variola virus (isolate Human/Japan/Yamada MS-2...,304,587202,Variola virus,Homo sapiens (Human) [TaxID: 9606],


In [27]:
# Get the species name of the earlier unidentified taxonomic IDs
idx_species_name = df.columns.get_loc('Taxonomic lineage (SPECIES)')
idx_organism_id = df.columns.get_loc('Species taxonomic ID')

for row in tqdm_notebook(range(len(df)), desc='Getting species ID from organism name'):
    if np.isnan(df.iat[row, idx_organism_id]):
        df.iat[row, idx_organism_id] = getIDfromName(df.iat[row, idx_species_name])

Getting species ID from organism name:   0%|          | 0/358333 [00:00<?, ?it/s]

In [28]:
df[df['Species taxonomic ID'].isnull()]

Unnamed: 0,Entry,Entry name,Status,Protein names,Organism,Length,Taxonomic lineage IDs,Taxonomic lineage (SPECIES),Virus hosts,Species taxonomic ID


In [29]:
df['Species taxonomic ID'] = df['Species taxonomic ID'].apply(int)

In [30]:
df.shape

(358333, 10)

In [31]:
df = (df.drop(['Status','Taxonomic lineage IDs'], axis=1)
      .groupby('Species taxonomic ID', as_index=False)
      .agg({'Virus hosts':set, 'Organism':set,
            'Protein names':set, 'Taxonomic lineage (SPECIES)':'first'}))

In [32]:
df['Virus hosts'] = df['Virus hosts'].str.join('; ')
df['Organism'] = df['Organism'].str.join('; ')
df['Protein names'] = df['Protein names'].str.join('; ')

In [33]:
df.sample(5)

Unnamed: 0,Species taxonomic ID,Virus hosts,Organism,Protein names,Taxonomic lineage (SPECIES)
9958,2060955,,Staphylococcus phage phiSa2wa_st80,Core-binding (CB) domain-containing protein; P...,Staphylococcus phage phiSa2wa_st80
4281,1134435,,Thauera humireducens,Alpha/beta hydrolase; Integrase; Uncharacteriz...,Thauera humireducens
13487,2652404,,Mycobacterium phage Nitzel,Integrase,Mycobacterium phage Nitzel
7999,1920123,,Sphingomonas sp. S-NIH.Pt1_0416,Integrase; DUF4102 domain-containing protein,Sphingomonas sp. S-NIH.Pt1_0416
6097,1647534,,Moraxella phage Mcat19,Integrase; Portal protein,Moraxella phage Mcat19


In [34]:
df.shape

(15109, 5)

In [35]:
df['Species name'] = (df.drop('Taxonomic lineage (SPECIES)', axis=1)
                      .swifter.progress_bar(enable=True, desc='Getting Species name')
                      .apply(lambda x: getRankName(x['Species taxonomic ID'], 
                                                   rank='species'), axis=1))

[2m[33m(pid=raylet)[0m E0211 22:02:52.214976912   19236 socket_utils_common_posix.cc:224] check for SO_REUSEPORT: {"created":"@1644609772.214956520","description":"SO_REUSEPORT unavailable on compiling system","file":"src/core/lib/iomgr/socket_utils_common_posix.cc","file_line":193}


In [36]:
df['Species superkingdom'] = df['Species taxonomic ID'].progress_apply(getRankName, rank='superkingdom')

Processing:   0%|          | 0/15109 [00:00<?, ?it/s]

In [37]:
df['Species family'] = df['Species taxonomic ID'].progress_apply(getRankName, rank='family')

Processing:   0%|          | 0/15109 [00:00<?, ?it/s]

In [38]:
df['Species superkingdom'].unique()

array(['Bacteria', 'Archaea', 'Eukaryota', 'Viruses', 'IncJ plasmid R391',
       'uncultured organism', 'metagenome', 'Plasmid pFKY1',
       'human gut metagenome', 'marine metagenome',
       'mine drainage metagenome', 'marine sediment metagenome',
       'freshwater metagenome',
       'uncultured marine microorganism HF4000_005I08',
       'wastewater metagenome', 'hydrothermal vent metagenome',
       'sediment metagenome', 'viral metagenome', 'biofilter metagenome',
       'bioreactor metagenome', 'anaerobic digester metagenome',
       'plant metagenome', 'invertebrate metagenome'], dtype=object)

In [39]:
df = df[df['Species superkingdom'] == 'Viruses']

In [40]:
df.sample(5)

Unnamed: 0,Species taxonomic ID,Virus hosts,Organism,Protein names,Taxonomic lineage (SPECIES),Species name,Species superkingdom,Species family
7597,1891774,Saimiri boliviensis boliviensis (Bolivian squi...,Squirrel monkey polyomavirus,Minor capsid protein VP2 (Minor structural pro...,Saimiri boliviensis polyomavirus 1,Saimiri boliviensis polyomavirus 1,Viruses,Polyomaviridae
14964,2750850,,Escherichia phage vB_EcoM_Lutter,Tail sheath monomer; Portal protein (gp20); Lo...,Escherichia phage vB_EcoM_Lutter,Escherichia phage vB_EcoM_Lutter,Viruses,Myoviridae
2293,280701,,Pseudomonas virus F116,Integrase,Pseudomonas virus F116,Pseudomonas virus F116,Viruses,Podoviridae
1944,199587,,Hollyhock leaf crumple virus-[Cairo],Capsid protein (Coat protein),Hollyhock leaf crumple virus,Hollyhock leaf crumple virus,Viruses,Geminiviridae
12358,2560280,,Acinetobacter phage vB_ApiP_P2,Head-to-tail joining protein,Acinetobacter virus P2,Acinetobacter virus P2,Viruses,Autographiviridae


In [41]:
df.drop(['Taxonomic lineage (SPECIES)'], axis=1, inplace=True)

In [42]:
df['Virus hosts'] = np.where(df['Virus hosts']=='', np.nan, df['Virus hosts'])

In [43]:
df[df['Virus hosts'].isnull()].sample(5)

Unnamed: 0,Species taxonomic ID,Virus hosts,Organism,Protein names,Species name,Species superkingdom,Species family
12550,2560734,,Salmonella phage STP4-a,Tail sheath protein; Portal protein (gp20); Lo...,Salmonella virus STP4a,Viruses,Myoviridae
14118,2713246,,Bifidobacterium phage BigBern1,Integrase,Bifidobacterium phage BigBern1,Viruses,Siphoviridae
13121,2591403,,Achromobacter phage Motura,Tail connector protein; Long tail fiber protei...,Achromobacter phage Motura,Viruses,Myoviridae
8990,1993867,,Mycobacterium phage Trixie,Integrase,Mycobacterium virus Trixie,Viruses,Siphoviridae
11372,2461378,,Human adenovirus 86,Hexon protein (CP-H) (Protein II); Fiber; Hexo...,Human adenovirus 86,Viruses,Adenoviridae


In [44]:
df.drop('Organism', axis=1, inplace=True)

In [45]:
# List of viruses which do not have assigned hosts in the data
noHostViruses = df[df['Virus hosts'].isnull()]['Species name'].unique().tolist()

In [46]:
# Create independent dataframe of viruses with no assigned host and simltaneously identify the same viruses from the data 
# whcih already have assigned hosts and assign host names based on those. 
df_na_hosts = df[(~df['Virus hosts'].isnull()) & (df['Species name'].isin(noHostViruses))][['Species name', 'Virus hosts']]
df_na_hosts = df_na_hosts.groupby('Species name')['Virus hosts'].apply(list)
df_na_hosts = df_na_hosts.reset_index(name='Viral hosts nw')

In [47]:
# # Previous code reurns a list for multiple host so this code melts the lists into regular string entries
df_na_hosts['Viral hosts nw'] = (df_na_hosts['Viral hosts nw']
                                 .swifter.progress_bar(desc='Joining host names list', enable=True)
                                 .apply('; '.join))

In [48]:
# # updates the viruses hosts
df_naa = (df[df['Virus hosts'].isnull()]
          .merge(df_na_hosts, on='Species name', how='left')
          .drop('Virus hosts', axis=1)
          .rename({'Viral hosts nw':'Virus hosts'}, axis=1))

In [49]:
# Creates independant dataset with viruses which have hosts
df_notna = df[~df['Virus hosts'].isnull()]

In [50]:
# merges the updated virus hosts dataset with the dataset with viruses which have hosts
df = df_naa.append(df_notna)

In [51]:
df.shape

(8113, 6)

In [52]:
df.sample(5)

Unnamed: 0,Species taxonomic ID,Protein names,Species name,Species superkingdom,Species family,Virus hosts
6762,2732027,Portal (Connector) protein,Salmonella virus S124,Viruses,Demerecviridae,
5226,2548226,Integrase,Streptococcus phage Javan512,Viruses,Siphoviridae,
3623,2041500,Portal protein,Streptococcus phage A0,Viruses,Siphoviridae,
1844,1562064,Genome polyprotein (EC 3.4.22.28) (P1C) (P1D) ...,Norway rat hunnivirus,Viruses,Picornaviridae,
6841,2733352,Collar protein,Pseudomonas virus DL62,Viruses,Autographiviridae,


In [53]:
print(df[df['Virus hosts'].isnull()].shape)
df[df['Virus hosts'].isnull()].sample(3)

(7512, 6)


Unnamed: 0,Species taxonomic ID,Protein names,Species name,Species superkingdom,Species family,Virus hosts
2627,1913111,Gene transfer agent portal protein,Pseudoalteromonas phage PHS3,Viruses,Siphoviridae,
3775,2060136,Major capsid protein L1; Minor capsid protein L2,Human papillomavirus type 212,Viruses,Papillomaviridae,
401,255431,Putative portal protein,Klebsiella phage phiKO2,Viruses,Siphoviridae,


In [54]:
df['Virus hosts'] = np.where(df['Virus hosts'].isnull(), '',df['Virus hosts'])

In [55]:
df.sample(3)

Unnamed: 0,Species taxonomic ID,Protein names,Species name,Species superkingdom,Species family,Virus hosts
2565,1897751,Integrase; Tape measure protein (TMP); Portal ...,Vibrio phage pVa-1,Viruses,Siphoviridae,
2493,329853,Maturation protein A (MP) (Assembly protein) (...,Escherichia virus BZ13,Viruses,Leviviridae,Escherichia coli [TaxID: 562]
59,39103,Integrase; Portal protein,Lactobacillus phage PL-1,Viruses,Siphoviridae,


In [56]:
df = mergeRows(df, 'Species taxonomic ID','Virus hosts')

In [57]:
df[(df['Species name'].str.contains('Influenza A virus')) & (df['Virus hosts'] != '')]

Unnamed: 0,Species taxonomic ID,Virus hosts,Protein names,Species name,Species superkingdom,Species family
161,11320,; Sus scrofa (Pig) [TaxID: 9823]; Homo sapiens...,Nucleoprotein (Nucleocapsid protein) (Protein ...,Influenza A virus,Viruses,Orthomyxoviridae


In [58]:
df.sample(3)

Unnamed: 0,Species taxonomic ID,Virus hosts,Protein names,Species name,Species superkingdom,Species family
5712,2548091,,Integrase,Streptococcus phage Javan278,Viruses,Siphoviridae
2142,1458842,,Portal protein,Geobacillus phage GBK2,Viruses,Siphoviridae
933,338079,; Homo sapiens (Human) [TaxID: 9606]; Chloroce...,p3]; Gag polyprotein (Pr71Gag) [Cleaved into: ...,African green monkey simian foamy virus,Viruses,Retroviridae


In [59]:
dfna = df[df['Virus hosts'] == '']
df = df[~(df['Virus hosts'] == '')]

In [60]:
dfna.shape

(7512, 6)

In [61]:
df.shape

(601, 6)

## Updating host names from external sources

In [62]:
df2 = pd.read_csv('../data/sequences.csv')
df2.shape

(2599675, 3)

In [63]:
df2.sample(2)

Unnamed: 0,Species,Molecule_type,Host
1048233,Human immunodeficiency virus 1,ssRNA-RT,Homo sapiens
2183509,Rotavirus A,dsRNA,Equus caballus


In [64]:
df2.drop_duplicates(inplace=True)
df2.shape

(10956, 3)

In [65]:
df2['Species ID'], df2['Host ID'] = df2['Species'].progress_apply(getIDfromName), df2['Host'].progress_apply(getIDfromName)

Processing:   0%|          | 0/10956 [00:00<?, ?it/s]

'Ungulate copiparvovirus 5'
'Feline pegivirus JP03-2471'
'Feline pegivirus JP03-3208'
'Mamastrovirus HMU-1'
'Hedgehog coronavirus'
'Cingulatid gammaherpesvirus 1'
'Mammarenavirus AnRB3214'
'Bat SARS-like coronavirus Khosta-1'
'Bat SARS-like coronavirus Khosta-2'
'Ungulate copiparvovirus 5'
'Torque teno mustelid virus 2'
'Feline stool-associated circular virus'
'Jingmen Rhinolophus sinicus hepacivirus 1'
'Wenzhou Apodemus agrarius hepacivirus 1'
'Longquan Rhinolophus sinicus hepacivirus 1'
'Longquan Niviventer niviventer hepacivirus 1'
'Longquan Niviventer fulvescens hepacivirus 1'
'Wenzhou Suncus murinus hepacivirus 1'
'Wufeng Rhinolophus sinicus hepacivirus 1'
'Wufeng Niviventer niviventer hepacivirus 1'
'Wufeng Niviventer fulvescens hepacivirus 1'
'Wenzhou Rattus norvegicus pegivirus 1'
'Wenzhou Rattus tanezumi pegivirus 1'
'Longquan Rhinolophus pearsonii pegivirus 1'
'Longquan Rhinolophus sinicus pegivirus 1'
'Longquan Niviventer niviventer pegivirus 1'
'Longquan Niviventer fulvesce

Processing:   0%|          | 0/10956 [00:00<?, ?it/s]

'Bolomys lasiurus'
'Bolomys lasiurus'
'Pipistrellus sp. pipistrellus/pygmaeus AO-2021'
'Pipistrellus musciculus'
'Funisciurus bayonii'
'Rattus sp. r3 YH-2020'
'Rattus sp. r3 YH-2020'
'Soricidae sp. YH-2020'
'Rattus sp. r3 YH-2020'
'Rattus sp. r3 YH-2020'
'Acomys selousi'
'Rhinolophus smithersi'
'Alouatta sp.'
'Pipistrellys abramus'
'Sturnira angeli'
'Sturnira angeli'
'Hipposideros curtus'
'Pipistrellus inexspectatus'
'Dobsonia exoleta'
'Mops demonstrator'
'Pipistrellus musciculus'
'Mus sp. TG-2020'
'Murinae gen. sp. TG-2020'
'Vespadelus baverstocki'
'Ozimops sp. DP-2019'
'Scoterepens balstoni'
'Neoromicia capensis'
'Neoromicia capensis'
'Mus sp. CL-2019'
'Mus sp. CL-2019'
'Neoromicia capensis'
'Bolomys lasiurus'
'Bolomys lasiurus'
'Bolomys lasiurus'
'Neoromicia capensis'
'Neoromicia capensis'
'Neoromicia capensis'
'Neoromicia capensis'
'Bolomys lasiurus'
'Pipistrellus inexspectatus'
'Chiroptera sp.'
'Chaerephon aloysiisabaudiae'
'Chiroptera sp.'
'Paradoxurus musangus'
'Neoromicia capen

In [66]:
df2.dropna(inplace=True)
df2['Species ID'], df2['Host ID'] = df2['Species ID'].astype(int), df2['Host ID'].astype(int)
df2.shape

(10802, 5)

In [67]:
df2['Host name'] = df2.progress_apply(lambda x: nameMerger(x['Host'], x['Host ID']), axis=1)
# Remove Host and Host ID columns as they have been merged and are no longer needed
df2.drop(['Host', 'Host ID'], axis=1, inplace=True)

Processing:   0%|          | 0/10802 [00:00<?, ?it/s]

In [68]:
df2['Species ID'] = df2['Species ID'].progress_apply(getRankID, rank='species')

Processing:   0%|          | 0/10802 [00:00<?, ?it/s]

In [69]:
## Create a copy for later use
dfff = df2.copy()

In [70]:
# Add host names
df_na_hosts = AggregateHosts(df2,'Species ID', 'Host name')
dfna = dfna.merge(df_na_hosts, left_on='Species taxonomic ID', right_on='Species ID', how='left')
dfna = dfna.drop(['Virus hosts', 'Species ID'], axis=1).rename({'Host name':'Virus hosts'}, axis=1)
dfna = UpdateHosts(dfna, df_na_hosts, 'Species taxonomic ID', 'Species ID')
df, dfna = UpdateMain(df, dfna)
df = mergeRows(df, 'Species taxonomic ID', 'Virus hosts')

In [71]:
dfna.shape

(6476, 6)

In [72]:
df.shape

(1637, 6)

In [73]:
df2 = pd.read_table('../data/virushostdb.tsv')
df2.head(3)

Unnamed: 0,virus tax id,virus name,virus lineage,refseq id,KEGG GENOME,KEGG DISEASE,DISEASE,host tax id,host name,host lineage,pmid,evidence,sample type,source organism
0,438782,Abaca bunchy top virus,Viruses; Monodnaviria; Shotokuvirae; Cressdnav...,"NC_010314, NC_010315, NC_010316, NC_010317, NC...",,,,46838.0,Musa sp.,Eukaryota; Viridiplantae; Streptophyta; Strept...,17978886.0,"Literature, NCBI Virus, RefSeq",,
1,438782,Abaca bunchy top virus,Viruses; Monodnaviria; Shotokuvirae; Cressdnav...,"NC_010314, NC_010315, NC_010316, NC_010317, NC...",,,,214697.0,Musa acuminata AAA Group,Eukaryota; Viridiplantae; Streptophyta; Strept...,17978886.0,Literature,,
2,1241371,Abalone herpesvirus Victoria/AUS/2009,Viruses; Duplodnaviria; Heunggongvirae; Peplov...,NC_018874,,,,6451.0,Haliotidae,Eukaryota; Opisthokonta; Metazoa; Eumetazoa; B...,,UniProt,,


In [74]:
df2 = df2[['virus tax id', 'virus name', 'host tax id', 'host name']].copy()
df2.drop_duplicates(inplace=True)
print(df2.shape)
df2.head()

(16612, 4)


Unnamed: 0,virus tax id,virus name,host tax id,host name
0,438782,Abaca bunchy top virus,46838.0,Musa sp.
1,438782,Abaca bunchy top virus,214697.0,Musa acuminata AAA Group
2,1241371,Abalone herpesvirus Victoria/AUS/2009,6451.0,Haliotidae
3,1241371,Abalone herpesvirus Victoria/AUS/2009,36100.0,Haliotis rubra
4,491893,Abalone shriveling syndrome-associated virus,37770.0,Haliotis diversicolor aquatilis


In [75]:
df2[df2['host tax id'].isnull()]

Unnamed: 0,virus tax id,virus name,host tax id,host name
1236,2662138,Bacteriophage Phobos,,
3750,1131416,Cucurbit mild mosaic virus,,
15925,1888308,Wabat virus,,


In [76]:
df2.dropna(inplace=True)

In [77]:
df2['host tax id'] = df2['host tax id'].astype(int)
df2.head()

Unnamed: 0,virus tax id,virus name,host tax id,host name
0,438782,Abaca bunchy top virus,46838,Musa sp.
1,438782,Abaca bunchy top virus,214697,Musa acuminata AAA Group
2,1241371,Abalone herpesvirus Victoria/AUS/2009,6451,Haliotidae
3,1241371,Abalone herpesvirus Victoria/AUS/2009,36100,Haliotis rubra
4,491893,Abalone shriveling syndrome-associated virus,37770,Haliotis diversicolor aquatilis


In [78]:
df2['Species ID'] = df2['virus tax id'].progress_apply(getRankID, rank='species')

Processing:   0%|          | 0/16609 [00:00<?, ?it/s]

In [79]:
df2['Host name'] = df2.progress_apply(lambda x: nameMerger(x['host name'], x['host tax id']), axis=1)
# Remove Host and Host ID columns as they have been merged and are no longer needed
df2.drop(['host name', 'host tax id'], axis=1, inplace=True)
df2.head()

Processing:   0%|          | 0/16609 [00:00<?, ?it/s]

Unnamed: 0,virus tax id,virus name,Species ID,Host name
0,438782,Abaca bunchy top virus,438782,Musa sp. [TaxID: 46838]
1,438782,Abaca bunchy top virus,438782,Musa acuminata AAA Group [TaxID: 214697]
2,1241371,Abalone herpesvirus Victoria/AUS/2009,1513231,Haliotidae [TaxID: 6451]
3,1241371,Abalone herpesvirus Victoria/AUS/2009,1513231,Haliotis rubra [TaxID: 36100]
4,491893,Abalone shriveling syndrome-associated virus,491893,Haliotis diversicolor aquatilis [TaxID: 37770]


In [80]:
df_na_hosts = AggregateHosts(df2,'Species ID', 'Host name')
dfna = dfna.merge(df_na_hosts, left_on='Species taxonomic ID', right_on='Species ID', how='left')
dfna = dfna.drop(['Virus hosts', 'Species ID'], axis=1).rename({'Host name':'Virus hosts'}, axis=1)
dfna = UpdateHosts(dfna, df_na_hosts, 'Species taxonomic ID', 'Species ID')
df, dfna = UpdateMain(df, dfna)
df = mergeRows(df, 'Species taxonomic ID', 'Virus hosts')

In [81]:
df.shape

(4760, 6)

In [82]:
dfna.shape

(3353, 6)

In [83]:
df2 = pd.read_csv('../data/virus_host_4rm_untitled.csv')
df2.sample(2)

Unnamed: 0,Host_name,Host_TaxId,Host Group,Virus_name,Virus_TaxId,Micobe_group,Host_common_name,Host_common_name_rev
4218,sus scrofa,9823,mammals,influenza a virus (a/swine/vechta/2623/03(h1n1)),522766,viruses,Wild boar,Pig
59574,mesocricetus auratus,10036,rodents,puumala virus,11604,viruses,Golden hamster,Rodent


In [84]:
df2 = df2[['Host_name', 'Host_TaxId', 'Virus_name', 'Virus_TaxId']].copy()
df2['Species ID'] = df2['Virus_TaxId'].progress_apply(getRankID, rank='species')
df2['Host name'] = df2.progress_apply(lambda x: nameMerger(x['Host_name'], x['Host_TaxId']), axis=1)
df2.drop(['Host_name', 'Host_TaxId'], axis=1, inplace=True)
df2.dropna(inplace=True)
df2.sample(2)

Processing:   0%|          | 0/59859 [00:00<?, ?it/s]

878474 taxid not found
555869 taxid not found
555869 taxid not found
555869 taxid not found
555869 taxid not found
555869 taxid not found
555869 taxid not found
555869 taxid not found
555869 taxid not found
555869 taxid not found
555869 taxid not found
555869 taxid not found
555869 taxid not found
555869 taxid not found
555869 taxid not found


Processing:   0%|          | 0/59859 [00:00<?, ?it/s]

Unnamed: 0,Virus_name,Virus_TaxId,Species ID,Host name
5317,influenza a virus (a/swine/hong kong/1763/2003...,991722,11320.0,sus scrofa [TaxID: 9823]
19624,influenza a virus (a/shandong-qufu/swl339/2009...,767230,11320.0,homo sapiens [TaxID: 9606]


In [85]:
df_na_hosts = AggregateHosts(df2,'Species ID', 'Host name')
dfna = dfna.merge(df_na_hosts, left_on='Species taxonomic ID', right_on='Species ID', how='left')
dfna = dfna.drop(['Virus hosts', 'Species ID'], axis=1).rename({'Host name':'Virus hosts'}, axis=1)
dfna = UpdateHosts(dfna, df_na_hosts, 'Species taxonomic ID', 'Species ID')
df, dfna = UpdateMain(df, dfna)
df = mergeRows(df, 'Species taxonomic ID', 'Virus hosts')

In [86]:
df.shape

(4766, 6)

In [87]:
dfna.shape

(3347, 6)

In [88]:
dfna.sample(2)

Unnamed: 0,Species taxonomic ID,Protein names,Species name,Species superkingdom,Species family,Virus hosts
2536,2593976,Coat protein,Isatis caulimovirus A,Viruses,Caulimoviridae,
2720,2654971,Envelope glycoprotein E1 (EC 2.7.7.48) (EC 3.6...,Bald eagle hepacivirus,Viruses,Flaviviridae,


## Further Processing

In [89]:
# Add column to discriminate viruses which contain human hosts from those which do not
df['Infects human'] = np.where(df['Virus hosts'].str.contains(r'960[56]'), 'human-true','human-false')

In [90]:
df.sample(2)

Unnamed: 0,Species taxonomic ID,Virus hosts,Protein names,Species name,Species superkingdom,Species family,Infects human
4742,2744802,Listeria monocytogenes [TaxID: 1639],Integrase; Uncharacterized protein,Listeria phage LP-HM00113468,Viruses,Siphoviridae,human-false
2620,1920781,Klebsiella pneumoniae [TaxID: 573]; Klebsiella...,Putative head-tail connector protein,Klebsiella virus K244,Viruses,Autographiviridae,human-false


In [91]:
df['Virus hosts'] = df['Virus hosts'].str.split('; ')
df['Virus hosts'] = df.progress_apply(lambda x: list(filter(None, x['Virus hosts'])), axis=1)
df['Virus hosts'] = df['Virus hosts'].progress_apply('; '.join)

Processing:   0%|          | 0/4766 [00:00<?, ?it/s]

Processing:   0%|          | 0/4766 [00:00<?, ?it/s]

In [92]:
df.sample(4)

Unnamed: 0,Species taxonomic ID,Virus hosts,Protein names,Species name,Species superkingdom,Species family,Infects human
3434,2170173,Mycolicibacterium smegmatis MC2 155 [TaxID: 24...,Tyrosine integrase; Portal protein,Mycobacterium virus Phrann,Viruses,Siphoviridae,human-false
1267,889950,Synechococcus sp. WH 7803 [TaxID: 32051],Portal protein (gp20); Uncharacterized protein,Synechococcus phage KBS-M-1A,Viruses,Myoviridae,human-false
1519,1165134,Lactococcus lactis [TaxID: 1358],Putative portal protein; Putative receptor bin...,Lactococcus virus ASCC191,Viruses,Siphoviridae,human-false
3216,2053697,Escherichia coli [TaxID: 562],Head-tail connector,Escherichia virus VEc3,Viruses,Autographiviridae,human-false


In [93]:
df[df['Infects human'] == 'human-true'].sample(4)

Unnamed: 0,Species taxonomic ID,Virus hosts,Protein names,Species name,Species superkingdom,Species family,Infects human
1159,682382,Homo sapiens [TaxID: 9606],Capsid protein VP27; Capsid protein VP34; Caps...,HMO Astrovirus A,Viruses,Astroviridae,human-true
16,10310,Homo sapiens (Human) [TaxID: 9606],Envelope glycoprotein H (gH); UL25; Envelope g...,Human alphaherpesvirus 2,Viruses,Herpesviridae,human-true
3854,2560525,Homo sapiens (Human) [TaxID: 9606],Hemagglutinin neuraminidase; Hemagglutinin-neu...,Human orthorubulavirus 2,Viruses,Paramyxoviridae,human-true
836,334209,Homo sapiens [TaxID: 9606],Major capsid protein L1; Minor capsid protein L2,Betapapillomavirus 5,Viruses,Papillomaviridae,human-true


In [94]:
df = (df.set_index(df.columns.drop('Virus hosts',1).tolist())['Virus hosts'].str.split(';', expand=True)
          .stack()
          .reset_index()
          .rename(columns={0:'Virus hosts'})
          .loc[:, df.columns]
         ).copy()

In [95]:
df.shape

(7270, 7)

In [96]:
df.sample(4)

Unnamed: 0,Species taxonomic ID,Virus hosts,Protein names,Species name,Species superkingdom,Species family,Infects human
3573,1511900,Homo sapiens (Human) [TaxID: 9606],Minor capsid protein VP1 (EC 3.1.1.4) (Coat pr...,Primate erythroparvovirus 1,Viruses,Parvoviridae,human-true
5332,2049955,Capraria biflora [TaxID: 255867],Capsid protein (Coat protein),Capraria yellow spot virus,Viruses,Geminiviridae,human-false
441,11307,Sonchus oleraceus (Common sowthistle) [TaxID:...,Spike glycoprotein,Sonchus yellow net nucleorhabdovirus,Viruses,Rhabdoviridae,human-false
1361,186805,Ovis aries [TaxID: 9940],Entry-fusion complex component (IMV membrane p...,Goatpox virus,Viruses,Poxviridae,human-false


In [97]:
df['Virus hosts ID'] = None
idx_organism = df.columns.get_loc('Virus hosts')
idx_host_id = df.columns.get_loc('Virus hosts ID')

pattern = r'(\d+)\]'
for row in range(len(df)):
    host_id = re.search(pattern, df.iat[row, idx_organism]).group()
    df.iat[row, idx_host_id] = host_id
df.head()

Unnamed: 0,Species taxonomic ID,Virus hosts,Protein names,Species name,Species superkingdom,Species family,Infects human,Virus hosts ID
0,10243,Apodemus sylvaticus (European woodmouse) [TaxI...,CPXV111 protein (J2R protein); Myristylprotein...,Cowpox virus,Viruses,Poxviridae,human-true,10129]
1,10243,Felis catus (Cat) (Felis silvestris catus) [T...,CPXV111 protein (J2R protein); Myristylprotein...,Cowpox virus,Viruses,Poxviridae,human-true,9685]
2,10243,Myodes glareolus (Bank vole) (Clethrionomys g...,CPXV111 protein (J2R protein); Myristylprotein...,Cowpox virus,Viruses,Poxviridae,human-true,447135]
3,10243,Mus musculus (Mouse) [TaxID: 10090],CPXV111 protein (J2R protein); Myristylprotein...,Cowpox virus,Viruses,Poxviridae,human-true,10090]
4,10243,Loxodonta africana (African elephant) [TaxID:...,CPXV111 protein (J2R protein); Myristylprotein...,Cowpox virus,Viruses,Poxviridae,human-true,9785]


In [98]:
df['Virus hosts ID'] = df['Virus hosts ID'].str.strip('\]')

In [99]:
df['Virus hosts ID'] = df['Virus hosts ID'].progress_apply(int)

df['Virus hosts ID'] = df['Virus hosts ID'].progress_apply(getRankID, rank='species')
df['Virus host name'] = df['Virus hosts ID'].progress_apply(getRankName, rank='species')
df['Host superkingdom'] = df['Virus hosts ID'].progress_apply(getRankName, rank='superkingdom')
df['Host kingdom'] = df['Virus hosts ID'].progress_apply(getRankName, rank='kingdom')

Processing:   0%|          | 0/7270 [00:00<?, ?it/s]

Processing:   0%|          | 0/7270 [00:00<?, ?it/s]

Processing:   0%|          | 0/7270 [00:00<?, ?it/s]

Processing:   0%|          | 0/7270 [00:00<?, ?it/s]

Processing:   0%|          | 0/7270 [00:00<?, ?it/s]

In [100]:
df[df['Virus hosts ID'].isna()]

Unnamed: 0,Species taxonomic ID,Virus hosts,Protein names,Species name,Species superkingdom,Species family,Infects human,Virus hosts ID,Virus host name,Host superkingdom,Host kingdom


In [101]:
df['Virus hosts ID'][1866]

274

In [102]:
df['Virus hosts ID'] = df['Virus hosts ID'].progress_apply(int)

Processing:   0%|          | 0/7270 [00:00<?, ?it/s]

In [103]:
df['Virus hosts'] = (df.drop('Virus hosts', axis=1)
                     .apply(lambda x: nameMerger(x['Virus host name'], x['Virus hosts ID']), axis=1))

In [104]:
df.sample(4)

Unnamed: 0,Species taxonomic ID,Virus hosts,Protein names,Species name,Species superkingdom,Species family,Infects human,Virus hosts ID,Virus host name,Host superkingdom,Host kingdom
319,11036,Didelphis marsupialis [TaxID: 9268],Assembly protein E3; Precursor of protein E3/E...,Venezuelan equine encephalitis virus,Viruses,Togaviridae,human-true,9268,Didelphis marsupialis,Eukaryota,Metazoa
4412,1914442,Microtus fortis [TaxID: 100897],Capsid protein VP27; Capsid protein VP34; Caps...,Rodent astrovirus,Viruses,Astroviridae,human-false,100897,Microtus fortis,Eukaryota,Metazoa
3128,1317107,Allamanda cathartica [TaxID: 52818],Capsid protein (Coat protein),Allamanda leaf mottle distortion virus,Viruses,Geminiviridae,human-false,52818,Allamanda cathartica,Eukaryota,Viridiplantae
6607,2695849,Escherichia [TaxID: 561],Portal vertex protein,Escherichia phage ESSI2_ev040,Viruses,Myoviridae,human-false,561,Escherichia,Bacteria,Escherichia


In [105]:
df.shape

(7270, 11)

In [106]:
df = (df.set_index(df.columns.drop('Protein names',1).tolist())['Protein names'].str.split(';', expand=True)
          .stack()
          .reset_index()
          .rename(columns={0:'Protein names'})
          .loc[:, df.columns]
         ).copy()

In [107]:
df[df['Host superkingdom'].isnull()].shape

(0, 11)

In [108]:
df['Host superkingdom'].unique()

array(['Eukaryota', 'Bacteria', 'Viruses', 'root', 'Archaea'],
      dtype=object)

In [109]:
df[df['Host superkingdom'] == 'Eukaryota'].shape

(18376, 11)

In [110]:
df[df['Host superkingdom'] == 'Viruses'].shape

(4, 11)

In [111]:
df[df['Host superkingdom'] == 'Bacteria'].shape

(4099, 11)

In [112]:
df[df['Host superkingdom'] == 'root'].shape

(38, 11)

In [113]:
df[df['Host superkingdom'] == 'Archaea'].shape

(14, 11)

In [114]:
print(df[df['Host kingdom'] == 'Metazoa'].shape)
df[df['Host kingdom'] == 'Metazoa'].sample(3)

(17312, 11)


Unnamed: 0,Species taxonomic ID,Virus hosts,Protein names,Species name,Species superkingdom,Species family,Infects human,Virus hosts ID,Virus host name,Host superkingdom,Host kingdom
876,10245,Bos taurus [TaxID: 9913],HSPV149,Vaccinia virus,Viruses,Poxviridae,human-true,9913,Bos taurus,Eukaryota,Metazoa
20365,2447804,Myotis davidii [TaxID: 225400],Minor capsid protein VP2 (Minor structural pro...,Myotis davidii polyomavirus 1,Viruses,Polyomaviridae,human-false,225400,Myotis davidii,Eukaryota,Metazoa
19173,2050020,Rattus norvegicus [TaxID: 10116],Major capsid protein L1,Rodent papillomavirus,Viruses,Papillomaviridae,human-false,10116,Rattus norvegicus,Eukaryota,Metazoa


In [115]:
df[df['Infects human'] == 'human-true'].shape

(8457, 11)

In [116]:
df[df['Infects human'] == 'human-false'].shape

(14074, 11)

In [117]:
df.sample(2)

Unnamed: 0,Species taxonomic ID,Virus hosts,Protein names,Species name,Species superkingdom,Species family,Infects human,Virus hosts ID,Virus host name,Host superkingdom,Host kingdom
1224,10273,Oryctolagus cuniculus [TaxID: 9986],M070,Myxoma virus,Viruses,Poxviridae,human-false,9986,Oryctolagus cuniculus,Eukaryota,Metazoa
10743,390437,Pueraria montana [TaxID: 132459],Capsid protein (Coat protein),Kudzu mosaic virus,Viruses,Geminiviridae,human-false,132459,Pueraria montana,Eukaryota,Viridiplantae


In [118]:
df.sample(2)

Unnamed: 0,Species taxonomic ID,Virus hosts,Protein names,Species name,Species superkingdom,Species family,Infects human,Virus hosts ID,Virus host name,Host superkingdom,Host kingdom
8844,123561,Macaca fuscata [TaxID: 9542],Integrase,Simian retrovirus 5,Viruses,Retroviridae,human-false,9542,Macaca fuscata,Eukaryota,Metazoa
5788,12637,Aedes polynesiensis [TaxID: 188700],Envelope protein E,Dengue virus,Viruses,Flaviviridae,human-true,188700,Aedes polynesiensis,Eukaryota,Metazoa


In [119]:
for column in df.columns:
    print(column, df[column].nunique())
print('Dataframe total',len(df))

Species taxonomic ID 4766
Virus hosts 1765
Protein names 2061
Species name 4766
Species superkingdom 1
Species family 80
Infects human 2
Virus hosts ID 1765
Virus host name 1756
Host superkingdom 5
Host kingdom 344
Dataframe total 22531


In [120]:
df.sample(2)

Unnamed: 0,Species taxonomic ID,Virus hosts,Protein names,Species name,Species superkingdom,Species family,Infects human,Virus hosts ID,Virus host name,Host superkingdom,Host kingdom
8176,64286,Culex perexiguus [TaxID: 943103],Non-structural protein 2A (NS2A),Usutu virus,Viruses,Flaviviridae,human-true,943103,Culex perexiguus,Eukaryota,Metazoa
5283,12110,Erinaceidae [TaxID: 9363],Protease 3C (EC 3.4.22.28) (Picornain 3C) (P3...,Foot-and-mouth disease virus,Viruses,Picornaviridae,human-false,9363,Erinaceidae,Eukaryota,Metazoa


## Restructuring the data

In [121]:
# Earlier saved data
dff.sample(2)

Unnamed: 0,Entry,Species taxonomic ID
291632,A0A6M6ATJ7,11676.0
164829,B3TKJ4,11103.0


In [122]:
dff.shape

(358333, 2)

In [140]:
# help(SeqIO.FastaIO)

In [123]:
## Load sequences
Using custom IO instead of Bio.SeqIO for 

In [141]:
from Bio import SeqRecord

In [147]:
# help(SeqRecord)

In [145]:
print(records[0].name, records[0].id)

sp|O11457|VGP_EBOG4 sp|O11457|VGP_EBOG4


In [124]:
fastaFileName = '../data/uniprot-keyword Virus+entry+into+host+cell+[KW-1160] +fragment no.fasta'

entry_seq = read_fasta(fastaFileName)

In [127]:
dff.sort_values(by='Entry', inplace=True)

objList = []
for entry, obj in entry_seq:
    objList.append(obj)

dff['Sequence'] = objList

In [128]:
dff.head()

Unnamed: 0,Entry,Species taxonomic ID,Sequence
50368,A0A009FEK4,470.0,<zoonosisHelperFunctions.FASTASeq object at 0x...
156673,A0A009G3H3,1310609.0,<zoonosisHelperFunctions.FASTASeq object at 0x...
146717,A0A009GC36,470.0,<zoonosisHelperFunctions.FASTASeq object at 0x...
146730,A0A009GCG0,470.0,<zoonosisHelperFunctions.FASTASeq object at 0x...
144753,A0A009GXT7,1310609.0,<zoonosisHelperFunctions.FASTASeq object at 0x...


In [129]:
df.drop(['Virus host name', 'Protein names', 'Species superkingdom'], axis=1, inplace=True)

In [130]:
df = df.merge(dff, on='Species taxonomic ID', how='left')

In [131]:
del dff, df2

In [132]:
df.shape

(48728179, 10)

In [133]:
df.drop_duplicates(inplace=True)

In [134]:
df.shape

(2277584, 10)

In [135]:
df['Virus hosts ID'] = df['Virus hosts ID'].apply(str)

In [136]:
df = (df.groupby('Entry', as_index=False)
       .agg({'Virus hosts':set, #'Protein':'first', 
             'Infects human':'first', 'Species name':'first',
             'Host superkingdom':set,
             'Host kingdom':set,
             'Virus hosts ID':set,
             'Species family':'first',
             'Species taxonomic ID':'first',
             'Sequence': 'first'}))

In [137]:
df['Virus hosts'] = (df['Virus hosts']
                     .swifter.progress_bar(enable=True,
                                           desc='Joining host names list')
                     .apply('; '.join))
df['Virus hosts ID'] = (df['Virus hosts ID']
                        .swifter.progress_bar(enable=True,
                                              desc='Joining host IDs')
                        .apply('; '.join))
df['Host kingdom'] = (df['Host kingdom']
                      .swifter.progress_bar(enable=True,
                                            desc='Joining host kingdom names')
                      .apply('; '.join))
df['Host superkingdom'] = (df['Host superkingdom']
                           .swifter.progress_bar(enable=True,
                                                 desc='Joining host superkingdom names')
                           .apply('; '.join))

Joining host names list:   0%|          | 0/317316 [00:00<?, ?it/s]

Joining host IDs:   0%|          | 0/317316 [00:00<?, ?it/s]

Joining host kingdom names:   0%|          | 0/317316 [00:00<?, ?it/s]

Joining host superkingdom names:   0%|          | 0/317316 [00:00<?, ?it/s]

In [138]:
df.shape

(317316, 10)

In [139]:
df['Sequence'] = df.progress_apply(lambda x: getSequenceFeatures(
    seqObj=x['Sequence'], entry=x['Entry'],
    organism=x['Species name'], status=x['Infects human']), axis=1)

Joining host superkingdom names:   0%|          | 0/317316 [00:00<?, ?it/s]

In [140]:
df['Protein'] = df['Sequence'].apply(lambda x: x.protein_name)

In [141]:
df.sample(3)

Unnamed: 0,Entry,Virus hosts,Infects human,Species name,Host superkingdom,Host kingdom,Virus hosts ID,Species family,Species taxonomic ID,Sequence,Protein
139429,A0A3S5FYS4,Panthera tigris [TaxID: 9694]; Sturnira lilium...,human-true,Influenza A virus,Eukaryota,Metazoa,9666; 27660; 9685; 9721; 9709; 9823; 8782; 969...,Orthomyxoviridae,11320,<zoonosisHelperFunctions.FASTASeq object at 0x...,Nucleoprotein
114041,A0A2P1DR03,Homo sapiens [TaxID: 9606]; Macaca nemestrina ...,human-true,Simian-Human immunodeficiency virus,Eukaryota,Metazoa,9544; 9539; 9545; 9606; 9541,Retroviridae,57667,<zoonosisHelperFunctions.FASTASeq object at 0x...,Envelope glycoprotein gp160
14428,A0A0A1CMQ4,Homo sapiens [TaxID: 9606],human-true,Human immunodeficiency virus 1,Eukaryota,Metazoa,9606,Retroviridae,11676,<zoonosisHelperFunctions.FASTASeq object at 0x...,Envelope glycoprotein gp160


In [142]:
df[df['Infects human'] == 'human-true'].shape

(278789, 11)

In [143]:
df[df['Infects human'] == 'human-false'].shape

(38527, 11)

In [144]:
# Sequences loaded earlier from NCBI Virus ###Add Molecule type
dfff.rename({'Species ID': 'Species taxonomic ID', 'Molecule_type': 'Molecule type'}, axis=1, inplace=True)
dfff.head()

Unnamed: 0,Species,Molecule type,Species taxonomic ID,Host name
0,Epsilonarterivirus zamalb,ssRNA(+),2501966,Chlorocebus [TaxID: 392815]
1,Rodent arterivirus,ssRNA(+),1806636,Eothenomys inez [TaxID: 870526]
2,Wencheng Sm shrew coronavirus,ssRNA(+),1508228,Suncus murinus [TaxID: 9378]
3,Bat coronavirus,ssRNA(+),1508220,Eidolon helvum [TaxID: 77214]
4,NL63-related bat coronavirus strain BtKYNL63-9b,ssRNA(+),2501929,Triaenops afer [TaxID: 549403]


In [145]:
df['Species taxonomic ID'] = df['Species taxonomic ID'].apply(int)

In [146]:
df = df.merge(dfff[['Species taxonomic ID', 'Molecule type']], how='left', on='Species taxonomic ID')

In [147]:
df.shape

(8551317, 12)

In [148]:
df.drop_duplicates(inplace=True)

In [149]:
df.shape

(317316, 12)

In [150]:
del dfff

## Reorganise dataframe

In [151]:
df = df[['Entry', 'Protein', 'Species name', 
         'Species taxonomic ID', 'Species family', 'Virus hosts',
         'Virus hosts ID', 'Host kingdom', 
         'Host superkingdom', 'Molecule type', 'Infects human', 'Sequence']]

In [152]:
df.sample(3)

Unnamed: 0,Entry,Protein,Species name,Species taxonomic ID,Species family,Virus hosts,Virus hosts ID,Host kingdom,Host superkingdom,Molecule type,Infects human,Sequence
6173253,F4MDT2,Nucleoprotein,Influenza A virus,11320,Orthomyxoviridae,Panthera tigris [TaxID: 9694]; Sturnira lilium...,9666; 27660; 9685; 9721; 9709; 9823; 8782; 969...,Metazoa,Eukaryota,ssRNA(-),human-true,<zoonosisHelperFunctions.FASTASeq object at 0x...
7852599,Q9ZXF8,ORF25,Bacillus phage phi105,10717,Siphoviridae,Bacillus subtilis [TaxID: 1423],1423,Bacillus subtilis,Bacteria,,human-false,<zoonosisHelperFunctions.FASTASeq object at 0x...
5438880,C8XMX1,Nucleoprotein,Influenza A virus,11320,Orthomyxoviridae,Panthera tigris [TaxID: 9694]; Sturnira lilium...,9666; 27660; 9685; 9721; 9709; 9823; 8782; 969...,Metazoa,Eukaryota,ssRNA(-),human-true,<zoonosisHelperFunctions.FASTASeq object at 0x...


## Split Dataframe to multiple datasets for testing

In [153]:
df['Host superkingdom'].unique()

array(['Eukaryota', 'Bacteria', 'root', 'Archaea', 'Viruses',
       'root; Eukaryota'], dtype=object)

In [154]:
df['Host kingdom'].unique()

array(['Metazoa', 'Viridiplantae', 'Lactococcus lactis',
       'Escherichia coli', 'Serratia marcescens',
       'Mycolicibacterium smegmatis', 'Bacillus thuringiensis',
       'Trichormus variabilis', 'Listeria monocytogenes',
       'Pseudomonas syringae', 'Metazoa; Viridiplantae',
       'Cronobacter sakazakii', 'Staphylococcus epidermidis',
       'Enterococcus faecium', 'root', 'Caulobacter vibrioides',
       'Vibrio alginolyticus', 'Staphylococcus aureus', 'Bacillus cereus',
       'Ralstonia solanacearum', 'Klebsiella pneumoniae',
       'Staphylococcus xylosus; Staphylococcus aureus',
       'Acinetobacter baumannii', 'Dickeya sp.',
       'Lactobacillus delbrueckii', 'Salmonella', 'Bacillus pumilus',
       'Citrobacter; Citrobacter freundii', 'Mycobacterium',
       'Rhizobium leguminosarum', 'Mesorhizobium loti',
       'Shigella flexneri', 'Yersinia enterocolitica',
       'Idiomarinaceae bacterium N2-2', 'Sulfitobacter sp. CB2047',
       'Lelliottia sp. GL2', 'Clostridi

In [155]:
df[(df['Host kingdom'].str.contains('Viridiplantae')) | df['Virus hosts'].str.contains('[Hh]omo [Ss]apiens')].shape

(284537, 12)

In [156]:
df['Molecule type'] = np.where(df['Molecule type'].isna(), '', df['Molecule type'])

In [157]:
df[df['Molecule type'].isna()]

Unnamed: 0,Entry,Protein,Species name,Species taxonomic ID,Species family,Virus hosts,Virus hosts ID,Host kingdom,Host superkingdom,Molecule type,Infects human,Sequence


In [158]:
df[df['Host kingdom'].str.contains('Metazoa')][df[df['Host kingdom'].str.contains('Metazoa')]['Molecule type'].str.contains('DNA')].shape

(31528, 12)

In [159]:
df[df['Host kingdom'].str.contains('Metazoa')][df[df['Host kingdom'].str.contains('Metazoa')]['Molecule type'].str.contains('RNA')].shape

(273101, 12)

In [160]:
df.shape

(317316, 12)

In [161]:
df[~df['Host kingdom'].str.contains('Metazoa')].shape

(9987, 12)

In [162]:
df[(df['Host superkingdom'].isin(['Bacteria', 'Viruses', 'Archaea'])) | (df['Virus hosts'].str.contains('[Hh]omo [Ss]apiens'))].shape

(283199, 12)

In [163]:
unfiltered = df
metazoa = df[df['Host kingdom'].str.contains('Metazoa')]
plant_human = df[(df['Host kingdom'].str.contains('Viridiplantae')) | df['Virus hosts'].str.contains('[Hh]omo [Ss]apiens')]
NonEukaryote_Human = df[(df['Host superkingdom'].isin(['Bacteria', 'Viruses', 'Archaea'])) | (df['Virus hosts'].str.contains('[Hh]omo [Ss]apiens'))]
DNA_MetazoaZoonosis = metazoa[metazoa['Molecule type'].str.contains('DNA')]
RNA_MetazoaZoonosis = metazoa[metazoa['Molecule type'].str.contains('RNA')]

In [178]:
def check_dist(df):
    true_count = df[df['Infects human'].str.contains('true')].shape[0]
    false_count = df[df['Infects human'].str.contains('false')].shape[0]
    imb = (false_count/true_count)
    print('The minoity class is %.2f of the majority\nhuman-true == %d and human false == %d\n' % (imb, true_count, false_count))

In [179]:
dataframes = [metazoa, unfiltered, plant_human, NonEukaryote_Human, DNA_MetazoaZoonosis, RNA_MetazoaZoonosis]
for dt in dataframes:
    check_dist(dt)

The minoity class is 0.10 of the majority
human-true == 278755 and human false == 28574

The minoity class is 0.14 of the majority
human-true == 278789 and human false == 38527

The minoity class is 0.02 of the majority
human-true == 278789 and human false == 5748

The minoity class is 0.02 of the majority
human-true == 278755 and human false == 4444

The minoity class is 0.29 of the majority
human-true == 24518 and human false == 7010

The minoity class is 0.07 of the majority
human-true == 254072 and human false == 19029



## Random Undersampling of datasets

In [None]:
seed = 960505

In [180]:
# Undersample majority class such that minority class (human-false) is 60% of the majority class (human-true317316)
rus = RandomUnderSampler(sampling_strategy=0.6, random_state=seed)
sampled_dataframes = []
for dt in dataframes:
    clas = dt['Infects human']
#     print('Dataframe before sampling: ', dt.shape[0])
    dt, _ = rus.fit_resample(dt, clas)
    sampled_dataframes.append(dt)
    check_dist(dt)
#     print('Dataframe after sampling: ', dt.shape[0])

The minoity class is 0.60 of the majority
human-true == 47623 and human false == 28574

The minoity class is 0.60 of the majority
human-true == 64211 and human false == 38527

The minoity class is 0.60 of the majority
human-true == 9580 and human false == 5748

The minoity class is 0.60 of the majority
human-true == 7406 and human false == 4444

The minoity class is 0.60 of the majority
human-true == 11683 and human false == 7010

The minoity class is 0.60 of the majority
human-true == 31715 and human false == 19029



## Write file sequences to fasta for feature extraction

In [181]:
metazoaFile = 'MetazoaZoonosis'
plant_humanFile = 'Plant-HumanZoonosis'
unfilteredFile = 'Zoonosis'
NonEukaryote_HumanFile = 'NonEukaryote-Human'
DNA_metazoaFile = 'DNA-MetazoaZoonosis'
RNA_metazoaFile = 'RNA-MetazoaZoonosis'

In [182]:
dirs = ['MetazoaZoonosisData', 'ZoonosisData',
        'Plant-HumanZoonosisData', 'NonEukaryote-HumanData',
        'DNA-MetazoaZoonosisData', 'RNA-MetazoaZoonosisData']
dirs = [os.path.join('../data/', fol) for fol in dirs] # Do not include in script
files = [metazoaFile, unfilteredFile, plant_humanFile, NonEukaryote_HumanFile, DNA_metazoaFile, RNA_metazoaFile]
toSave = list(zip(sampled_dataframes, files, dirs))

In [185]:
for dff, file, folder in toSave:
#    save dataframes as csv
    dff.drop('Sequence', axis=1).to_csv(f'{folder}/{file}Data.csv.gz', index=False, compression='gzip')
    
#    Create subdirectories
    os.makedirs(os.path.join(folder, 'train/human-true'), exist_ok=True)
    os.makedirs(os.path.join(folder, 'test/human-true'), exist_ok=True)
    os.makedirs(os.path.join(folder, 'train/human-false'), exist_ok=True)
    os.makedirs(os.path.join(folder, 'test/human-false'), exist_ok=True)
#    Split data to train and test data
    train, test = train_test_split(dff, test_size=0.2, random_state=) # Will further split 15% of train as validation during training
#    Save test and train sequences
    save_sequences(train, f'{folder}/train/Sequences') # Will move to subdirectories after feature extraction
    save_sequences(test, f'{folder}/test/Sequences')
    
    print('Done with', folder)

Done with ../data/MetazoaZoonosisData
Done with ../data/ZoonosisData
Done with ../data/Plant-HumanZoonosisData
Done with ../data/NonEukaryote-HumanData
Done with ../data/DNA-MetazoaZoonosisData
Done with ../data/RNA-MetazoaZoonosisData
