In [1]:
# Import all necesssary modules
import pandas as pd
import swifter
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
import re
import os
from ete3 import NCBITaxa
from pprint import pprint
from tqdm.notebook import tqdm_notebook, tqdm
# import warnings
# warnings.filterwarnings("ignore", category=UserWarning)
from zoonosisHelperFunctions import *

Matplotlib created a temporary config/cache directory at /tmp/matplotlib-zq2d34ut because the default path (/home/jovyan/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [2]:
tqdm.pandas(desc='Processing')
os.environ["MODIN_ENGINE"] = "ray"

In [3]:
# Load dataset downloaded from Uniprot
df = pd.read_table('../data/uniprot-keyword Virus+entry+into+host+cell+[KW-1160] +fragment no.tab.gz')

In [4]:
df.shape

(358333, 9)

In [5]:
df.sample(3)

Unnamed: 0,Entry,Entry name,Status,Protein names,Organism,Length,Taxonomic lineage IDs,Taxonomic lineage (SPECIES),Virus hosts
6917,A0A5B7PKI1,A0A5B7PKI1_ECOLX,unreviewed,DUF4102 domain-containing protein,Escherichia coli O26:H11,421,244319,Escherichia coli,
140128,A0A1Z4EAP9,A0A1Z4EAP9_9INFA,unreviewed,Nucleoprotein (Nucleocapsid protein) (Protein N),Influenza A virus (A/snowy owl/Akita/0051D007/...,498,2008558,Influenza A virus,
270626,A0A2H4Z607,A0A2H4Z607_9ENTO,unreviewed,Genome polyprotein [Cleaved into: P3; Protein ...,Echovirus E25,2194,45101,Enterovirus B,


In [6]:
# Check for missing host names
print(df[df['Virus hosts'].isnull()].shape)
df[df['Virus hosts'].isnull()].sample(3)

(237573, 9)


Unnamed: 0,Entry,Entry name,Status,Protein names,Organism,Length,Taxonomic lineage IDs,Taxonomic lineage (SPECIES),Virus hosts
312955,T1R6K5,T1R6K5_9PLVG,unreviewed,Envelope glycoprotein gp160 (Env polyprotein) ...,Simian-Human immunodeficiency virus,847,57667,Simian-Human immunodeficiency virus,
173036,D5XH65,D5XH65_9INFA,unreviewed,Nucleoprotein (Nucleocapsid protein) (Protein N),Influenza A virus (A/New York/7020/2009(H1N1)),498,747793,Influenza A virus,
197640,A0A6G9KP06,A0A6G9KP06_9BETC,unreviewed,Spike glycoprotein (S glycoprotein) (E2) (Pepl...,Pangolin coronavirus,1269,2708335,Pangolin coronavirus,


In [7]:
df[df['Status'] == 'reviewed']['Organism'].nunique()

1518

In [8]:
df[df['Status'] == 'reviewed']['Virus hosts'].nunique()

321

In [9]:
df['Organism'].nunique()

100216

In [10]:
df['Virus hosts'].nunique()

373

In [11]:
df['Taxonomic lineage IDs'].nunique()

100216

In [12]:
df[df['Status'] == 'unreviewed']['Taxonomic lineage IDs'].nunique()

99095

In [13]:
df[df['Status'] == 'unreviewed']['Organism'].nunique()

99095

In [14]:
df[df['Status'] == 'unreviewed']['Virus hosts'].nunique()

200

In [15]:
df['Virus hosts'] = np.where(df['Virus hosts'].isnull(), '',df['Virus hosts'])

In [16]:
df.sample(5)

Unnamed: 0,Entry,Entry name,Status,Protein names,Organism,Length,Taxonomic lineage IDs,Taxonomic lineage (SPECIES),Virus hosts
334580,A0A411I7D7,A0A411I7D7_9HIV1,unreviewed,Protein Vpr (R ORF protein) (Viral protein R),Human immunodeficiency virus 1,96,11676,Human immunodeficiency virus 1,Homo sapiens (Human) [TaxID: 9606]
155719,A0A7D6Z8I6,A0A7D6Z8I6_CITFR,unreviewed,Tyrosine-type recombinase/integrase,Citrobacter freundii,397,546,Citrobacter freundii,
5453,A0A379CKX0,A0A379CKX0_PLESH,unreviewed,Prophage CP4-57 integrase,Plesiomonas shigelloides (Aeromonas shigelloides),447,703,Plesiomonas shigelloides (Aeromonas shigelloides),
111350,A0A5B8HLH9,A0A5B8HLH9_9INFA,unreviewed,Hemagglutinin [Cleaved into: Hemagglutinin HA1...,Influenza A virus,566,11320,Influenza A virus,
140544,B3FRK9,B3FRK9_COCAV,unreviewed,Glycoprotein,Cocal virus (COCV),512,50713,Cocal vesiculovirus,Bos taurus (Bovine) [TaxID: 9913]; Equus cabal...


In [17]:
def join_names(df, col_name: str):
    df[col_name] = df[col_name].str.split('; ').apply(set).apply('; '.join)
    return df

In [18]:
# df['Virus hosts'] = df['Virus hosts'].str.split('; ')
# df['Virus hosts'] = df['Virus hosts'].swifter.progress_bar(enable=True, desc='Removing duplicate host names').apply(set)
# df['Virus hosts'] = df['Virus hosts'].swifter.progress_bar(enable=True, desc='Joining host names list').apply('; '.join)

# df['Protein names'] = df['Protein names'].str.split('; ')
# df['Protein names'] = df['Protein names'].swifter.progress_bar(enable=True, desc='Removing duplicate protein names').apply(set)
# df['Protein names'] = df['Protein names'].swifter.progress_bar(enable=True, desc='Joining protein names list').apply('; '.join)

# df['Organism'] = df['Organism'].str.split('; ')
# df['Organism'] = df['Organism'].swifter.progress_bar(enable=True, desc='Removing duplicate organism names').apply(set)
# df['Organism'] = df['Organism'].swifter.progress_bar(enable=True, desc='Joining organism names list').apply('; '.join)

In [19]:
df = join_names(df, 'Virus hosts')
df = join_names(df, 'Protein names')
df = join_names(df, 'Organism')

In [20]:
df.sample(3)

Unnamed: 0,Entry,Entry name,Status,Protein names,Organism,Length,Taxonomic lineage IDs,Taxonomic lineage (SPECIES),Virus hosts
263851,A0A1P8A7G8,A0A1P8A7G8_9ALPC,unreviewed,Spike glycoprotein (S glycoprotein) (E2) (Pepl...,Porcine epidemic diarrhea virus,1386,28295,Porcine epidemic diarrhea virus,
28370,D3TG04,D3TG04_9INFA,unreviewed,Hemagglutinin HA2 chain]; Hemagglutinin [Cleav...,Influenza A virus (A/Australia/79/2009(H1N1)),566,708581,Influenza A virus,
246526,A0A2H4TC06,A0A2H4TC06_9INFA,unreviewed,Nucleoprotein (Nucleocapsid protein) (Protein N),Influenza A virus (A/mallard duck/Ohio/16OS090...,498,2050536,Influenza A virus,


In [21]:
df[~df['Virus hosts'].isnull()].sample(3)

Unnamed: 0,Entry,Entry name,Status,Protein names,Organism,Length,Taxonomic lineage IDs,Taxonomic lineage (SPECIES),Virus hosts
161477,A0A0C5ABA5,A0A0C5ABA5_9INFA,unreviewed,Nucleoprotein (Nucleocapsid protein) (Protein N),Influenza A virus (A/chicken/Jiangxi/29132/201...,498,1593973,Influenza A virus,
145102,A0A736I4P4,A0A736I4P4_SALHO,unreviewed,DUF4102 domain-containing protein,Salmonella enterica subsp. houtenae serovar 44...,420,1967609,Salmonella enterica (Salmonella choleraesuis),
133606,A0A2L1L644,A0A2L1L644_9INFA,unreviewed,Nucleoprotein (Nucleocapsid protein) (Protein N),Influenza A virus,498,11320,Influenza A virus,


In [22]:
# Apply function to get species ID from organism ID
df['Species taxonomic ID'] = (df['Taxonomic lineage IDs']
                              .swifter.progress_bar(enable=True, desc='Getting Viruses taxonomic IDs')
                              .apply(getRankID, rank='species'))

Getting Viruses taxonomic IDs:   0%|          | 0/16 [00:00<?, ?it/s]

In [23]:
dff = df[['Entry', 'Species taxonomic ID']].copy()

In [24]:
df.sample(3)

Unnamed: 0,Entry,Entry name,Status,Protein names,Organism,Length,Taxonomic lineage IDs,Taxonomic lineage (SPECIES),Virus hosts,Species taxonomic ID
169420,A0A060LAH9,A0A060LAH9_9INFB,unreviewed,Hemagglutinin HA2 chain]; Hemagglutinin [Cleav...,Influenza B virus (B/North Carolina/3881/2014),584,1506131,Influenza B virus,,11520.0
34650,Q3S9U0,Q3S9U0_9HIV1,unreviewed,Protein Vpr (R ORF protein) (Viral protein R),Human immunodeficiency virus 1,96,11676,Human immunodeficiency virus 1,Homo sapiens (Human) [TaxID: 9606],11676.0
269803,A0A1B0RX79,A0A1B0RX79_9INFA,unreviewed,Hemagglutinin HA2 chain]; Hemagglutinin [Cleav...,Influenza A virus (A/duck/Guangxi/217/2013(H6N6)),566,1692019,Influenza A virus,,11320.0


In [25]:
df[df['Species taxonomic ID'].isnull()]

Unnamed: 0,Entry,Entry name,Status,Protein names,Organism,Length,Taxonomic lineage IDs,Taxonomic lineage (SPECIES),Virus hosts,Species taxonomic ID
11119,O41892,O41892_PEGIA,unreviewed,Genome polyprotein (EC 3.6.4.13),Pegivirus A (isolate Saguinus labiatus/-/GBV-A...,2967,1554494,Pegivirus A,Leontocebus nigricollis (Black-mantled tamarin...,
18609,Q0NCQ7,Q0NCQ7_VAR65,unreviewed,Protein G3,Variola virus (isolate Human/South Africa/102/...,111,587201,Variola virus,Homo sapiens (Human) [TaxID: 9606],
30670,Q0NG66,Q0NG66_VAR46,unreviewed,Myristylprotein,Variola virus (isolate Human/Japan/Yamada MS-2...,340,587202,Variola virus,Homo sapiens (Human) [TaxID: 9606],
84779,Q0NCI5,Q0NCI5_VAR65,unreviewed,Fusion protein,Variola virus (isolate Human/South Africa/102/...,146,587201,Variola virus,Homo sapiens (Human) [TaxID: 9606],
84815,Q0NCJ6,Q0NCJ6_VAR65,unreviewed,Virion membrane protein A21,Variola virus (isolate Human/South Africa/102/...,117,587201,Variola virus,Homo sapiens (Human) [TaxID: 9606],
87500,B2BX77,B2BX77_ASPVR,unreviewed,Hemagglutinin-neuraminidase protein,Atlantic salmon paramyxovirus (isolate -/Norwa...,576,1283346,Salmon aquaparamyxovirus,Salmo salar (Atlantic salmon) [TaxID: 8030],
98638,Q0NCN6,Q0NCN6_VAR65,unreviewed,IMV membrane protein,Variola virus (isolate Human/South Africa/102/...,189,587201,Variola virus,Homo sapiens (Human) [TaxID: 9606],
99867,Q0NCP9,Q0NCP9_VAR65,unreviewed,Myristylprotein,Variola virus (isolate Human/South Africa/102/...,340,587201,Variola virus,Homo sapiens (Human) [TaxID: 9606],
115389,Q0NG61,Q0NG61_VAR46,unreviewed,Protein L5,Variola virus (isolate Human/Japan/Yamada MS-2...,128,587202,Variola virus,Homo sapiens (Human) [TaxID: 9606],
116649,Q0NG40,Q0NG40_VAR46,unreviewed,Carbonic anhydrase homolog (Cell surface-bindi...,Variola virus (isolate Human/Japan/Yamada MS-2...,304,587202,Variola virus,Homo sapiens (Human) [TaxID: 9606],


In [26]:
# Get the species name of the earlier unidentified taxonomic IDs
idx_species_name = df.columns.get_loc('Taxonomic lineage (SPECIES)')
idx_organism_id = df.columns.get_loc('Species taxonomic ID')

for row in tqdm_notebook(range(len(df)), desc='Getting species ID from organism name'):
    if np.isnan(df.iat[row, idx_organism_id]):
        df.iat[row, idx_organism_id] = getIDfromName(df.iat[row, idx_species_name])

Getting species ID from organism name:   0%|          | 0/358333 [00:00<?, ?it/s]

In [27]:
df[df['Species taxonomic ID'].isnull()]

Unnamed: 0,Entry,Entry name,Status,Protein names,Organism,Length,Taxonomic lineage IDs,Taxonomic lineage (SPECIES),Virus hosts,Species taxonomic ID


In [28]:
df['Species taxonomic ID'] = df['Species taxonomic ID'].apply(int)

In [29]:
df.shape

(358333, 10)

In [30]:
df = (df.drop(['Status','Taxonomic lineage IDs'], axis=1)
      .groupby('Species taxonomic ID', as_index=False)
      .agg({'Virus hosts':set, 'Organism':set,
            'Protein names':set, 'Taxonomic lineage (SPECIES)':'first'}))

In [31]:
df['Virus hosts'] = df['Virus hosts'].str.join('; ')
df['Organism'] = df['Organism'].str.join('; ')
df['Protein names'] = df['Protein names'].str.join('; ')

In [32]:
df.sample(5)

Unnamed: 0,Species taxonomic ID,Virus hosts,Organism,Protein names,Taxonomic lineage (SPECIES)
2780,392734,,Terriglobus roseus,Integrase,Terriglobus roseus
658,28365,Xanthomonas campestris pv. campestris [TaxID: ...,Xanthomonas phage phiLf (Bacteriophage phi-Lf),Attachment protein G3P (Gene 3 protein) (G3P) ...,Xanthomonas phage phiLf (Bacteriophage phi-Lf)
630,28109,,Pseudoalteromonas nigrifaciens,Prophage CP4-57 integrase,Pseudoalteromonas nigrifaciens
3973,988801,,Rosenbergiella nectarea,Tyr recombinase domain-containing protein; Int...,Rosenbergiella nectarea
1552,115986,,Burkholderia phage 42,Tail sheath protein gp18,Burkholderia phage 42


In [33]:
df.shape

(15104, 5)

In [34]:
df['Species name'] = (df.drop('Taxonomic lineage (SPECIES)', axis=1)
                      .swifter.progress_bar(enable=True, desc='Getting Species name')
                      .apply(lambda x: getRankName(x['Species taxonomic ID'], 
                                                   rank='species'), axis=1))

Getting Species name:   0%|          | 0/15104 [00:00<?, ?it/s]

In [35]:
df['Species superkingdom'] = df['Species taxonomic ID'].progress_apply(getRankName, rank='superkingdom')

Getting Species name:   0%|          | 0/15104 [00:00<?, ?it/s]

In [36]:
df['Species family'] = df['Species taxonomic ID'].progress_apply(getRankName, rank='family')

Getting Species name:   0%|          | 0/15104 [00:00<?, ?it/s]

In [37]:
df['Species superkingdom'].unique()

array(['Bacteria', 'Archaea', 'Eukaryota', 'Viruses', 'IncJ plasmid R391',
       'uncultured organism', 'metagenome', 'Plasmid pFKY1',
       'human gut metagenome', 'marine metagenome',
       'mine drainage metagenome', 'marine sediment metagenome',
       'freshwater metagenome',
       'uncultured marine microorganism HF4000_005I08',
       'wastewater metagenome', 'hydrothermal vent metagenome',
       'sediment metagenome', 'viral metagenome', 'biofilter metagenome',
       'bioreactor metagenome', 'anaerobic digester metagenome',
       'plant metagenome', 'invertebrate metagenome'], dtype=object)

In [38]:
df = df[df['Species superkingdom'] == 'Viruses']

In [39]:
df.sample(5)

Unnamed: 0,Species taxonomic ID,Virus hosts,Organism,Protein names,Taxonomic lineage (SPECIES),Species name,Species superkingdom,Species family
5898,1606498,,bat polyomavirus 3a-A1055,Minor capsid protein VP2 (Minor structural pro...,Artibeus planirostris polyomavirus 2,Artibeus planirostris polyomavirus 2,Viruses,Polyomaviridae
13242,2599833,,Arthrobacter phage Shepard,Portal protein,Arthrobacter phage Shepard,Arthrobacter phage Shepard,Viruses,Siphoviridae
10580,2170404,,Glis glis polyomavirus 1,Minor capsid protein VP2 (Minor structural pro...,Glis glis polyomavirus 1,Glis glis polyomavirus 1,Viruses,Polyomaviridae
8675,1980468,,El Moro Canyon orthohantavirus,Envelope polyprotein (M polyprotein),El Moro Canyon orthohantavirus,El Moro Canyon orthohantavirus,Viruses,Hantaviridae
5397,1476886,,Lactococcus phage P078,Tail length tape-measure protein,Lactococcus phage P078,Lactococcus phage P078,Viruses,Siphoviridae


In [40]:
df.drop(['Taxonomic lineage (SPECIES)'], axis=1, inplace=True)

In [44]:
df['Virus hosts'] = np.where(df['Virus hosts']=='', np.nan, df['Virus hosts'])

In [45]:
df[df['Virus hosts'].isnull()].sample(5)

Unnamed: 0,Species taxonomic ID,Virus hosts,Organism,Protein names,Species name,Species superkingdom,Species family
11842,2510151,,Staphylococcus phage Sa30,Uncharacterized protein,Staphylococcus phage Sa30,Viruses,Herelleviridae
8908,1985983,,Chicken picornavirus 3,Genome polyprotein (P1C) (P1D) (Protein 2C) (P...,Avisivirus C,Viruses,Picornaviridae
2452,320839,,Staphylococcus virus 47,Integrase; ORF009,Staphylococcus virus 47,Viruses,Siphoviridae
11339,2447815,,Rhinolophus ferrumequinum polyomavirus 2,Minor capsid protein VP2 (Minor structural pro...,Rhinolophus ferrumequinum polyomavirus 2,Viruses,Polyomaviridae
13125,2592199,,Salmonella phage SE20,Portal protein,Salmonella phage SE20,Viruses,Demerecviridae


In [46]:
df.drop('Organism', axis=1, inplace=True)

In [47]:
# List of viruses which do not have assigned hosts in the data
noHostViruses = df[df['Virus hosts'].isnull()]['Species name'].unique().tolist()

In [48]:
# Create independent dataframe of viruses with no assigned host and simltaneously identify the same viruses from the data 
# whcih already have assigned hosts and assign host names based on those. 
df_na_hosts = df[(~df['Virus hosts'].isnull()) & (df['Species name'].isin(noHostViruses))][['Species name', 'Virus hosts']]
df_na_hosts = df_na_hosts.groupby('Species name')['Virus hosts'].apply(list)
df_na_hosts = df_na_hosts.reset_index(name='Viral hosts nw')

In [49]:
# # Previous code reurns a list for multiple host so this code melts the lists into regular string entries
df_na_hosts['Viral hosts nw'] = (df_na_hosts['Viral hosts nw']
                                 .swifter.progress_bar(desc='Joining host names list', enable=True)
                                 .apply('; '.join))

In [50]:
# # updates the viruses hosts
df_naa = (df[df['Virus hosts'].isnull()]
          .merge(df_na_hosts, on='Species name', how='left')
          .drop('Virus hosts', axis=1)
          .rename({'Viral hosts nw':'Virus hosts'}, axis=1))

In [51]:
# Creates independant dataset with viruses which have hosts
df_notna = df[~df['Virus hosts'].isnull()]

In [52]:
# merges the updated virus hosts dataset with the dataset with viruses which have hosts
df = df_naa.append(df_notna)

In [53]:
df.shape

(8113, 6)

In [54]:
df.sample(5)

Unnamed: 0,Species taxonomic ID,Protein names,Species name,Species superkingdom,Species family,Virus hosts
5454,2560564,G protein,Lophuromys jeilongvirus 2,Viruses,Paramyxoviridae,
4404,2301528,Portal protein,Lentibacter virus vB_LenP_ICBM1,Viruses,Podoviridae,
1977,1636724,Portal protein,Lactococcus phage 936 group phage PhiA.16,Viruses,Siphoviridae,
5086,2548025,Integrase,Streptococcus phage Javan178,Viruses,Siphoviridae,
1138,1090133,Integrase,Enterobacteria phage IME10,Viruses,Podoviridae,


In [55]:
print(df[df['Virus hosts'].isnull()].shape)
df[df['Virus hosts'].isnull()].sample(3)

(7512, 6)


Unnamed: 0,Species taxonomic ID,Protein names,Species name,Species superkingdom,Species family,Virus hosts
7221,2734284,Portal protein,Gordonia virus Trine,Viruses,Siphoviridae,
693,500654,Major capsid protein L1; Minor capsid protein L2,Bandicoot papillomatosis carcinomatosis virus ...,Viruses,Papillomaviridae,
132,64300,Core protein (EC 3.4.21.91) (EC 3.6.1.15) (EC ...,Modoc virus,Viruses,Flaviviridae,


In [56]:
df['Virus hosts'] = np.where(df['Virus hosts'].isnull(), '',df['Virus hosts'])

In [57]:
df.sample(3)

Unnamed: 0,Species taxonomic ID,Protein names,Species name,Species superkingdom,Species family,Virus hosts
924,768739,Envelope glycoprotein L (gL); Envelope glycopr...,Elephant endotheliotropic herpesvirus 6,Viruses,Herpesviridae,
2330,1868220,Cap,Bat associated circovirus 3,Viruses,Circoviridae,
3805,2064743,Pre-histone-like nucleoprotein (Pre-core prote...,Rhesus adenovirus 67,Viruses,Adenoviridae,


In [58]:
df = mergeRows(df, 'Species taxonomic ID','Virus hosts')

In [59]:
df[(df['Species name'].str.contains('Influenza A virus')) & (df['Virus hosts'] != '')]

Unnamed: 0,Species taxonomic ID,Virus hosts,Protein names,Species name,Species superkingdom,Species family
161,11320,; Aves [TaxID: 8782]; Phocidae (true seals) [T...,Hemagglutinin HA2 chain]; Hemagglutinin [Cleav...,Influenza A virus,Viruses,Orthomyxoviridae


In [60]:
df.sample(3)

Unnamed: 0,Species taxonomic ID,Virus hosts,Protein names,Species name,Species superkingdom,Species family
5661,2548013,,Portal protein,Streptococcus phage Javan159,Viruses,Siphoviridae
1367,759390,,Capsid protein (Coat protein),Sida golden mottle virus,Viruses,Geminiviridae
18,10320,; Bos taurus (Bovine) [TaxID: 9913],Envelope glycoprotein L; Envelope glycoprotein...,Bovine alphaherpesvirus 1,Viruses,Herpesviridae


In [61]:
dfna = df[df['Virus hosts'] == '']
df = df[~(df['Virus hosts'] == '')]

In [62]:
dfna.shape

(7512, 6)

In [63]:
df.shape

(601, 6)

# Updating host names from external sources

In [64]:
df2 = pd.read_csv('../data/sequences.csv')
df2.shape

(2599675, 3)

In [65]:
df2.sample(2)

Unnamed: 0,Species,Molecule_type,Host
2425601,Human immunodeficiency virus 1,ssRNA-RT,Homo sapiens
2484045,Influenza A virus,ssRNA(-),Homo sapiens


In [66]:
df2.drop_duplicates(inplace=True)
df2.shape

(10956, 3)

In [67]:
df2['Species ID'], df2['Host ID'] = df2['Species'].progress_apply(getIDfromName), df2['Host'].progress_apply(getIDfromName)

Getting Species name:   0%|          | 0/10956 [00:00<?, ?it/s]

'nan'
'nan'
'nan'
'nan'


Getting Species name:   0%|          | 0/10956 [00:00<?, ?it/s]

'Bolomys lasiurus'
'Bolomys lasiurus'
'Pipistrellus sp. pipistrellus/pygmaeus AO-2021'
'Pipistrellus musciculus'
'Funisciurus bayonii'
'Rattus sp. r3 YH-2020'
'Rattus sp. r3 YH-2020'
'Soricidae sp. YH-2020'
'Rattus sp. r3 YH-2020'
'Rattus sp. r3 YH-2020'
'Acomys selousi'
'Rhinolophus smithersi'
'Alouatta sp.'
'Pipistrellys abramus'
'Sturnira angeli'
'Sturnira angeli'
'Hipposideros curtus'
'Pipistrellus inexspectatus'
'Dobsonia exoleta'
'Mops demonstrator'
'Pipistrellus musciculus'
'Mus sp. TG-2020'
'Murinae gen. sp. TG-2020'
'Vespadelus baverstocki'
'Ozimops sp. DP-2019'
'Scoterepens balstoni'
'Neoromicia capensis'
'Neoromicia capensis'
'Mus sp. CL-2019'
'Mus sp. CL-2019'
'Neoromicia capensis'
'Bolomys lasiurus'
'Bolomys lasiurus'
'Bolomys lasiurus'
'Neoromicia capensis'
'Neoromicia capensis'
'Neoromicia capensis'
'Neoromicia capensis'
'Bolomys lasiurus'
'Pipistrellus inexspectatus'
'Chiroptera sp.'
'Chaerephon aloysiisabaudiae'
'Chiroptera sp.'
'Paradoxurus musangus'
'Neoromicia capen

In [68]:
df2.dropna(inplace=True)
df2['Species ID'], df2['Host ID'] = df2['Species ID'].astype(int), df2['Host ID'].astype(int)
df2.shape

(10886, 5)

In [69]:
df2['Host name'] = df2.progress_apply(lambda x: nameMerger(x['Host'], x['Host ID']), axis=1)
# Remove Host and Host ID columns as they have been merged and are no longer needed
df2.drop(['Host', 'Host ID'], axis=1, inplace=True)

Getting Species name:   0%|          | 0/10886 [00:00<?, ?it/s]

In [70]:
df2['Species ID'] = df2['Species ID'].progress_apply(getRankID, rank='species')

Getting Species name:   0%|          | 0/10886 [00:00<?, ?it/s]

In [71]:
## Create a copy for later use
dfff = df2.copy()

In [72]:
# Add host names
df_na_hosts = AggregateHosts(df2,'Species ID', 'Host name')
dfna = dfna.merge(df_na_hosts, left_on='Species taxonomic ID', right_on='Species ID', how='left')
dfna = dfna.drop(['Virus hosts', 'Species ID'], axis=1).rename({'Host name':'Virus hosts'}, axis=1)
dfna = UpdateHosts(dfna, df_na_hosts, 'Species taxonomic ID', 'Species ID')
df, dfna = UpdateMain(df, dfna)
df = mergeRows(df, 'Species taxonomic ID', 'Virus hosts')

In [73]:
dfna.shape

(6476, 6)

In [74]:
df.shape

(1637, 6)

In [75]:
df2 = pd.read_table('../data/virushostdb.tsv')
df2.head(3)

Unnamed: 0,virus tax id,virus name,virus lineage,refseq id,KEGG GENOME,KEGG DISEASE,DISEASE,host tax id,host name,host lineage,pmid,evidence,sample type,source organism
0,438782,Abaca bunchy top virus,Viruses; Monodnaviria; Shotokuvirae; Cressdnav...,"NC_010314, NC_010315, NC_010316, NC_010317, NC...",,,,46838.0,Musa sp.,Eukaryota; Viridiplantae; Streptophyta; Strept...,17978886.0,"Literature, NCBI Virus, RefSeq",,
1,438782,Abaca bunchy top virus,Viruses; Monodnaviria; Shotokuvirae; Cressdnav...,"NC_010314, NC_010315, NC_010316, NC_010317, NC...",,,,214697.0,Musa acuminata AAA Group,Eukaryota; Viridiplantae; Streptophyta; Strept...,17978886.0,Literature,,
2,1241371,Abalone herpesvirus Victoria/AUS/2009,Viruses; Duplodnaviria; Heunggongvirae; Peplov...,NC_018874,,,,6451.0,Haliotidae,Eukaryota; Opisthokonta; Metazoa; Eumetazoa; B...,,UniProt,,


In [76]:
df2 = df2[['virus tax id', 'virus name', 'host tax id', 'host name']].copy()
df2.drop_duplicates(inplace=True)
print(df2.shape)
df2.head()

(16612, 4)


Unnamed: 0,virus tax id,virus name,host tax id,host name
0,438782,Abaca bunchy top virus,46838.0,Musa sp.
1,438782,Abaca bunchy top virus,214697.0,Musa acuminata AAA Group
2,1241371,Abalone herpesvirus Victoria/AUS/2009,6451.0,Haliotidae
3,1241371,Abalone herpesvirus Victoria/AUS/2009,36100.0,Haliotis rubra
4,491893,Abalone shriveling syndrome-associated virus,37770.0,Haliotis diversicolor aquatilis


In [77]:
df2[df2['host tax id'].isnull()]

Unnamed: 0,virus tax id,virus name,host tax id,host name
1236,2662138,Bacteriophage Phobos,,
3750,1131416,Cucurbit mild mosaic virus,,
15925,1888308,Wabat virus,,


In [78]:
df2.dropna(inplace=True)

In [79]:
df2['host tax id'] = df2['host tax id'].astype(int)
df2.head()

Unnamed: 0,virus tax id,virus name,host tax id,host name
0,438782,Abaca bunchy top virus,46838,Musa sp.
1,438782,Abaca bunchy top virus,214697,Musa acuminata AAA Group
2,1241371,Abalone herpesvirus Victoria/AUS/2009,6451,Haliotidae
3,1241371,Abalone herpesvirus Victoria/AUS/2009,36100,Haliotis rubra
4,491893,Abalone shriveling syndrome-associated virus,37770,Haliotis diversicolor aquatilis


In [80]:
df2['Species ID'] = df2['virus tax id'].progress_apply(getRankID, rank='species')

Getting Species name:   0%|          | 0/16609 [00:00<?, ?it/s]

In [81]:
df2['Host name'] = df2.progress_apply(lambda x: nameMerger(x['host name'], x['host tax id']), axis=1)
# Remove Host and Host ID columns as they have been merged and are no longer needed
df2.drop(['host name', 'host tax id'], axis=1, inplace=True)
df2.head()

Getting Species name:   0%|          | 0/16609 [00:00<?, ?it/s]

Unnamed: 0,virus tax id,virus name,Species ID,Host name
0,438782,Abaca bunchy top virus,438782,Musa sp. [TaxID: 46838]
1,438782,Abaca bunchy top virus,438782,Musa acuminata AAA Group [TaxID: 214697]
2,1241371,Abalone herpesvirus Victoria/AUS/2009,1513231,Haliotidae [TaxID: 6451]
3,1241371,Abalone herpesvirus Victoria/AUS/2009,1513231,Haliotis rubra [TaxID: 36100]
4,491893,Abalone shriveling syndrome-associated virus,491893,Haliotis diversicolor aquatilis [TaxID: 37770]


In [82]:
df_na_hosts = AggregateHosts(df2,'Species ID', 'Host name')
dfna = dfna.merge(df_na_hosts, left_on='Species taxonomic ID', right_on='Species ID', how='left')
dfna = dfna.drop(['Virus hosts', 'Species ID'], axis=1).rename({'Host name':'Virus hosts'}, axis=1)
dfna = UpdateHosts(dfna, df_na_hosts, 'Species taxonomic ID', 'Species ID')
df, dfna = UpdateMain(df, dfna)
df = mergeRows(df, 'Species taxonomic ID', 'Virus hosts')

In [83]:
df.shape

(4760, 6)

In [84]:
dfna.shape

(3353, 6)

In [85]:
df2 = pd.read_csv('../data/virus_host_4rm_untitled.csv')
df2.sample(2)

Unnamed: 0,Host_name,Host_TaxId,Host Group,Virus_name,Virus_TaxId,Micobe_group,Host_common_name,Host_common_name_rev
49661,anas platyrhynchos,8839,vertebrates,influenza a virus (a/duck/chiba/4/2006(h3n8)),698850,viruses,Mallard,Mallard
35433,homo sapiens,9606,primates,influenza a virus (a/incheon/243/2005(h3n2)),514413,viruses,Human,Human


In [86]:
df2 = df2[['Host_name', 'Host_TaxId', 'Virus_name', 'Virus_TaxId']].copy()
df2['Species ID'] = df2['Virus_TaxId'].progress_apply(getRankID, rank='species')
df2['Host name'] = df2.progress_apply(lambda x: nameMerger(x['Host_name'], x['Host_TaxId']), axis=1)
df2.drop(['Host_name', 'Host_TaxId'], axis=1, inplace=True)
df2.dropna(inplace=True)
df2.sample(2)

Getting Species name:   0%|          | 0/59859 [00:00<?, ?it/s]

878474 taxid not found
555869 taxid not found
555869 taxid not found
555869 taxid not found
555869 taxid not found
555869 taxid not found
555869 taxid not found
555869 taxid not found
555869 taxid not found
555869 taxid not found
555869 taxid not found
555869 taxid not found
555869 taxid not found
555869 taxid not found
555869 taxid not found


Getting Species name:   0%|          | 0/59859 [00:00<?, ?it/s]

Unnamed: 0,Virus_name,Virus_TaxId,Species ID,Host name
37908,influenza a virus (a/austria/140481/2004(h3n2)),468434,11320.0,homo sapiens [TaxID: 9606]
58530,gallid herpesvirus 2,10390,10390.0,gallus gallus [TaxID: 9031]


In [87]:
df_na_hosts = AggregateHosts(df2,'Species ID', 'Host name')
dfna = dfna.merge(df_na_hosts, left_on='Species taxonomic ID', right_on='Species ID', how='left')
dfna = dfna.drop(['Virus hosts', 'Species ID'], axis=1).rename({'Host name':'Virus hosts'}, axis=1)
dfna = UpdateHosts(dfna, df_na_hosts, 'Species taxonomic ID', 'Species ID')
df, dfna = UpdateMain(df, dfna)
df = mergeRows(df, 'Species taxonomic ID', 'Virus hosts')

In [88]:
df.shape

(4766, 6)

In [89]:
dfna.shape

(3347, 6)

In [90]:
dfna.sample(2)

Unnamed: 0,Species taxonomic ID,Protein names,Species name,Species superkingdom,Species family,Virus hosts
1039,2041543,Portal protein; Integrase,Mycobacterium phage Murica,Viruses,Siphoviridae,
1180,2094138,Integrase,Mycobacterium phage Morrow,Viruses,Siphoviridae,


## Further Processing

In [91]:
# Add column to discriminate viruses which contain human hosts from those which do not
df['Infects human'] = np.where(df['Virus hosts'].str.contains(r'960[56]'), 'human-true','human-false')

In [92]:
df.sample(2)

Unnamed: 0,Species taxonomic ID,Virus hosts,Protein names,Species name,Species superkingdom,Species family,Infects human
2855,1981160,Escherichia coli O157:H7 [TaxID: 83334],Integrase; Tail spike protein,Escherichia virus 24B,Viruses,Podoviridae,human-false
630,204086,Staphylococcus aureus [TaxID: 1280],Portal protein (Connector protein) (Gene produ...,Staphylococcus virus 44AHJD,Viruses,Rountreeviridae,human-false


In [93]:
df['Virus hosts'] = df['Virus hosts'].str.split('; ')
df['Virus hosts'] = df.progress_apply(lambda x: list(filter(None, x['Virus hosts'])), axis=1)
df['Virus hosts'] = df['Virus hosts'].progress_apply('; '.join)

Getting Species name:   0%|          | 0/4766 [00:00<?, ?it/s]

Getting Species name:   0%|          | 0/4766 [00:00<?, ?it/s]

In [94]:
df.sample(4)

Unnamed: 0,Species taxonomic ID,Virus hosts,Protein names,Species name,Species superkingdom,Species family,Infects human
341,49267,Solanum lycopersicum (Tomato) (Lycopersicon es...,Capsid protein (Coat protein) (CP),Tomato pseudo-curly top virus,Viruses,Geminiviridae,human-false
3514,2249765,Paenibacillus larvae [TaxID: 1464],Portal protein; Integrase,Paenibacillus phage Eltigre,Viruses,Siphoviridae,human-false
3582,2315695,Escherichia coli [TaxID: 562]; Salmonella sp. ...,Portal (Connector) protein,Escherichia phage fp01,Viruses,Demerecviridae,human-false
3840,2560492,Gordonia terrae [TaxID: 2055],Portal protein; Integrase,Gordonia virus Eyre,Viruses,Siphoviridae,human-false


In [95]:
df[df['Infects human'] == 'human-true'].sample(4)

Unnamed: 0,Species taxonomic ID,Virus hosts,Protein names,Species name,Species superkingdom,Species family,Infects human
16,10310,Homo sapiens (Human) [TaxID: 9606],Envelope glycoprotein C (Glycoprotein F); Urac...,Human alphaherpesvirus 2,Viruses,Herpesviridae,human-true
2786,1972577,Bos taurus (Bovine) [TaxID: 9913]; Equus cabal...,Uncharacterized protein; Glycoprotein; G protein,Indiana vesiculovirus,Viruses,Rhabdoviridae,human-true
3046,2003649,Homo sapiens [TaxID: 9606],Genome polyprotein (P1C) (P1D) (Protein 2C) (P...,Cosavirus B,Viruses,Picornaviridae,human-true
841,337039,Homo sapiens (Human) [TaxID: 9606],Major capsid protein L1; Minor capsid protein L2,Alphapapillomavirus 2,Viruses,Papillomaviridae,human-true


In [96]:
df = (df.set_index(df.columns.drop('Virus hosts',1).tolist())['Virus hosts'].str.split(';', expand=True)
          .stack()
          .reset_index()
          .rename(columns={0:'Virus hosts'})
          .loc[:, df.columns]
         ).copy()

In [97]:
df.shape

(7270, 7)

In [98]:
df.sample(4)

Unnamed: 0,Species taxonomic ID,Virus hosts,Protein names,Species name,Species superkingdom,Species family,Infects human
2368,947980,Lactobacillus sp. [TaxID: 1591],Bacteriophage portal protein; Integrase,Lactobacillus phage LF1,Viruses,Siphoviridae,human-false
1771,360579,Malvastrum coromandelianum [TaxID: 108453],Capsid protein (Coat protein),Mesta yellow vein mosaic virus,Viruses,Geminiviridae,human-false
57,10334,Felidae (cat family) [TaxID: 9681],Envelope glycoprotein L; Glycoprotein C; Envel...,Felid alphaherpesvirus 1,Viruses,Herpesviridae,human-false
5535,2169701,Homo sapiens (Human) [TaxID: 9606],Assembly protein E3; Structural polyprotein (p...,Onyong-nyong virus,Viruses,Togaviridae,human-true


In [99]:
df['Virus hosts ID'] = None
idx_organism = df.columns.get_loc('Virus hosts')
idx_host_id = df.columns.get_loc('Virus hosts ID')

pattern = r'(\d+)\]'
for row in range(len(df)):
    host_id = re.search(pattern, df.iat[row, idx_organism]).group()
    df.iat[row, idx_host_id] = host_id
df.head()

Unnamed: 0,Species taxonomic ID,Virus hosts,Protein names,Species name,Species superkingdom,Species family,Infects human,Virus hosts ID
0,10243,Myodes glareolus (Bank vole) (Clethrionomys gl...,CPXV163 protein; CPXV108 protein (Late 16 kDa ...,Cowpox virus,Viruses,Poxviridae,human-true,447135]
1,10243,Bos taurus (Bovine) [TaxID: 9913],CPXV163 protein; CPXV108 protein (Late 16 kDa ...,Cowpox virus,Viruses,Poxviridae,human-true,9913]
2,10243,Microtus agrestis (Short-tailed field vole) [...,CPXV163 protein; CPXV108 protein (Late 16 kDa ...,Cowpox virus,Viruses,Poxviridae,human-true,29092]
3,10243,Loxodonta africana (African elephant) [TaxID:...,CPXV163 protein; CPXV108 protein (Late 16 kDa ...,Cowpox virus,Viruses,Poxviridae,human-true,9785]
4,10243,Homo sapiens (Human) [TaxID: 9606],CPXV163 protein; CPXV108 protein (Late 16 kDa ...,Cowpox virus,Viruses,Poxviridae,human-true,9606]


In [100]:
df['Virus hosts ID'] = df['Virus hosts ID'].str.strip('\]')

In [101]:
df['Virus hosts ID'] = df['Virus hosts ID'].progress_apply(int)

df['Virus hosts ID'] = df['Virus hosts ID'].progress_apply(getRankID, rank='species')
df['Virus host name'] = df['Virus hosts ID'].progress_apply(getRankName, rank='species')
df['Host superkingdom'] = df['Virus hosts ID'].progress_apply(getRankName, rank='superkingdom')
df['Host kingdom'] = df['Virus hosts ID'].progress_apply(getRankName, rank='kingdom')

Getting Species name:   0%|          | 0/7270 [00:00<?, ?it/s]

Getting Species name:   0%|          | 0/7270 [00:00<?, ?it/s]

Getting Species name:   0%|          | 0/7270 [00:00<?, ?it/s]

Getting Species name:   0%|          | 0/7270 [00:00<?, ?it/s]

Getting Species name:   0%|          | 0/7270 [00:00<?, ?it/s]

In [102]:
df[df['Virus hosts ID'].isna()]

Unnamed: 0,Species taxonomic ID,Virus hosts,Protein names,Species name,Species superkingdom,Species family,Infects human,Virus hosts ID,Virus host name,Host superkingdom,Host kingdom


In [103]:
df['Virus hosts ID'][1866]

274

In [104]:
df['Virus hosts ID'] = df['Virus hosts ID'].progress_apply(int)

Getting Species name:   0%|          | 0/7270 [00:00<?, ?it/s]

In [105]:
df['Virus hosts'] = (df.drop('Virus hosts', axis=1)
                     .apply(lambda x: nameMerger(x['Virus host name'], x['Virus hosts ID']), axis=1))

In [106]:
df.sample(4)

Unnamed: 0,Species taxonomic ID,Virus hosts,Protein names,Species name,Species superkingdom,Species family,Infects human,Virus hosts ID,Virus host name,Host superkingdom,Host kingdom
6819,2733575,Proteus mirabilis [TaxID: 584],Head-tail connector,Proteus virus PM93,Viruses,Autographiviridae,human-false,584,Proteus mirabilis,Bacteria,Proteus mirabilis
5858,2304449,Okapia johnstoni [TaxID: 86973],Major capsid protein L1; Minor capsid protein L2,Okapia johnstoni papillomavirus 1,Viruses,Papillomaviridae,human-false,86973,Okapia johnstoni,Eukaryota,Metazoa
120,10497,Ornithodoros [TaxID: 6937],Envelope protein p54,African swine fever virus,Viruses,Asfarviridae,human-false,6937,Ornithodoros,Eukaryota,Metazoa
653,28875,Chlorocebus pygerythrus [TaxID: 60710],Truncated outer capsid protein VP8; Outer caps...,Rotavirus A,Viruses,Reoviridae,human-true,60710,Chlorocebus pygerythrus,Eukaryota,Metazoa


In [107]:
df.shape

(7270, 11)

In [108]:
df = (df.set_index(df.columns.drop('Protein names',1).tolist())['Protein names'].str.split(';', expand=True)
          .stack()
          .reset_index()
          .rename(columns={0:'Protein names'})
          .loc[:, df.columns]
         ).copy()

In [109]:
df[df['Host superkingdom'].isnull()].shape

(0, 11)

In [110]:
df['Host superkingdom'].unique()

array(['Eukaryota', 'Bacteria', 'Viruses', 'root', 'Archaea'],
      dtype=object)

In [111]:
df[df['Host superkingdom'] == 'Eukaryota'].shape

(18376, 11)

In [112]:
df[df['Host superkingdom'] == 'Viruses'].shape

(4, 11)

In [113]:
df[df['Host superkingdom'] == 'Bacteria'].shape

(4099, 11)

In [114]:
df[df['Host superkingdom'] == 'root'].shape

(38, 11)

In [115]:
df[df['Host superkingdom'] == 'Archaea'].shape

(14, 11)

In [116]:
print(df[df['Host kingdom'] == 'Metazoa'].shape)
df[df['Host kingdom'] == 'Metazoa'].sample(3)

(17312, 11)


Unnamed: 0,Species taxonomic ID,Virus hosts,Protein names,Species name,Species superkingdom,Species family,Infects human,Virus hosts ID,Virus host name,Host superkingdom,Host kingdom
7129,41118,Rodentia [TaxID: 9989],Pre-glycoprotein polyprotein GP complex (Pre-...,Arenavirus sp.,Viruses,Arenaviridae,human-false,9989,Rodentia,Eukaryota,Metazoa
6368,28875,Mus musculus [TaxID: 10090],Outer capsid protein VP4,Rotavirus A,Viruses,Reoviridae,human-true,10090,Mus musculus,Eukaryota,Metazoa
19666,2169902,Homo sapiens [TaxID: 9606],Major capsid protein L1,Gammapapillomavirus 27,Viruses,Papillomaviridae,human-true,9606,Homo sapiens,Eukaryota,Metazoa


In [117]:
df[df['Infects human'] == 'human-true'].shape

(8457, 11)

In [118]:
df[df['Infects human'] == 'human-false'].shape

(14074, 11)

In [119]:
df.sample(2)

Unnamed: 0,Species taxonomic ID,Virus hosts,Protein names,Species name,Species superkingdom,Species family,Infects human,Virus hosts ID,Virus host name,Host superkingdom,Host kingdom
15061,1511906,Canis lupus [TaxID: 9612],Capsid protein (Virus protein 2),Carnivore protoparvovirus 1,Viruses,Parvoviridae,human-false,9612,Canis lupus,Eukaryota,Metazoa
1662,10561,Bos taurus [TaxID: 9913],Minor capsid protein L2,Xipapillomavirus 1,Viruses,Papillomaviridae,human-false,9913,Bos taurus,Eukaryota,Metazoa


In [120]:
df.sample(2)

Unnamed: 0,Species taxonomic ID,Virus hosts,Protein names,Species name,Species superkingdom,Species family,Infects human,Virus hosts ID,Virus host name,Host superkingdom,Host kingdom
1807,10713,Escherichia coli [TaxID: 562],Integrase (EC 2.7.7.-) (EC 3.1.-.-),Enterobacteria phage phi80,Viruses,Siphoviridae,human-false,562,Escherichia coli,Bacteria,Escherichia coli
12369,1147143,Escherichia coli [TaxID: 562],Integrase,Enterobacteria phage HK140,Viruses,Siphoviridae,human-false,562,Escherichia coli,Bacteria,Escherichia coli


In [121]:
for column in df.columns:
    print(column, df[column].nunique())
print('Dataframe total',len(df))

Species taxonomic ID 4766
Virus hosts 1765
Protein names 2038
Species name 4766
Species superkingdom 1
Species family 80
Infects human 2
Virus hosts ID 1765
Virus host name 1756
Host superkingdom 5
Host kingdom 344
Dataframe total 22531


In [122]:
df.sample(2)

Unnamed: 0,Species taxonomic ID,Virus hosts,Protein names,Species name,Species superkingdom,Species family,Infects human,Virus hosts ID,Virus host name,Host superkingdom,Host kingdom
13152,1283076,Candidatus Pelagibacter ubique [TaxID: 198252],Tail sheath protein,Pelagibacter phage HTVC008M,Viruses,Myoviridae,human-false,198252,Candidatus Pelagibacter ubique,Bacteria,Candidatus Pelagibacter ubique
12716,1193974,Primates [TaxID: 9443],Capsid protein VP3 (P1C) (Virion protein 3),Human enterovirus,Viruses,Picornaviridae,human-true,9443,Primates,Eukaryota,Metazoa


# Restructuring the data

In [123]:
# Earlier saved data
dff.sample(2)

Unnamed: 0,Entry,Species taxonomic ID
240053,H8YFX1,526476.0
285606,B6EFF8,11676.0


In [124]:
dff.shape

(358333, 2)

In [126]:
fastaFileName = '../data/uniprot-keyword Virus+entry+into+host+cell+[KW-1160] +fragment no.fasta'

entry_seq = read_fasta(fastaFileName)

In [127]:
dff.sort_values(by='Entry', inplace=True)

objList = []
for entry, obj in entry_seq:
    objList.append(obj)

dff['Sequence'] = objList

In [128]:
dff.head()

Unnamed: 0,Entry,Species taxonomic ID,Sequence
50368,A0A009FEK4,470.0,<zoonosisHelperFunctions.FASTASeq object at 0x...
156673,A0A009G3H3,1310609.0,<zoonosisHelperFunctions.FASTASeq object at 0x...
146717,A0A009GC36,470.0,<zoonosisHelperFunctions.FASTASeq object at 0x...
146730,A0A009GCG0,470.0,<zoonosisHelperFunctions.FASTASeq object at 0x...
144753,A0A009GXT7,1310609.0,<zoonosisHelperFunctions.FASTASeq object at 0x...


In [129]:
df.drop(['Virus host name', 'Protein names', 'Species superkingdom'], axis=1, inplace=True)

In [130]:
df = df.merge(dff, on='Species taxonomic ID', how='left')

In [131]:
del dff, df2

In [132]:
df.shape

(48728179, 10)

In [133]:
df.drop_duplicates(inplace=True)

In [134]:
df.shape

(2277584, 10)

In [135]:
df['Virus hosts ID'] = df['Virus hosts ID'].apply(str)

In [136]:
df = (df.groupby('Entry', as_index=False)
       .agg({'Virus hosts':set, #'Protein':'first', 
             'Infects human':'first', 'Species name':'first',
             'Host superkingdom':set,
             'Host kingdom':set,
             'Virus hosts ID':set,
             'Species family':'first',
             'Species taxonomic ID':'first',
             'Sequence': 'first'}))

In [137]:
df['Virus hosts'] = (df['Virus hosts']
                     .swifter.progress_bar(enable=True,
                                           desc='Joining host names list')
                     .apply('; '.join))
df['Virus hosts ID'] = (df['Virus hosts ID']
                        .swifter.progress_bar(enable=True,
                                              desc='Joining host IDs')
                        .apply('; '.join))
df['Host kingdom'] = (df['Host kingdom']
                      .swifter.progress_bar(enable=True,
                                            desc='Joining host kingdom names')
                      .apply('; '.join))
df['Host superkingdom'] = (df['Host superkingdom']
                           .swifter.progress_bar(enable=True,
                                                 desc='Joining host superkingdom names')
                           .apply('; '.join))

Joining host names list:   0%|          | 0/317316 [00:00<?, ?it/s]

Joining host IDs:   0%|          | 0/317316 [00:00<?, ?it/s]

Joining host kingdom names:   0%|          | 0/317316 [00:00<?, ?it/s]

Joining host superkingdom names:   0%|          | 0/317316 [00:00<?, ?it/s]

In [138]:
df.shape

(317316, 10)

In [139]:
df['Sequence'] = df.progress_apply(lambda x: getSequenceFeatures(
    seqObj=x['Sequence'], entry=x['Entry'],
    organism=x['Species name'], status=x['Infects human']), axis=1)

Joining host superkingdom names:   0%|          | 0/317316 [00:00<?, ?it/s]

In [140]:
df['Protein'] = df['Sequence'].apply(lambda x: x.protein_name)

In [141]:
df.sample(3)

Unnamed: 0,Entry,Virus hosts,Infects human,Species name,Host superkingdom,Host kingdom,Virus hosts ID,Species family,Species taxonomic ID,Sequence,Protein
139429,A0A3S5FYS4,Panthera tigris [TaxID: 9694]; Sturnira lilium...,human-true,Influenza A virus,Eukaryota,Metazoa,9666; 27660; 9685; 9721; 9709; 9823; 8782; 969...,Orthomyxoviridae,11320,<zoonosisHelperFunctions.FASTASeq object at 0x...,Nucleoprotein
114041,A0A2P1DR03,Homo sapiens [TaxID: 9606]; Macaca nemestrina ...,human-true,Simian-Human immunodeficiency virus,Eukaryota,Metazoa,9544; 9539; 9545; 9606; 9541,Retroviridae,57667,<zoonosisHelperFunctions.FASTASeq object at 0x...,Envelope glycoprotein gp160
14428,A0A0A1CMQ4,Homo sapiens [TaxID: 9606],human-true,Human immunodeficiency virus 1,Eukaryota,Metazoa,9606,Retroviridae,11676,<zoonosisHelperFunctions.FASTASeq object at 0x...,Envelope glycoprotein gp160


In [142]:
df[df['Infects human'] == 'human-true'].shape

(278789, 11)

In [143]:
df[df['Infects human'] == 'human-false'].shape

(38527, 11)

In [144]:
# Sequences loaded earlier from NCBI Virus ###Add Molecule type
dfff.rename({'Species ID': 'Species taxonomic ID', 'Molecule_type': 'Molecule type'}, axis=1, inplace=True)
dfff.head()

Unnamed: 0,Species,Molecule type,Species taxonomic ID,Host name
0,Epsilonarterivirus zamalb,ssRNA(+),2501966,Chlorocebus [TaxID: 392815]
1,Rodent arterivirus,ssRNA(+),1806636,Eothenomys inez [TaxID: 870526]
2,Wencheng Sm shrew coronavirus,ssRNA(+),1508228,Suncus murinus [TaxID: 9378]
3,Bat coronavirus,ssRNA(+),1508220,Eidolon helvum [TaxID: 77214]
4,NL63-related bat coronavirus strain BtKYNL63-9b,ssRNA(+),2501929,Triaenops afer [TaxID: 549403]


In [145]:
df['Species taxonomic ID'] = df['Species taxonomic ID'].apply(int)

In [146]:
df = df.merge(dfff[['Species taxonomic ID', 'Molecule type']], how='left', on='Species taxonomic ID')

In [147]:
df.shape

(8551317, 12)

In [148]:
df.drop_duplicates(inplace=True)

In [149]:
df.shape

(317316, 12)

In [150]:
del dfff

## Reorganise dataframe

In [151]:
df = df[['Entry', 'Protein', 'Species name', 
         'Species taxonomic ID', 'Species family', 'Virus hosts',
         'Virus hosts ID', 'Host kingdom', 
         'Host superkingdom', 'Molecule type', 'Infects human', 'Sequence']]

In [152]:
df.sample(3)

Unnamed: 0,Entry,Protein,Species name,Species taxonomic ID,Species family,Virus hosts,Virus hosts ID,Host kingdom,Host superkingdom,Molecule type,Infects human,Sequence
6173253,F4MDT2,Nucleoprotein,Influenza A virus,11320,Orthomyxoviridae,Panthera tigris [TaxID: 9694]; Sturnira lilium...,9666; 27660; 9685; 9721; 9709; 9823; 8782; 969...,Metazoa,Eukaryota,ssRNA(-),human-true,<zoonosisHelperFunctions.FASTASeq object at 0x...
7852599,Q9ZXF8,ORF25,Bacillus phage phi105,10717,Siphoviridae,Bacillus subtilis [TaxID: 1423],1423,Bacillus subtilis,Bacteria,,human-false,<zoonosisHelperFunctions.FASTASeq object at 0x...
5438880,C8XMX1,Nucleoprotein,Influenza A virus,11320,Orthomyxoviridae,Panthera tigris [TaxID: 9694]; Sturnira lilium...,9666; 27660; 9685; 9721; 9709; 9823; 8782; 969...,Metazoa,Eukaryota,ssRNA(-),human-true,<zoonosisHelperFunctions.FASTASeq object at 0x...


## Split Dataframe to multiple datasets for testing

In [153]:
df['Host superkingdom'].unique()

array(['Eukaryota', 'Bacteria', 'root', 'Archaea', 'Viruses',
       'root; Eukaryota'], dtype=object)

In [154]:
df['Host kingdom'].unique()

array(['Metazoa', 'Viridiplantae', 'Lactococcus lactis',
       'Escherichia coli', 'Serratia marcescens',
       'Mycolicibacterium smegmatis', 'Bacillus thuringiensis',
       'Trichormus variabilis', 'Listeria monocytogenes',
       'Pseudomonas syringae', 'Metazoa; Viridiplantae',
       'Cronobacter sakazakii', 'Staphylococcus epidermidis',
       'Enterococcus faecium', 'root', 'Caulobacter vibrioides',
       'Vibrio alginolyticus', 'Staphylococcus aureus', 'Bacillus cereus',
       'Ralstonia solanacearum', 'Klebsiella pneumoniae',
       'Staphylococcus xylosus; Staphylococcus aureus',
       'Acinetobacter baumannii', 'Dickeya sp.',
       'Lactobacillus delbrueckii', 'Salmonella', 'Bacillus pumilus',
       'Citrobacter; Citrobacter freundii', 'Mycobacterium',
       'Rhizobium leguminosarum', 'Mesorhizobium loti',
       'Shigella flexneri', 'Yersinia enterocolitica',
       'Idiomarinaceae bacterium N2-2', 'Sulfitobacter sp. CB2047',
       'Lelliottia sp. GL2', 'Clostridi

In [155]:
df[(df['Host kingdom'].str.contains('Viridiplantae')) | df['Virus hosts'].str.contains('[Hh]omo [Ss]apiens')].shape

(284537, 12)

In [156]:
df['Molecule type'] = np.where(df['Molecule type'].isna(), '', df['Molecule type'])

In [157]:
df[df['Molecule type'].isna()]

Unnamed: 0,Entry,Protein,Species name,Species taxonomic ID,Species family,Virus hosts,Virus hosts ID,Host kingdom,Host superkingdom,Molecule type,Infects human,Sequence


In [158]:
df[df['Host kingdom'].str.contains('Metazoa')][df[df['Host kingdom'].str.contains('Metazoa')]['Molecule type'].str.contains('DNA')].shape

(31528, 12)

In [159]:
df[df['Host kingdom'].str.contains('Metazoa')][df[df['Host kingdom'].str.contains('Metazoa')]['Molecule type'].str.contains('RNA')].shape

(273101, 12)

In [160]:
df.shape

(317316, 12)

In [161]:
df[~df['Host kingdom'].str.contains('Metazoa')].shape

(9987, 12)

In [162]:
df[(df['Host superkingdom'].isin(['Bacteria', 'Viruses', 'Archaea'])) | (df['Virus hosts'].str.contains('[Hh]omo [Ss]apiens'))].shape

(283199, 12)

In [163]:
unfiltered = df
metazoa = df[df['Host kingdom'].str.contains('Metazoa')]
plant_human = df[(df['Host kingdom'].str.contains('Viridiplantae')) | df['Virus hosts'].str.contains('[Hh]omo [Ss]apiens')]
NonEukaryote_Human = df[(df['Host superkingdom'].isin(['Bacteria', 'Viruses', 'Archaea'])) | (df['Virus hosts'].str.contains('[Hh]omo [Ss]apiens'))]
DNA_MetazoaZoonosis = metazoa[metazoa['Molecule type'].str.contains('DNA')]
RNA_MetazoaZoonosis = metazoa[metazoa['Molecule type'].str.contains('RNA')]

In [178]:
def check_dist(df):
    true_count = df[df['Infects human'].str.contains('true')].shape[0]
    false_count = df[df['Infects human'].str.contains('false')].shape[0]
    imb = (false_count/true_count)
    print('The minoity class is %.2f of the majority\nhuman-true == %d and human false == %d\n' % (imb, true_count, false_count))

In [179]:
dataframes = [metazoa, unfiltered, plant_human, NonEukaryote_Human, DNA_MetazoaZoonosis, RNA_MetazoaZoonosis]
for dt in dataframes:
    check_dist(dt)

The minoity class is 0.10 of the majority
human-true == 278755 and human false == 28574

The minoity class is 0.14 of the majority
human-true == 278789 and human false == 38527

The minoity class is 0.02 of the majority
human-true == 278789 and human false == 5748

The minoity class is 0.02 of the majority
human-true == 278755 and human false == 4444

The minoity class is 0.29 of the majority
human-true == 24518 and human false == 7010

The minoity class is 0.07 of the majority
human-true == 254072 and human false == 19029



## Random Undersampling of datasets

In [None]:
seed = 960505

In [180]:
# Undersample majority class such that minority class (human-false) is 60% of the majority class (human-true317316)
rus = RandomUnderSampler(sampling_strategy=0.6, random_state=seed)
sampled_dataframes = []
for dt in dataframes:
    clas = dt['Infects human']
#     print('Dataframe before sampling: ', dt.shape[0])
    dt, _ = rus.fit_resample(dt, clas)
    sampled_dataframes.append(dt)
    check_dist(dt)
#     print('Dataframe after sampling: ', dt.shape[0])

The minoity class is 0.60 of the majority
human-true == 47623 and human false == 28574

The minoity class is 0.60 of the majority
human-true == 64211 and human false == 38527

The minoity class is 0.60 of the majority
human-true == 9580 and human false == 5748

The minoity class is 0.60 of the majority
human-true == 7406 and human false == 4444

The minoity class is 0.60 of the majority
human-true == 11683 and human false == 7010

The minoity class is 0.60 of the majority
human-true == 31715 and human false == 19029



## Write file sequences to fasta for feature extraction

In [181]:
metazoaFile = 'MetazoaZoonosis'
plant_humanFile = 'Plant-HumanZoonosis'
unfilteredFile = 'Zoonosis'
NonEukaryote_HumanFile = 'NonEukaryote-Human'
DNA_metazoaFile = 'DNA-MetazoaZoonosis'
RNA_metazoaFile = 'RNA-MetazoaZoonosis'

In [182]:
dirs = ['MetazoaZoonosisData', 'ZoonosisData',
        'Plant-HumanZoonosisData', 'NonEukaryote-HumanData',
        'DNA-MetazoaZoonosisData', 'RNA-MetazoaZoonosisData']
dirs = [os.path.join('../data/', fol) for fol in dirs] # Do not include in script
files = [metazoaFile, unfilteredFile, plant_humanFile, NonEukaryote_HumanFile, DNA_metazoaFile, RNA_metazoaFile]
toSave = list(zip(sampled_dataframes, files, dirs))

In [185]:
for dff, file, folder in toSave:
#    save dataframes as csv
    dff.drop('Sequence', axis=1).to_csv(f'{folder}/{file}Data.csv.gz', index=False, compression='gzip')
    
#    Create subdirectories
    os.makedirs(os.path.join(folder, 'train/human-true'), exist_ok=True)
    os.makedirs(os.path.join(folder, 'test/human-true'), exist_ok=True)
    os.makedirs(os.path.join(folder, 'train/human-false'), exist_ok=True)
    os.makedirs(os.path.join(folder, 'test/human-false'), exist_ok=True)
#    Split data to train and test data
    train, test = train_test_split(dff, test_size=0.2, random_state=) # Will further split 15% of train as validation during training
#    Save test and train sequences
    save_sequences(train, f'{folder}/train/Sequences') # Will move to subdirectories after feature extraction
    save_sequences(test, f'{folder}/test/Sequences')
    
    print('Done with', folder)

Done with ../data/MetazoaZoonosisData
Done with ../data/ZoonosisData
Done with ../data/Plant-HumanZoonosisData
Done with ../data/NonEukaryote-HumanData
Done with ../data/DNA-MetazoaZoonosisData
Done with ../data/RNA-MetazoaZoonosisData
