In [1]:
import pandas as pd
import numpy as np

In [2]:
# SEE: https://zenodo.org/record/4632335#.YkGzOShBz4c
filename = '../data/GlobalAlienSpeciesFirstRecordDatabase_v2.xlsx'

df_orig = pd.read_excel(filename, sheet_name=None)

In [3]:
# There are 3 sheets of the excel file
print(df_orig.keys())
# Get the correct one
df = df_orig['GlobalAlienSpeciesFirstRecordDa']
df = df[df['LifeForm'] == 'Viruses']
df = df[df['FirstRecord'] > 1950]

dict_keys(['ReadMe', 'GlobalAlienSpeciesFirstRecordDa', 'References'])


In [4]:
df.info(show_counts=True)
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44 entries, 61521 to 61570
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   TaxonName         44 non-null     object
 1   scientificName    44 non-null     object
 2   Family            31 non-null     object
 3   Order             29 non-null     object
 4   Class             28 non-null     object
 5   Phylum            27 non-null     object
 6   Taxonomy          27 non-null     object
 7   OrigName          44 non-null     object
 8   LifeForm          44 non-null     object
 9   Region            44 non-null     object
 10  Island            7 non-null      object
 11  PresentStatus     1 non-null      object
 12  FirstRecord       44 non-null     int64 
 13  FirstRecord_orig  44 non-null     object
 14  Source            44 non-null     object
dtypes: int64(1), object(14)
memory usage: 5.5+ KB


Unnamed: 0,TaxonName,scientificName,Family,Order,Class,Phylum,Taxonomy,OrigName,LifeForm,Region,Island,PresentStatus,FirstRecord,FirstRecord_orig,Source
61521,Beet necrotic yellow vein virus,Beet necrotic yellow vein virus (BNYVV),,,,,,Beet necrotic yellow vein virus (BNYVV),Viruses,China,,,1978,1978,Xu et al. (2012)
61522,Broad bean strain virus,Broad bean strain virus (BBSV),,,,,,Broad bean strain virus (BBSV),Viruses,China,,,1998,1998,Xu et al. (2012)
61523,Cassava mosaic disease,Cassava mosaic disease,,,,,,Cassava mosaic disease,Viruses,Kenya,,,1994,1994,CABI ISC
61524,Cucumber green mottle mosaic virus,Cucumber green mottle mosaic virus (CGMMV),,,,,,Cucumber green mottle mosaic virus (CGMMV),Viruses,China,,,2005,2005,Xu et al. (2012)
61525,East Asian Passiflora virus,East Asian Passiflora virus,Potyviridae,Patatavirales,Stelpaviricetes,Pisuviricota,GBIF,East Asian Passiflora virus,Viruses,Taiwan,yes,,1980,1980,CABI ISC


In [5]:
family = df['LifeForm'].unique()
region = df['Region'].unique()

print(f'There are {len(family)} families and {len(region)} regions')
species = df['TaxonName'].unique()

print(f'There are {len(species)} species.')
for p in family:
    df_s = df[df['LifeForm'] == p]
    animals = df_s['TaxonName'].unique()
    print(f'{p}[{len(animals)}]', end=', ')


There are 1 families and 31 regions
There are 18 species.
Viruses[18], 

In [7]:
# Filter out irrelevant data
spec_iter = np.zeros(len(species), dtype=int)

for idx, s in enumerate(species):
    df_s = df[df['TaxonName'] == s]
    # n_iter = len(df_s)
    spec_iter[idx] = len(df_s)
    # print(f'The species {s} has {spec_iter[idx]} iteractions')
    
    # print(df_s)
filter = spec_iter[0]



The species Beet necrotic yellow vein virus has 1 iteractions
The species Broad bean strain virus has 1 iteractions
The species Cassava mosaic disease has 1 iteractions
The species Cucumber green mottle mosaic virus has 1 iteractions
The species East Asian Passiflora virus has 1 iteractions
The species genus/Begomovirus, species/Tomato yellow leaf curl virus pv. Israel has 1 iteractions
The species genus/Potyvirus, species/Papaya ringspot virus biotype P has 1 iteractions
The species genus/Tospovirus, species/Tomato spotted wilt virus has 1 iteractions
The species Impatiens necrotic spot virus has 1 iteractions
The species Infectious hematopoietic necrosis virus has 1 iteractions
The species Infectious pancreatic necrosis virus has 1 iteractions
The species Lymphocystis disease virus has 1 iteractions
The species Plum pox virus has 26 iteractions
The species Poplar mosaic virus has 1 iteractions
The species Prunus necrotic ringspot ilarvirus has 1 iteractions
The species Rhabdovirus ca

In [40]:
# select the species that have less than 3 iteractions
idx = np.where(spec_iter < 3)
# test = df

# Filter the dataset: remove all rows belonging to species with less than 3 iteractions
for s in species[idx]:
    # print(s)  
    df = df.drop(df[df['TaxonName'] == s].index)

print(len(df['TaxonName'].unique()))

1


In [14]:
# Buld matrix
n_s = len(species)
n_r = len(region)

t_min = df['FirstRecord'].min()
t_max = df['FirstRecord'].max()
time = np.arange(t_min, t_max, 2)

p = n_s + n_r
M = np.zeros((len(time), n_s, n_r))
# M = np.zeros((len(time), p, p)) # if full

for i, t in enumerate(time):
    df_now = df[(df['FirstRecord'] >= t) & (df['FirstRecord'] < t+1)]
    for index, row in df_now.iterrows():
        s = row['TaxonName']
        r = row['Region']
        # print(f'Species {s} invaded region {r} at time {t}')

        s_idx = np.where(species == s)
        r_idx = np.where(region == r)
        M[i, s_idx, r_idx] = 1
M.shape

(24, 18, 31)

In [45]:
print(f'There are in total {np.sum(M)} iteraction.')

idx = np.array([5,  8,  9, 13, 17, 18, 20, 24, 25, 29, 30])
idx = idx - 1
most_relevant_spec = np.sum(M[:, 12, idx])
print(f'Of which {most_relevant_spec} are given by species n 12 {species[12]}')

There are in total 17.0 iteraction.
Of which 11.0 are given by species n 12 Plum pox virus


In [12]:
# Export data and then import it into R
print(n_s, n_r)
M = M.reshape(len(time), n_s*n_r)
# M = M.reshape(len(time), (p)**2) # if full matix
print(f's = {n_s}, r = {n_r}')
print(M.shape)
np.save('matrix.npy', M)

18 31
s = 18, r = 31
(24, 558)


In [11]:
# TODO we should use sparse matrices for the data

import scipy.sparse

rows = [2, 236, 246, 389, 1691]
cols = [117, 3, 34, 2757, 74, 1635, 52]
prod = [(x, y) for x in rows for y in cols] # combinations
r = [x for (x, y) in prod] # x_coordinate
c = [y for (x, y) in prod] # y_coordinate
data = [1] * len(r)
m = scipy.sparse.coo_matrix((data, (r, c)), shape=(100000, 40000))