In [17]:
import pandas as pd
import numpy as np

In [18]:
# Data source (Zenodo record 4632335):
# https://zenodo.org/record/4632335#.YkGzOShBz4c
filename = 'GlobalAlienSpeciesFirstRecordDatabase_v2.xlsx'

# sheet_name=None asks pandas for every sheet of the workbook at once; the
# result is a dict mapping sheet name -> DataFrame rather than a single frame.
df_orig = pd.read_excel(
    io=filename,
    sheet_name=None,
)

In [19]:
# The workbook was loaded as a dict of sheets; show which sheet names exist.
print(df_orig.keys())
# Take the data sheet, then narrow it in two steps: rows whose LifeForm is
# 'Viruses', and of those only the ones with FirstRecord after 1950.
records = df_orig['GlobalAlienSpeciesFirstRecordDa']
viruses = records.loc[records['LifeForm'] == 'Viruses']
df = viruses.loc[viruses['FirstRecord'] > 1950]

dict_keys(['ReadMe', 'GlobalAlienSpeciesFirstRecordDa', 'References'])


In [20]:
# Print a structural summary of the filtered frame: one line per column with
# its dtype and (because show_counts=True) its non-null count.
df.info(show_counts=True)
# Last expression of the cell, so the notebook renders the first rows as a table.
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44 entries, 61521 to 61570
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   TaxonName         44 non-null     object
 1   scientificName    44 non-null     object
 2   Family            31 non-null     object
 3   Order             29 non-null     object
 4   Class             28 non-null     object
 5   Phylum            27 non-null     object
 6   Taxonomy          27 non-null     object
 7   OrigName          44 non-null     object
 8   LifeForm          44 non-null     object
 9   Region            44 non-null     object
 10  Island            7 non-null      object
 11  PresentStatus     1 non-null      object
 12  FirstRecord       44 non-null     int64 
 13  FirstRecord_orig  44 non-null     object
 14  Source            44 non-null     object
dtypes: int64(1), object(14)
memory usage: 5.5+ KB


Unnamed: 0,TaxonName,scientificName,Family,Order,Class,Phylum,Taxonomy,OrigName,LifeForm,Region,Island,PresentStatus,FirstRecord,FirstRecord_orig,Source
61521,Beet necrotic yellow vein virus,Beet necrotic yellow vein virus (BNYVV),,,,,,Beet necrotic yellow vein virus (BNYVV),Viruses,China,,,1978,1978,Xu et al. (2012)
61522,Broad bean strain virus,Broad bean strain virus (BBSV),,,,,,Broad bean strain virus (BBSV),Viruses,China,,,1998,1998,Xu et al. (2012)
61523,Cassava mosaic disease,Cassava mosaic disease,,,,,,Cassava mosaic disease,Viruses,Kenya,,,1994,1994,CABI ISC
61524,Cucumber green mottle mosaic virus,Cucumber green mottle mosaic virus (CGMMV),,,,,,Cucumber green mottle mosaic virus (CGMMV),Viruses,China,,,2005,2005,Xu et al. (2012)
61525,East Asian Passiflora virus,East Asian Passiflora virus,Potyviridae,Patatavirales,Stelpaviricetes,Pisuviricota,GBIF,East Asian Passiflora virus,Viruses,Taiwan,yes,,1980,1980,CABI ISC


In [21]:
# NOTE: `family` holds the distinct values of the 'LifeForm' column (not the
# taxonomic 'Family' column); the printed word "families" refers to those.
family = df['LifeForm'].unique()
region = df['Region'].unique()

print(f'There are {len(family)} families and {len(region)} regions')
species = df['TaxonName'].unique()

print(f'There are {len(species)} species.')
# For each LifeForm value, report how many distinct TaxonName values it has.
for life_form in family:
    in_group = df['LifeForm'] == life_form
    n_taxa = len(df.loc[in_group, 'TaxonName'].unique())
    print(f'{life_form}[{n_taxa}]', end=', ')


There are 1 families and 31 regions
There are 18 species.
Viruses[18], 

In [41]:
# Build the presence tensor M[time bin, species, region]: an entry is 1 when
# the data contain a first record of that species in that region whose year
# falls inside that time bin, and 0 otherwise.
n_s = len(species)
n_r = len(region)

# Width of one time bin in years. It is used both as the spacing between bin
# starts and as the bin width, so the bins tile [t_min, t_max] without gaps.
# (Previously the starts were 2 years apart but each bin only selected one
# year, which silently discarded the records of every other year.)
time_step = 2

t_min = df['FirstRecord'].min()
t_max = df['FirstRecord'].max()
# np.arange excludes its stop value; stopping at t_max + 1 guarantees that the
# most recent records also fall inside a bin.
time = np.arange(t_min, t_max + 1, time_step)

p = n_s + n_r
M = np.zeros((len(time), n_s, n_r))
# M = np.zeros((len(time), p, p)) # if full

# Position of every species / region along its axis of M. A label missing from
# `species` / `region` raises KeyError instead of being silently skipped.
species_pos = {name: k for k, name in enumerate(species)}
region_pos = {name: k for k, name in enumerate(region)}

for year, s, r in zip(df['FirstRecord'], df['TaxonName'], df['Region']):
    # Bin i covers the years [time[i], time[i] + time_step).
    i = int((year - t_min) // time_step)
    # print(f'Species {s} invaded region {r} at time {time[i]}')
    M[i, species_pos[s], region_pos[r]] = 1
M.shape

Species Beet necrotic yellow vein virus invaded region China at time 1961
Species Broad bean strain virus invaded region China at time 1961
Species Cassava mosaic disease invaded region Kenya at time 1961
Species Cucumber green mottle mosaic virus invaded region China at time 1961
Species East Asian Passiflora virus invaded region Taiwan at time 1961
Species genus/Begomovirus, species/Tomato yellow leaf curl virus pv. Israel invaded region Japan at time 1961
Species genus/Potyvirus, species/Papaya ringspot virus biotype P invaded region Japan at time 1961
Species genus/Tospovirus, species/Tomato spotted wilt virus invaded region Japan at time 1961
Species Impatiens necrotic spot virus invaded region China at time 1961
Species Infectious hematopoietic necrosis virus invaded region China at time 1961
Species Infectious pancreatic necrosis virus invaded region China at time 1961
Species Lymphocystis disease virus invaded region China at time 1961
Species Plum pox virus invaded region Alba

(24, 18, 31)

In [42]:
# Flatten every time slice of M into a single row and write the result to
# disk; the saved file is meant to be imported into R afterwards.
print(n_s, n_r)
flat_shape = (len(time), n_s * n_r)
# flat_shape = (len(time), (p)**2) # if full matrix
M = M.reshape(flat_shape)
print(f's = {n_s}, r = {n_r}')
print(M.shape)
np.save(file='matrix_full.npy', arr=M)

18 31
s = 18, r = 31
(24, 558)


In [9]:
# TODO we should use sparse matrices for the data

import scipy.sparse

rows = [2, 236, 246, 389, 1691]
cols = [117, 3, 34, 2757, 74, 1635, 52]

# Coordinates of the full rows x cols grid in row-major order: each row index
# is repeated once per column, while the column list is tiled once per row.
r = [x for x in rows for _ in cols] # x_coordinate
c = cols * len(rows) # y_coordinate
prod = list(zip(r, c)) # combinations

# One stored entry of value 1 per (row, col) pair, in COO (coordinate) format.
data = [1] * len(prod)
m = scipy.sparse.coo_matrix((data, (r, c)), shape=(100000, 40000))