In [14]:
import pandas as pd
import numpy as np

In [15]:
# SEE: https://zenodo.org/record/4632335#.YkGzOShBz4c
filename = '../data/GlobalAlienSpeciesFirstRecordDatabase_v2.xlsx'

df_orig = pd.read_excel(filename, sheet_name=None)

In [4]:
# There are 3 sheets of the excel file
print(df_orig.keys())
# Get the correct one
df = df_orig['GlobalAlienSpeciesFirstRecordDa']
df = df[(df['LifeForm'] == 'Viruses') & (df['LifeForm'] == 'Viruses')]
df = df[df['FirstRecord'] > 1950]

dict_keys(['ReadMe', 'GlobalAlienSpeciesFirstRecordDa', 'References'])


In [9]:
df.info(show_counts=True)
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16377 entries, 8 to 61741
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   TaxonName         16377 non-null  object
 1   scientificName    16377 non-null  object
 2   Family            16125 non-null  object
 3   Order             16269 non-null  object
 4   Class             16301 non-null  object
 5   Phylum            16350 non-null  object
 6   Taxonomy          15984 non-null  object
 7   OrigName          16377 non-null  object
 8   LifeForm          16364 non-null  object
 9   Region            16377 non-null  object
 10  Island            4575 non-null   object
 11  PresentStatus     10494 non-null  object
 12  FirstRecord       16377 non-null  int64 
 13  FirstRecord_orig  16377 non-null  object
 14  Source            16377 non-null  object
dtypes: int64(1), object(14)
memory usage: 2.0+ MB


Unnamed: 0,TaxonName,scientificName,Family,Order,Class,Phylum,Taxonomy,OrigName,LifeForm,Region,Island,PresentStatus,FirstRecord,FirstRecord_orig,Source
8,Achnanthes pseudogroenlandica,"Achnanthes pseudogroenlandica Hendey, 1964",Achnanthaceae,Achnanthales,Bacillariophyceae,Ochrophyta,GBIF,Achnanthes pseudogroenlandica,Algae,Bulgaria,,,1984,1984,AquaNIS (2020)
9,Achnanthes pseudogroenlandica,"Achnanthes pseudogroenlandica Hendey, 1964",Achnanthaceae,Achnanthales,Bacillariophyceae,Ochrophyta,GBIF,Achnanthes pseudogroenlandica,Algae,Romania,,,1984,1984,AquaNIS (2020)
10,Achnanthes pseudogroenlandica,"Achnanthes pseudogroenlandica Hendey, 1964",Achnanthaceae,Achnanthales,Bacillariophyceae,Ochrophyta,GBIF,Achnanthes pseudogroenlandica,Algae,Ukraine,,,1984,1984,DAISIE
32,Agardhiella subulata,Agardhiella subulata (C.Agardh) Kraft & M.J.Wynne,Solieriaceae,Gigartinales,Florideophyceae,Rhodophyta,GBIF,Agardhiella subulata,Algae,France,,established,1984,1984,DAISIE
33,Agardhiella subulata,Agardhiella subulata (C.Agardh) Kraft & M.J.Wynne,Solieriaceae,Gigartinales,Florideophyceae,Rhodophyta,GBIF,Agardhiella subulata,Algae,Italy,,,1987,1987,DAISIE


In [6]:
family = df['LifeForm'].unique()
region = df['Region'].unique()

print(f'There are {len(family)} families and {len(region)} regions')
species = df['TaxonName'].unique()

print(f'There are {len(species)} species.')
for p in family:
    df_s = df[df['LifeForm'] == p]
    animals = df_s['TaxonName'].unique()
    print(f'{p}[{len(animals)}]', end=', ')


There are 19 families and 268 regions
There are 15615 species.
Algae[518], Amphibians[77], Arthropods p.p. (Myriapods, Diplopods etc.)[139], Bacteria and protozoans[28], Birds[744], Bryophytes[91], Bryozoa[101], Crustaceans[595], Fishes[748], Fungi[326], Insects[3773], Invertebrates (excl. Arthropods, Molluscs)[685], Mammals[169], Molluscs[549], Reptiles[166], Spiders[34], Vascular plants[6734], Viruses[18], nan[0], 

In [7]:
# TODO This is really really really slow. A better way to do it?
# Filter out irrelevant data
spec_iter = np.zeros(len(species), dtype=int)

for idx, s in enumerate(species):
    df_s = df[df['TaxonName'] == s]
    # n_iter = len(df_s)
    spec_iter[idx] = len(df_s)
    # print(f'The species {s} has {spec_iter[idx]} iteractions')

In [8]:
# TODO This is really really really slow. A better way to do it?
# select the species that have less than 3 iteractions
idx = np.where(spec_iter < 3)
# test = df

# Filter the dataset: remove all rows belonging to species with less than 3 iteractions
for s in species[idx]:
    # print(s)  
    df = df.drop(df[df['TaxonName'] == s].index)

print(len(df['TaxonName'].unique()))

2828


In [10]:
# Buld matrix
n_s = len(species)
n_r = len(region)

t_min = df['FirstRecord'].min()
t_max = df['FirstRecord'].max()
time = np.arange(t_min, t_max, 2)

p = n_s + n_r
M = np.zeros((len(time), n_s, n_r))
# M = np.zeros((len(time), p, p)) # if full

for i, t in enumerate(time):
    df_now = df[(df['FirstRecord'] >= t) & (df['FirstRecord'] < t+1)]
    for index, row in df_now.iterrows():
        s = row['TaxonName']
        r = row['Region']
        # print(f'Species {s} invaded region {r} at time {t}')

        s_idx = np.where(species == s)
        r_idx = np.where(region == r)
        M[i, s_idx, r_idx] = 1
M.shape

(35, 15615, 268)

In [13]:
# Export data and then import it into R
print(n_s, n_r)
M = M.reshape(len(time), n_s*n_r)
# M = M.reshape(len(time), (p)**2) # if full matix
print(f's = {n_s}, r = {n_r}')
print(M.shape)
np.save('../data/matrix.npy', M)

15615 268
s = 15615, r = 268
(35, 4184820)


In [None]:
print(f'There are in total {np.sum(M)} iteraction.')

# This was done for really specific case (only viruses)
# We were studying the results of the R algorithm
idx = np.array([5,  8,  9, 13, 17, 18, 20, 24, 25, 29, 30])
idx = idx - 1
most_relevant_spec = np.sum(M[:, 12, idx])
print(f'Of which {most_relevant_spec} are given by species n 12 {species[12]}')x

In [11]:
# TODO we should use sparse matrices for the data

import scipy.sparse

rows = [2, 236, 246, 389, 1691]
cols = [117, 3, 34, 2757, 74, 1635, 52]
prod = [(x, y) for x in rows for y in cols] # combinations
r = [x for (x, y) in prod] # x_coordinate
c = [y for (x, y) in prod] # y_coordinate
data = [1] * len(r)
m = scipy.sparse.coo_matrix((data, (r, c)), shape=(100000, 40000))