In [None]:
# Print the kernel's working directory; the relative '../raw_data/...' paths
# used by the cells below are resolved against it.
!pwd

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Read the GBIF occurrence extract and preview its first rows.
data = pd.read_csv(
    filepath_or_buffer='../raw_data/gbif/occurences_100000.csv',
)
data.head(n=5)

In [None]:
# Work on a copy so the raw `data` frame stays untouched.
coordinates = data.copy()
# taxonKey is stored as text because it is later joined into strings
# that are fed to CountVectorizer.
coordinates['taxonKey'] = coordinates['taxonKey'].astype('string')

# Pair latitude and longitude into one hashable tuple per row; this tuple is
# the group key used further down. zip over the two columns avoids the
# per-row Python call that DataFrame.apply(axis=1) makes.
coordinates['coordinates'] = list(zip(coordinates['latitude'], coordinates['longitude']))
coordinates.head()

In [None]:
# For every distinct coordinates tuple, collect the taxonKeys recorded there
# into a list.
grouped = (
    coordinates
    .groupby(by=['coordinates'])['taxonKey']
    .apply(list)
)

In [None]:
# Wrap the grouped Series in a one-column DataFrame
# (index: coordinates, column: taxonKey) and preview it.
grouped = pd.DataFrame(data=grouped)
grouped.head(n=5)

In [None]:
# Build one space-separated string of taxonKeys per location, keeping the
# row order of `grouped`.
temp = [' '.join(taxon_keys) for taxon_keys in grouped['taxonKey']]

In [None]:
# Count how often each taxonKey occurs in every location string.
# - tokenizer=str.split: tokens are the whitespace-separated keys. This is the
#   same split the previous lambda performed, but str.split can be pickled.
# - token_pattern=None: the default regex is not used once a tokenizer is
#   supplied; leaving it set makes scikit-learn emit a UserWarning on fit.
vectorizer = CountVectorizer(tokenizer=str.split, token_pattern=None)

# Sparse matrix: one row per location string, one column per distinct taxonKey.
temp = vectorizer.fit_transform(temp)

In [None]:
# Densify the sparse count matrix into a DataFrame whose columns are the
# vectorizer's feature names (the taxonKeys), then preview it.
temp = pd.DataFrame(
    data=temp.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
temp.head(n=5)

In [None]:
# Replace the index with a fresh 0..n-1 range, discarding the previous one.
coordinates = coordinates.reset_index(drop=True)
coordinates.head(n=5)

In [None]:
# `temp` has one row per unique location (in the row order of `grouped`),
# whereas `coordinates` has one row per occurrence. A plain index join would
# pair occurrence i with location i -- two unrelated rows -- and leave NaN for
# every occurrence beyond the number of locations. Attach the location keys to
# the counts and merge on latitude/longitude instead, so each occurrence
# receives the taxonKey counts of its own location.
location_keys = pd.DataFrame(grouped.index.tolist(), columns=['latitude', 'longitude'])
location_counts = pd.concat([location_keys, temp], axis=1)
merged = coordinates.merge(location_counts, on=['latitude', 'longitude'], how='left')

In [None]:
# Remove the gbifID, taxonKey and coordinates columns, then report the
# remaining shape.
merged = merged.drop(['gbifID', 'taxonKey', 'coordinates'], axis='columns')
merged.shape

In [None]:
# Show the dtype of every column in the merged frame.
merged.dtypes

### Testing the function

In [3]:
# Start again from the raw occurrence file for a clean run of the encoding steps.
coordinates = pd.read_csv(
    filepath_or_buffer='../raw_data/gbif/occurences_100000.csv',
)

In [4]:
# Add a `coordinates` column holding one (latitude, longitude) tuple per row.
# zip over the two columns replaces a row-wise apply(axis=1), which calls a
# Python lambda once per row.
coordinates['coordinates'] = list(zip(coordinates['latitude'], coordinates['longitude']))

In [5]:
# Store taxonKey as text so the keys can be joined into strings and vectorized later.
coordinates = coordinates.astype({'taxonKey': 'string'})
coordinates.shape

(100000, 5)

In [6]:
# For every distinct coordinates tuple, gather its taxonKeys into a list and
# keep the result as a one-column DataFrame.
temp = (
    coordinates
    .groupby(by=['coordinates'])['taxonKey']
    .apply(list)
    .to_frame()
)
temp.head(n=5)

Unnamed: 0_level_0,taxonKey
coordinates,Unnamed: 1_level_1
"(47.27662, 10.177038)",[2985949]
"(47.280147, 10.19891)",[2864397]
"(47.285133, 10.157104)",[2820517]
"(47.287818, 10.252126)",[7931979]
"(47.292442, 10.185017)",[7415437]


In [7]:
# Turn each location's taxonKey list into one space-separated string,
# the input format expected by the vectorizer below.
temp = [' '.join(taxon_keys) for taxon_keys in temp['taxonKey']]

In [8]:
# Initialize CountVectorizer and apply it to the taxonKey strings.
# - tokenizer=str.split: tokens are the whitespace-separated keys. This is the
#   same split the previous lambda performed, but str.split can be pickled.
# - token_pattern=None: the default regex is not used once a tokenizer is
#   supplied; leaving it set makes scikit-learn emit a UserWarning on fit.
vectorizer = CountVectorizer(tokenizer=str.split, token_pattern=None)
temp = vectorizer.fit_transform(temp)

In [9]:
# Densify the sparse count matrix into a DataFrame whose columns are the
# vectorizer's feature names (the taxonKeys), then preview it.
temp = pd.DataFrame(
    data=temp.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
temp.head(n=5)

Unnamed: 0,10010617,10055129,10055902,10071055,10096320,10178099,10220564,10242289,10356062,10368545,...,9605163,9606396,9624496,9631834,9677963,9689880,9823072,9823314,9823570,9868510
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Give `coordinates` a fresh 0..n-1 index ahead of combining it with the
# CountVectorizer output.
coordinates = coordinates.reset_index(drop=True)
coordinates.shape

(100000, 5)

In [11]:
# Remove the gbifID, taxonKey and coordinates columns from the occurrence frame.
coordinates = coordinates.drop(['gbifID', 'taxonKey', 'coordinates'], axis='columns')

In [12]:
# `temp` holds one row per unique (latitude, longitude) pair, in the order
# produced by the groupby above, while `coordinates` holds one row per
# occurrence. Joining on the positional index pairs unrelated rows and fills
# every occurrence beyond the number of locations with NaN.
# Rebuild the location keys and merge on them instead.
# NOTE(review): this assumes groupby's default key sorting of the tuples equals
# sorting by latitude, then longitude -- confirm if coordinates can be missing.
location_keys = (
    coordinates[['latitude', 'longitude']]
    .drop_duplicates()
    .sort_values(['latitude', 'longitude'])
    .reset_index(drop=True)
)
# Fail loudly rather than silently misalign if the key count does not match.
if len(location_keys) != len(temp):
    raise ValueError('number of unique locations does not match the number of encoded rows')
location_counts = pd.concat([location_keys, temp.reset_index(drop=True)], axis=1)
merged = coordinates.merge(location_counts, on=['latitude', 'longitude'], how='left')

In [13]:
# Report (rows, columns) of the merged frame.
merged.shape

(100000, 3548)

In [14]:
# Preview the occurrence frame after the column drop above.
coordinates.head()

Unnamed: 0,latitude,longitude
0,50.746209,12.543826
1,51.355854,6.327739
2,53.586819,8.963205
3,48.358233,10.949524
4,53.209709,10.39207


In [1]:
!pip freeze

absl-py==1.0.0
affine==2.3.1
aiohttp==3.8.1
aiosignal==1.2.0
alembic==1.4.1
anyio==3.6.1
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
astroid==2.11.5
asttokens==2.0.5
astunparse==1.6.3
async-timeout==4.0.2
attrs==21.4.0
Babel==2.10.1
backcall==0.2.0
beautifulsoup4==4.11.1
-e git+ssh://git@github.com/TmtStss/biodiversipy.git@175090a6224152a4fcb8ccad26b45b68e51c3215#egg=biodiversipy
black==21.12b0
bleach==5.0.0
cachetools==5.2.0
certifi==2022.5.18.1
cffi==1.15.0
charset-normalizer==2.0.12
click==8.1.3
click-plugins==1.1.1
cligj==0.7.2
cloudpickle==2.1.0
commonmark==0.9.1
coverage==6.2
cryptography==37.0.2
cycler==0.11.0
databricks-cli==0.16.6
debugpy==1.6.0
decorator==5.1.1
defusedxml==0.7.1
dill==0.3.5.1
docker==5.0.3
docutils==0.18.1
earthengine-api==0.1.311
entrypoints==0.4
executing==0.8.3
fastjsonschema==2.15.3
flake8==4.0.1
Flask==2.1.2
flatbuffers==2.0
fonttools==4.33.3
frozenlist==1.3.0
fsspec==2022.5.0
future==0.18.2
gast==0.4

In [19]:
# Load the encoded 1000-row file and report its shape.
# NOTE(review): presumably this file is the output of the function under test -- confirm.
check = pd.read_csv(
    filepath_or_buffer='../raw_data/gbif/occurences_1000_encoded.csv',
)
check.shape

(1000, 507)

In [20]:
# Preview the first rows of the file loaded into `check`.
check.head()

Unnamed: 0.1,Unnamed: 0,latitude,longitude,10708887,11071158,2642422,2648701,2650625,2685796,2686212,...,9151957,9172281,9182154,9206251,9220780,9349855,9458333,9483674,9485490,9490132
0,0,50.955611,14.069593,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,51.743824,9.997902,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,50.42791,7.911031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,50.932696,6.843283,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,49.201984,8.114405,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Load the 1000-row coordinates file and preview its first rows.
check = pd.read_csv(
    filepath_or_buffer='../raw_data/gbif/coordinates_1000.csv',
)
check.head(n=5)

Unnamed: 0.1,Unnamed: 0,latitude,longitude
0,0,50.955611,14.069593
1,1,51.743824,9.997902
2,2,50.42791,7.911031
3,3,50.932696,6.843283
4,4,49.201984,8.114405
