In [17]:
import pandas as pd 
import numpy as np
from os import path
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
!pwd

/Users/mikkelvaldemarkoch/code/MikkelValdemar/biodiversipy/notebooks


In [10]:
# Root folder for raw inputs, relative to the current working directory.
raw_data_path = path.join('..', 'raw_data')
# Path of the GBIF export. The file name is spelled out here because no
# module-level `csv` variable exists; 'germany.csv' matches the default
# `csv` argument of clean_occurences.
source_path = path.join(raw_data_path, 'gbif', 'germany.csv')

In [36]:
# Bounding box passed as `coords` to clean_occurences; the limits are compared
# (inclusively) against the latitude/longitude columns of the GBIF data.
coords_germany = dict(
    lon_lower=5.7,
    lat_lower=47.1,
    lon_upper=15.4,
    lat_upper=55.1,
)

In [33]:
def clean_occurences(csv='germany.csv', n = 0, coords=False, random_state=None):
    """Clean a GBIF occurrence export and write occurrence and metadata csv files.

    The tab-separated file ``<raw_data_path>/gbif/<csv>`` is loaded, reduced to a
    fixed set of columns, de-duplicated on (latitude, longitude, taxonKey),
    optionally restricted to a bounding box and optionally down-sampled.
    Relies on the module-level ``raw_data_path`` variable.

    Parameters
    ----------
    csv : str
        File name of the GBIF export inside ``<raw_data_path>/gbif``.
    n : int
        Number of rows to sample. A falsy value (default ``0``) keeps every row
        and produces un-suffixed output names.
    coords : dict or False
        Bounding box with the keys 'lat_lower', 'lat_upper', 'lon_lower' and
        'lon_upper' (bounds are inclusive). A falsy value disables the filter.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so that a sample can be reproduced.
        The default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        ``data_final`` with the columns gbifID, latitude, longitude, taxonKey,
        and ``metadata`` with every selected column except latitude/longitude.

    Side effects
    ------------
    Writes ``occurrences<suffix>.csv`` and ``metadata<suffix>.csv`` into
    ``<raw_data_path>/gbif/occurrences<suffix>``, creating the folder if needed.
    """
    # Local import: only `os.path` is imported at the top of the notebook, so
    # the bare name `os` would otherwise be undefined when creating the folder.
    import os

    source_path = path.join(raw_data_path, 'gbif', csv)

    # Load data into pd.DataFrame
    data = pd.read_csv(source_path, sep = '\t', low_memory = False)

    # Keep useful columns
    selected_columns = ['gbifID', 'datasetKey', 'kingdom', 'phylum', 'class','order', 'family',
                        'genus', 'species', 'scientificName', 'decimalLatitude', 'decimalLongitude',
                        'day', 'month', 'year', 'taxonKey', 'license']

    data_cleaned = data[selected_columns]

    # Drop duplicates based on lat, lon, taxonKey
    data_cleaned = data_cleaned.drop_duplicates(subset = ['decimalLatitude', 'decimalLongitude', 'taxonKey'], keep = 'first')

    # Rename coordinates column
    data_cleaned = data_cleaned.rename(columns = {'decimalLatitude': 'latitude', 'decimalLongitude': 'longitude'})

    # Drop observations outside the bounding box coordinates
    if coords:
        mask = (data_cleaned['latitude'] >= coords['lat_lower']) & \
               (data_cleaned['latitude'] <= coords['lat_upper']) & \
               (data_cleaned['longitude'] >= coords['lon_lower']) & \
               (data_cleaned['longitude'] <= coords['lon_upper'])

        data_cleaned = data_cleaned[mask]

    # Sample n rows
    # NOTE(review): the suffix is the raw number (e.g. '_1000'), whereas
    # encode_taxonKey in this notebook looks for get_suffix(n)-style names
    # (e.g. '_1k') - confirm which naming is intended.
    suffix = ''
    if n:
        data_cleaned = data_cleaned.sample(n, random_state=random_state)
        suffix = '_' + str(n)

    # Splitting occurences data and metadata
    gbifID = ['gbifID']
    taxonKey = ['taxonKey']
    coordinates = ['latitude', 'longitude']
    data_final = data_cleaned[gbifID + coordinates + taxonKey]
    metadata = data_cleaned.drop(columns = coordinates)

    # Create output directory (no error if it already exists)
    output_path = path.join(raw_data_path,'gbif', 'occurrences' + suffix)
    os.makedirs(output_path, exist_ok=True)

    # Write occurences csv
    filename = 'occurrences' + suffix + '.csv'
    destination_path = path.join(output_path, filename)
    data_final.to_csv(destination_path, index=False)

    # Write metadata csv
    filename = 'metadata' + suffix + '.csv'
    destination_path = path.join(output_path, filename)
    metadata.to_csv(destination_path, index=False)
    return data_final, metadata


In [37]:
# Clean the Germany export, keep rows inside the coords_germany bounding box,
# sample 1000 of them, then display the resulting occurrences frame.
final, meta = clean_occurences(csv='germany.csv', n = 1000, coords=coords_germany)
final

Hi


KeyboardInterrupt: 

In [18]:
def get_suffix(n):
    """Return the file-name suffix for a sample size ``n``.

    Values below 1,000 are written out in full ('_999'), values below
    1,000,000 are floor-divided into thousands ('_12k') and anything larger
    is floor-divided into millions ('_3m').
    """
    if n < 1_000:
        return '_' + str(n)
    if n < 1_000_000:
        thousands = n // 1_000
        return '_' + str(thousands) + 'k'
    millions = n // 1_000_000
    return '_' + str(millions) + 'm'

In [136]:
def encode_taxonKey(raw_data_path, n, from_csv = True, to_csv = True, data = None):
    """
    Encode the taxonKeys observed at each unique location as count columns.

    Parameters
    ----------
    raw_data_path : str
        Root data folder. Files are read from / written to
        ``<raw_data_path>/gbif/occurrences<suffix>`` with ``suffix = get_suffix(n)``.
    n : int
        Sample size; only used to derive the file-name suffix.
    from_csv : bool
        If True (default) read ``occurrences<suffix>.csv`` from that folder.
        If False, use the frame passed as ``data`` instead.
    to_csv : bool
        If True (default) write ``occurrences<suffix>_encoded.csv`` and
        ``coordinates<suffix>.csv`` into that folder.
    data : pd.DataFrame, optional
        Occurrences with the columns gbifID, latitude, longitude and taxonKey.
        Required when ``from_csv`` is False; it is copied, not modified.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        ``merged``: latitude, longitude and one count column per distinct
        taxonKey; ``coordinates``: the de-duplicated locations, including the
        helper column 'coordinates' holding (latitude, longitude) tuples.

    Raises
    ------
    ValueError
        If ``from_csv`` is False and no ``data`` is supplied.
    """
    # Local import: only `os.path` is imported at the top of the notebook.
    import os

    print('Starting')
    suffix = get_suffix(n)
    output_dir = path.join(raw_data_path, 'gbif', 'occurrences' + suffix)
    source_path = path.join(output_dir, 'occurrences' + suffix + '.csv')

    if from_csv:
        coordinates = pd.read_csv(source_path)
    else:
        # A DataFrame cannot be built from a path string, so the in-memory
        # branch needs the actual occurrences passed in explicitly.
        if data is None:
            raise ValueError("`data` must be provided when from_csv is False")
        # Copy so that the helper columns added below do not leak to the caller
        coordinates = pd.DataFrame(data).copy()

    print('Create coordinates')
    # Create a DataFrame with a coordinates column (latitude, longitude)
    coordinates['coordinates'] = coordinates[['latitude', 'longitude']].apply(tuple, axis=1)

    # Convert taxonKey to string for later vectorizing
    coordinates['taxonKey'] = coordinates['taxonKey'].astype('string')

    print('GroupBy')
    # Group by coordinates and list the taxonKey's
    encoded_targets = coordinates.groupby(['coordinates'])['taxonKey'].apply(list)
    encoded_targets = pd.DataFrame(encoded_targets)
    idx = encoded_targets.index

    # Format taxonKey Pandas Series for vectorizing (space-separated keys)
    encoded_targets['taxonKey'] = encoded_targets['taxonKey'].map(lambda x: ' '.join(x))

    print('Tokenize')
    # Initialize CountVectorizer and apply it to the taxonKey's
    vectorizer = CountVectorizer(tokenizer=lambda txt: txt.split())
    encoded_targets = vectorizer.fit_transform(encoded_targets['taxonKey']).toarray()

    print('Feature names')
    # Get feature names out
    encoded_targets = pd.DataFrame(encoded_targets, index=idx, columns = vectorizer.get_feature_names_out())
    encoded_targets = encoded_targets.reset_index()

    print('Merging')
    # Merging output of CountVectorizer with latitude and longitude data
    coordinates = coordinates.drop(columns=['gbifID', 'taxonKey']).drop_duplicates()

    merged = coordinates.merge(encoded_targets).drop(columns='coordinates')

    if to_csv:
        # Build both output paths explicitly. Deriving the coordinates path by
        # string replacement on source_path is unsafe: a pattern that does not
        # match leaves the path unchanged and overwrites the input file.
        os.makedirs(output_dir, exist_ok=True)
        encoded_path = path.join(output_dir, 'occurrences' + suffix + '_encoded.csv')
        merged.to_csv(encoded_path, index = False)
        coordinates_path = path.join(output_dir, 'coordinates' + suffix + '.csv')
        coordinates.to_csv(coordinates_path, index = False)

    return merged, coordinates

In [20]:
# Sample size used to derive file-name suffixes in the cells below.
n = 1_000
# Raw data lives one level above the current working directory.
raw_data_path = path.join(path.pardir, 'raw_data')

In [24]:
# Rebuild the occurrences csv path for the current n (same construction as in
# encode_taxonKey) and display it.
filename = 'occurrences' + get_suffix(n) + '.csv'
source_path = path.join(raw_data_path, 'gbif', 'occurrences' + get_suffix(n), filename)
source_path

'../raw_data/gbif/occurrences_1k/occurrences_1k.csv'

In [25]:
# Load the csv at source_path and display the resulting frame for inspection.
coordinates = pd.read_csv(source_path)
coordinates

Unnamed: 0,latitude,longitude
0,48.870487,10.335366
1,54.651142,9.772167
2,53.936638,11.247078
3,49.667747,8.651412
4,48.826295,10.062125
...,...,...
995,52.047150,13.316376
996,48.809866,12.882002
997,48.238122,11.506422
998,47.831425,7.731156
