# Fishing pressure data preparation  
The data for this widget comes from this paper, and is retrieved from 2 datasets on the cloud bucket: one for the % of surface withing each of the buckets (which will feed the plot), and another for the median and range values (for the sentence).  

Data model is as follows:
- `location_id`  
- `indicator`
- `category`
- `value`
- `year`

In [1]:
import pandas as pd
import numpy as np
import os
import requests
import json
import logging
from pathlib import Path
import geopandas as gpd



  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
#  FIXME: This will depends from where the notebook kernel is running so be careful
WORK_DIR =Path(os.getcwd())
BASE_DIR = f'{WORK_DIR.parents[3]}'
logging.basicConfig(level=logging.INFO)

# @TODO: Add expected data files source as an environment variable. and add the download bit for the data sources.
assert BASE_DIR == '/home/jovyan/work', f'{BASE_DIR} is not the correct directory'
outFolder= Path(f'{BASE_DIR}/datasets')
outFolder



PosixPath('/home/jovyan/work/datasets')

## 1) Load datasets

In [35]:
gcs_bucket = 'https://storage.googleapis.com/mangrove_atlas/widget_data'
df_buckets = pd.read_csv(f'{gcs_bucket}/Fisher_Country.csv')
#df_buckets = pd.read_csv(f'{outFolder}/Fisher_Country.csv')
df_buckets.columns = df_buckets.columns.str.lower()
df_buckets = df_buckets[['gid_0','q1','q2','q3','q4','q5']]
df_buckets.columns = ['gid_0', '0 - 50','>50 - 200', '>200 - 700', '>700 - 2000', '>2000']
df_buckets.head()

Unnamed: 0,gid_0,0 - 50,>50 - 200,>200 - 700,>700 - 2000,>2000
0,ABW,0,0,0,0,100
1,AGO,0,0,7,63,29
2,AIA,0,0,0,100,0
3,ARE,31,56,11,2,0
4,ASM,0,0,0,100,0


In [36]:
df_median = pd.read_csv(f'{gcs_bucket}/Mangrove_Fishers_Median.csv')
df_median.columns = df_median.columns.str.lower()
df_median.columns = ['country','pct_mangrove_fishers', 'median', 'range']
df_median = df_median[df_median['median'].notna()]
df_median['median'] = df_median['median'].str.replace('<', '').astype(int)
df_median['range'] = df_median['range'].str.replace('04/25', '4-7125') #Fix Colombia data
df_median[['range_min', 'range_max']] = df_median['range'].str.split('-', expand=True)
df_median.drop(columns=['range'], inplace=True)
#df_median['range_min'] = df_median['range_min'].astype(int)
#df_median['range_max'] = df_median['range_max'].astype(int)
df_median.head()

Unnamed: 0,country,pct_mangrove_fishers,median,range_min,range_max
0,Angola,42.0,3874,218,360365.0
1,Australia,24.0,1,0,11017.0
2,Bangladesh,82.0,2519,419,
3,Belize,77.0,98,1,2943870.0
5,Brazil,53.0,702,0,67641.0


## 2) Process data format  
Transform data to long format.

In [37]:
df_long_buckets = df_buckets.melt(id_vars=['gid_0'], var_name='category', value_name='value')
df_long_buckets.head()

Unnamed: 0,gid_0,category,value
0,ABW,0 - 50,0
1,AGO,0 - 50,0
2,AIA,0 - 50,0
3,ARE,0 - 50,31
4,ASM,0 - 50,0


In [38]:
#Explore proportions
df_long_buckets.groupby(['category']).agg({'value': ['mean','min', 'max']})

Unnamed: 0_level_0,value,value,value
Unnamed: 0_level_1,mean,min,max
category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0 - 50,16.045872,0,100
>200 - 700,18.541284,0,100
>2000,28.587156,0,100
>50 - 200,15.495413,0,85
>700 - 2000,21.311927,0,100


In [39]:
df_median_long = df_median.melt(id_vars=['country'], var_name='category', value_name='value')
df_median_long = df_median_long[df_median_long['value'].notna()]
df_median_long['value'] = df_median_long['value'].astype(int)
df_median_long.head()

Unnamed: 0,country,category,value
0,Angola,pct_mangrove_fishers,42
1,Australia,pct_mangrove_fishers,24
2,Bangladesh,pct_mangrove_fishers,82
3,Belize,pct_mangrove_fishers,77
4,Brazil,pct_mangrove_fishers,53


## 3) Add locations

In [14]:
locations_file = 'https://storage.googleapis.com/mangrove_atlas/boundaries/processed/location_final/locations_v3_not_merged_with_old.gpkg'
locations = gpd.read_file(locations_file)
locations = locations[locations['type'] == 'country']
locations.head()

Unnamed: 0,name,iso,type,area_m2,wdpaid,globalid,perimeter_m,location_idn,coast_length_m,geometry
82,Qatar,QAT,country,3.880224,,{AF97ABE2-6405-4438-A7ED-1494A43DA379},8.392644,06d2e6f9-bc89-59bf-a0e2-ab804e5db9fd,1345769.96,"MULTIPOLYGON (((50.73769 24.93464, 50.73779 24..."
89,Mayotte,MYT,country,5.611808,,{57E86B5B-7EF0-4754-A8D4-A9DC3212D421},10.086238,0750953f-4af9-549b-aeea-329663249a56,291036.71,"POLYGON ((46.63483 -12.96039, 46.63197 -12.969..."
118,Vietnam,VNM,country,90.156489,,{B2A84FBB-34CD-4A51-9463-B9DB2DB62A10},81.714911,09a1ab14-11ad-56ec-8acb-a149e5697abd,9005760.08,"MULTIPOLYGON (((104.31952 10.36051, 104.31975 ..."
132,Grenada,GRD,country,2.154728,,{F8753179-5FFA-4D9E-8AD9-083F31C48528},6.743601,0b0ecb56-bb8e-5ef1-b8ee-3cdad67fed0e,260664.47,"MULTIPOLYGON (((-61.91525 11.37330, -61.91813 ..."
149,India,IND,country,473.029671,,{A4A6CE4D-8D03-4246-9A2F-BD9811232115},211.564078,0c07ca53-7b17-5650-a2c6-0cc27249a4bd,16917891.22,"MULTIPOLYGON (((79.52922 9.38411, 79.52921 9.3..."


In [15]:
api_locs = pd.read_csv('https://storage.googleapis.com/mangrove_atlas/widget_data/locations_staging.csv')
api_locs.rename(columns={'location_id': 'location_idn'}, inplace=True)
api_locs.head()

Unnamed: 0,id,location_idn
0,1563,000bd204-c0fd-510b-a1ad-132a7ef7470d
1,1564,00250a0f-f66d-54a0-b7a3-d80035881cbf
2,1565,0041637b-f6a2-5b89-87ce-850f5c5431b3
3,1566,005b49ef-6b7f-575a-85b3-ff19261a0755
4,1567,00921349-70fb-5a7e-8207-b3157aecc349


### For the buckets data

In [40]:
df_buckets_locs = df_long_buckets.merge(locations[['location_idn', 'iso']], left_on='gid_0', right_on='iso', how='left')
df_buckets_locs = df_buckets_locs.merge(api_locs, on='location_idn', how='left')
df_buckets_locs = df_buckets_locs[['id', 'category', 'value']]
df_buckets_locs.sample(5)

Unnamed: 0,id,category,value
14,3770,0 - 50,0
158,3844,>50 - 200,14
260,2710,>200 - 700,0
349,2664,>700 - 2000,24
417,2292,>700 - 2000,5


### For the median / range data  
Some locations' names need to be reviewed.

In [41]:
df_median_long[~df_median_long['country'].isin(locations['name'])]['country'].unique()

array(['The Gambia', "Cote d'Ivoire", 'Mexico', 'The Bahamas'],
      dtype=object)

In [42]:
pattern = 'Bah'
locations[locations['name'].str.contains(pattern)]

Unnamed: 0,name,iso,type,area_m2,wdpaid,globalid,perimeter_m,location_idn,coast_length_m,geometry
2000,Bahamas,BHS,country,56.695346,,{560D8F33-240F-4741-B311-68C4414BD842},33.498041,a0d0a60d-1c43-5709-9d80-4b7376421c1d,16215044.98,"POLYGON ((-72.66451 21.67102, -72.66469 21.670..."
2996,Bahrain,BHR,country,0.74771,,{162FB878-D279-46F0-AA36-1576B037F6CD},4.140048,f309afe5-27b5-575a-aa2c-7598a53dffa4,835126.13,"MULTIPOLYGON (((50.73625 25.55955, 50.73696 25..."


In [43]:
df_median_long.loc[df_median_long['country'] == 'The Gambia', 'country'] = 'Gambia'
df_median_long.loc[df_median_long['country'] ==  "Cote d'Ivoire", 'country'] = "Côte d'Ivoire"
df_median_long.loc[df_median_long['country'] == 'Mexico', 'country'] = 'México'
df_median_long.loc[df_median_long['country'] == 'The Bahamas', 'country'] = 'Bahamas'

In [44]:
df_median_long[~df_median_long['country'].isin(locations['name'])]['country'].unique()

array([], dtype=object)

In [45]:
df_median_locs = df_median_long.merge(locations[['location_idn', 'name']], left_on='country', right_on='name', how='left')
df_median_locs = df_median_locs.merge(api_locs, on='location_idn', how='left')
df_median_locs = df_median_locs[['id', 'category', 'value']]
df_median_locs.sample(5)

Unnamed: 0,id,category,value
193,4543,range_min,325
14,3457,pct_mangrove_fishers,21
128,2327,median,1413
20,3400,pct_mangrove_fishers,87
232,3457,range_max,4421


## 4) Final processing and save

In [46]:
df_median_locs = df_median_locs[df_median_locs['category'] != 'pct_mangrove_fishers']
df_median_locs = df_median_locs[df_median_locs['value'].notna()]
df_median_locs.head()

Unnamed: 0,id,category,value
73,2029,median,3874
74,2441,median,1
75,4587,median,2519
76,3831,median,98
77,1985,median,702


In [47]:
df_final = pd.concat([df_buckets_locs, df_median_locs])
df_final['indicator'] = 'fishing_pressure'
df_final['year'] = 2020
df_final.rename(columns={'id': 'location_id'}, inplace=True)
df_final

Unnamed: 0,location_id,category,value,indicator,year
0,2707,0 - 50,0,fishing_pressure,2020
1,2029,0 - 50,0,fishing_pressure,2020
2,1915,0 - 50,0,fishing_pressure,2020
3,3123,0 - 50,31,fishing_pressure,2020
4,2346,0 - 50,0,fishing_pressure,2020
...,...,...,...,...,...
286,3733,range_max,28226,fishing_pressure,2020
287,3248,range_max,1166,fishing_pressure,2020
288,3582,range_max,16809,fishing_pressure,2020
289,1681,range_max,141533,fishing_pressure,2020


In [48]:
df_final.dtypes

location_id     int64
category       object
value           int64
indicator      object
year            int64
dtype: object

In [49]:
df_final['category'].value_counts()

0 - 50         109
>50 - 200      109
>200 - 700     109
>700 - 2000    109
>2000          109
median          73
range_min       73
range_max       72
Name: category, dtype: int64

In [51]:
df_final[df_final['location_id'].isna()]

Unnamed: 0,location_id,category,value,indicator,year


In [50]:
df_final.to_csv(f'{outFolder}/fishing_pressure_data_upload.csv', index=False)