# Pre-joining Features to Main Training Datasets

This notebook pre-joins the processed sea surface temperature (SST) grid and the precipitation grid onto each vessel's dataset. Doing the join once, up front, greatly reduces the computation time and resources needed later. (Deprecated — use join_df.py and feature_engineering.py instead.)

In [1]:
import pandas as pd
import numpy as np

# Load the two pre-processed grids; the first CSV column holds the saved index.
sst_df = pd.read_csv('../data/sst_grid.csv', index_col=0)
precip_df = pd.read_csv('../data/precip_grid.csv', index_col=0)

# Reduce each grid's date column to a monthly period so it can be matched
# against the month-resolution key built for the vessel records.
for grid, date_col in ((sst_df, "time_bnds"), (precip_df, "time")):
    grid[date_col] = pd.to_datetime(grid[date_col]).dt.to_period('M')

  mask |= (ar1 == a)


In [2]:
# Vessel class to process; it selects the input CSV here and the output CSV
# name when the joined data is written.
vessel = 'trollers'
source_csv = '../data/' + vessel + '.csv'

# Columns carried forward from the raw vessel file.
keep_cols = [
    'is_fishing', 'lat', 'lon', 'course', 'speed', 'timestamp',
    'distance_from_shore', 'distance_from_port', 'mmsi', 'source',
]

fishing_df = pd.read_csv(source_csv)

# Discard rows whose is_fishing value is -0.5 or lower.
usable_rows = fishing_df['is_fishing'] > -0.5
fishing_df = fishing_df[usable_rows]

# Binarise the remaining scores: below 0.3 becomes 0, everything else 1.
fishing_df['is_fishing'] = [
    1 if score >= 0.3 else 0 for score in fishing_df['is_fishing']
]
fishing_df = fishing_df[keep_cols]

def custom_season(x):
    """Map a month number to a coarse season index.

    The month is converted to ``int``, divided by three and rounded to the
    nearest whole number with ``np.round``. For months 1-12 this gives
    1 -> 0.0, 2-4 -> 1.0, 5-7 -> 2.0, 8-10 -> 3.0 and 11-12 -> 4.0.

    NOTE(review): that is five buckets rather than four, with January alone
    in bucket 0 -- confirm this is intended before reusing the feature.
    """
    month_number = int(x)
    return np.round(month_number / 3)

def custom_round(x):
    """Snap a coordinate to the centre of its one-degree cell.

    The value is converted to ``float``, floored to the whole degree at or
    below it, and half a degree is added, e.g. 38.24 -> 38.5 and
    -124.33 -> -124.5.
    """
    cell_floor = np.floor(float(x))
    return cell_floor + 0.5

# Snap each position to the centre of its one-degree cell. This is the
# vectorised equivalent of calling custom_round() on every row (floor the
# float value, add 0.5) and avoids one Python-level call per record.
fishing_df['adjust_lat'] = np.floor(fishing_df['lat'].astype(float)) + 0.5
fishing_df['adjust_lon'] = np.floor(fishing_df['lon'].astype(float)) + 0.5

# The raw timestamp is interpreted as seconds since the Unix epoch.
fishing_df["adjust_time_date"] = pd.to_datetime(fishing_df['timestamp'], unit='s')

# Month-resolution period used as the time key when joining the grids.
# The column above is already datetime64, so it is used directly.
fishing_df["adjust_time"] = fishing_df["adjust_time_date"].dt.to_period('M')

# Calendar features derived from the parsed timestamp.
fishing_df["doy"] = fishing_df["adjust_time_date"].dt.dayofyear
fishing_df["dow"] = fishing_df["adjust_time_date"].dt.dayofweek
# Vectorised equivalent of custom_season(): round(month / 3), which yields
# the values 0.0 through 4.0 for months 1-12.
fishing_df["season"] = np.round(fishing_df["adjust_time_date"].dt.month / 3)
fishing_df["year"] = fishing_df["adjust_time_date"].dt.year
fishing_df["month"] = fishing_df["adjust_time_date"].dt.month
fishing_df["day"] = fishing_df["adjust_time_date"].dt.day
fishing_df["hour"] = fishing_df["adjust_time_date"].dt.hour

# The raw timestamp and the intermediate datetime column are no longer needed
# once the features above exist.
fishing_df = fishing_df.drop(columns=['timestamp', 'adjust_time_date'])

In [3]:
# Vessel-side join keys: grid-aligned coordinates plus the monthly period.
vessel_keys = ['adjust_lat', 'adjust_lon', 'adjust_time']

# Left joins keep every vessel record; rows without a matching grid cell get
# missing values in the grid columns.
df_all = fishing_df.merge(
    sst_df,
    how='left',
    left_on=vessel_keys,
    right_on=['lat', 'lon', 'time_bnds'],
)
df_all = df_all.merge(
    precip_df,
    how='left',
    left_on=vessel_keys,
    right_on=['lat', 'lon', 'time'],
)

# Drop the join keys and the coordinate/date columns brought in by the two
# grids; the vessel's own position remains under lat_x / lon_x.
join_only_cols = [
    'adjust_time', 'adjust_lon', 'adjust_lat',
    'time', 'lat', 'lon',
    'lat_y', 'lon_y', 'time_bnds',
]
df_all = df_all.drop(columns=join_only_cols)

df_all.to_csv('../data/new_' + vessel + '.csv')
df_all

Unnamed: 0,is_fishing,lat_x,lon_x,course,speed,distance_from_shore,distance_from_port,mmsi,source,doy,dow,season,year,month,day,hour,sst,precip
0,1,38.243591,15.655180,309.200012,3.0,999.975464,11661.617188,7.652701e+13,gfw,127,3,2.0,2015,5,7,13,13.099999,
1,1,38.245487,15.658214,242.199997,8.4,999.975464,13038.084961,7.652701e+13,gfw,127,3,2.0,2015,5,7,13,13.099999,
2,1,38.245098,15.661433,311.200012,7.4,0.000000,12529.657227,7.652701e+13,gfw,127,3,2.0,2015,5,7,13,13.099999,
3,1,38.234615,15.640217,72.900002,6.1,0.000000,9433.749023,7.652701e+13,gfw,127,3,2.0,2015,5,7,14,13.099999,
4,1,38.240871,15.660789,340.399994,6.8,0.000000,12529.657227,7.652701e+13,gfw,127,3,2.0,2015,5,7,14,13.099999,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7840,0,41.267307,36.381233,275.799988,0.0,0.000000,5830.808594,2.740638e+14,gfw,305,6,4.0,2015,11,1,3,16.388000,
7841,0,41.267307,36.381222,12.000000,0.0,0.000000,5830.808594,2.740638e+14,gfw,305,6,4.0,2015,11,1,4,16.388000,
7842,0,41.267334,36.381226,356.500000,0.0,0.000000,5830.808594,2.740638e+14,gfw,305,6,4.0,2015,11,1,4,16.388000,
7843,0,41.267319,36.381226,0.000000,0.0,0.000000,5830.808594,2.740638e+14,gfw,305,6,4.0,2015,11,1,4,16.388000,


In [4]:
# Display only the rows that have no missing value in any column. The result
# is not assigned, so df_all itself is left unchanged.
df_all.dropna()

Unnamed: 0,is_fishing,lat_x,lon_x,course,speed,distance_from_shore,distance_from_port,mmsi,source,doy,dow,season,year,month,day,hour,sst,precip
3391,0,45.776299,-124.331375,358.000000,7.7,38012.222656,78004.492188,1.259544e+14,gfw,227,5,3.0,2015,8,15,1,22.907,0.000184
3392,0,45.787952,-124.332375,357.299988,7.6,39050.289062,76214.929688,1.259544e+14,gfw,227,5,3.0,2015,8,15,1,22.907,0.000184
3393,0,45.797482,-124.333130,356.100006,7.5,39011.859375,75430.242188,1.259544e+14,gfw,227,5,3.0,2015,8,15,1,22.907,0.000184
3394,0,45.809807,-124.333946,357.200012,7.0,39011.859375,73876.468750,1.259544e+14,gfw,227,5,3.0,2015,8,15,2,22.907,0.000184
3395,0,45.820816,-124.334656,357.299988,7.4,39050.289062,73107.710938,1.259544e+14,gfw,227,5,3.0,2015,8,15,2,22.907,0.000184
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6060,1,57.593414,9.962232,295.700012,0.0,0.000000,2236.013184,1.670724e+14,gfw,234,5,3.0,2015,8,22,19,10.531,1.965307
6061,1,57.594055,9.959370,252.699997,4.9,0.000000,2236.013184,1.670724e+14,gfw,234,5,3.0,2015,8,22,20,10.531,1.965307
6062,1,57.592361,9.954581,238.100006,2.4,0.000000,2828.357666,1.670724e+14,gfw,234,5,3.0,2015,8,22,20,10.531,1.965307
6063,1,57.592255,9.954369,241.000000,0.1,0.000000,2828.357666,1.670724e+14,gfw,234,5,3.0,2015,8,22,20,10.531,1.965307
