In [2]:
import pandas as pd
import numpy as np
from geopy.distance import great_circle
import glob

# --- Steps 1 & 2: Load, Clean, and Sort ---
print("Loading and cleaning data...")
try:
    csv_files = glob.glob('Dataset/*.csv')
    if not csv_files:
        print("Error: No CSV files found in the current directory.")
    else:
        df_list = [pd.read_csv(file, header=None, names=['id', 'date', 'lc', 'lon', 'lat']) for file in csv_files]
        df = pd.concat(df_list, ignore_index=True)

        df.dropna(subset=['lat', 'lon'], inplace=True)
        df['date'] = pd.to_datetime(df['date'], errors='coerce')
        df.dropna(subset=['date'], inplace=True)
        df['lat'] = pd.to_numeric(df['lat'], errors='coerce')
        df['lon'] = pd.to_numeric(df['lon'], errors='coerce')
        df.dropna(subset=['lat', 'lon'], inplace=True)
        df.sort_values(by=['id', 'date'], inplace=True)
        df.reset_index(drop=True, inplace=True)
        print("Data loaded and cleaned.")

        # --- Combined Feature Creation Function ---
        def create_all_features(group):
            prev_lat = group['lat'].shift(1)
            prev_lon = group['lon'].shift(1)
            prev_date = group['date'].shift(1)
            
            distances = [great_circle((lat, lon), (prev_lat, prev_lon)).meters if not pd.isna(prev_lat) else np.nan
                         for lat, lon, prev_lat, prev_lon in zip(group['lat'], group['lon'], prev_lat, prev_lon)]
            group['distance_m'] = distances
            
            time_diff_s = (group['date'] - prev_date).dt.total_seconds()
            group['speed_mps'] = group['distance_m'] / time_diff_s.replace(0, np.nan)

            lat1, lon1 = np.radians(prev_lat), np.radians(prev_lon)
            lat2, lon2 = np.radians(group['lat']), np.radians(group['lon'])
            dLon = lon2 - lon1
            y = np.sin(dLon) * np.cos(lat2)
            x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(dLon)
            bearing = np.degrees(np.arctan2(y, x))
            group['bearing'] = (bearing + 360) % 360
            group['turning_angle'] = abs(group['bearing'].diff())

            window_size = 5
            group['speed_avg_roll'] = group['speed_mps'].rolling(window=window_size, min_periods=1).mean()
            group['speed_std_roll'] = group['speed_mps'].rolling(window=window_size, min_periods=1).std()
            group['angle_avg_roll'] = group['turning_angle'].rolling(window=window_size, min_periods=1).mean()
            group['angle_std_roll'] = group['turning_angle'].rolling(window=window_size, min_periods=1).std()
            
            return group

        # --- Apply the single function ---
        print("Calculating all features...")
        df = df.groupby('id').apply(create_all_features)
        print("All features calculated successfully.")

        # --- Define Target and Create Final DataFrame ---
        speed_threshold = 0.5 
        angle_threshold = 20
        df['is_foraging'] = ((df['speed_mps'] < speed_threshold) & (df['turning_angle'] > angle_threshold)).astype(int)

        df['month'] = df['date'].dt.month
        df['hour'] = df['date'].dt.hour

        # Define final columns
        final_columns = [
            'id',
            'date',
            'lat', 'lon', 'month', 'hour', 
            'speed_avg_roll', 'speed_std_roll', 'angle_avg_roll', 'angle_std_roll',
            'is_foraging'
        ]
        
        # --- THE FIX IS HERE ---
        # First, drop the old 'id' column which is now redundant
        if 'id' in df.columns:
            df = df.drop('id', axis=1)
        
        # Now, reset the index to turn the 'id' from the index back into a column
        df.reset_index(inplace=True)

        final_df = df.dropna(subset=final_columns)
        final_df = final_df[final_columns]

        print("\nData preparation complete!")
        print("New DataFrame includes 'id' and 'date' for sequencing:")
        print(final_df.head())

        final_df.to_csv('processed_shark_data.csv', index=False)
        print("\n✅ 'processed_shark_data.csv' has been created successfully.")

except Exception as e:
    print(f"An error occurred: {e}")

Loading and cleaning data...
Data loaded and cleaned.
Calculating all features...
All features calculated successfully.

Data preparation complete!
New DataFrame includes 'id' and 'date' for sequencing:
                   id                date        lat        lon  month  hour  \
3  160424_2013_132346 2013-07-29 16:38:58  40.605000 -71.742004      7    16   
4  160424_2013_132346 2013-07-29 16:49:22  40.548000 -71.621002      7    16   
5  160424_2013_132346 2013-07-29 17:39:59  40.544998 -71.540009      7    17   
6  160424_2013_132346 2013-07-29 19:18:58  40.513000 -71.506989      7    19   
7  160424_2013_132346 2013-07-29 19:53:41  40.516998 -71.585999      7    19   

   speed_avg_roll  speed_std_roll  angle_avg_roll  angle_std_roll  is_foraging  
3        1.468594        1.743751      160.152236      106.012042            0  
4        5.919279        9.014518      127.892000       93.495695            0  
5        5.186637        7.976840      103.169286       90.953177        

  df['date'] = pd.to_datetime(df['date'], errors='coerce')
  df = df.groupby('id').apply(create_all_features)


In [7]:
import pandas as pd
import numpy as np
import joblib
import geojson
import geopandas as gpd
from shapely.geometry import Point

# 1. Load your best model and the scaler
print("Loading the saved model and scaler...")
model = joblib.load('best_shark_model_lightgbm.pkl')
scaler = joblib.load('shark_model_scaler.pkl')
print("Model and scaler loaded successfully.")

# Load the land shapefile
print("Loading coastline data...")
land_gdf = gpd.read_file('Land/ne_50m_land.shp')
land_polygon = land_gdf.unary_union
print("Coastline data loaded.")

# 2. Create a Grid for the North Atlantic
print("Creating prediction grid...")
min_lon, max_lon = -80, -60
min_lat, max_lat = 30, 50
grid_resolution = 0.5

lons = np.arange(min_lon, max_lon, grid_resolution)
lats = np.arange(min_lat, max_lat, grid_resolution)
lon_grid, lat_grid = np.meshgrid(lons, lats)

grid_df = pd.DataFrame({
    'lon': lon_grid.flatten(),
    'lat': lat_grid.flatten()
})

# --- CORRECTED FILTERING LOGIC ---
print(f"Filtering grid points... Starting with {len(grid_df)} points.")
# Create a GeoDataFrame from our grid points
geometry = [Point(xy) for xy in zip(grid_df['lon'], grid_df['lat'])]
grid_gdf = gpd.GeoDataFrame(grid_df, geometry=geometry)

# Filter the GeoDataFrame to keep only points NOT on land
ocean_gdf = grid_gdf[~grid_gdf.geometry.within(land_polygon)]

# Create a new, clean DataFrame from the filtered ocean points, dropping the geometry
grid_df = pd.DataFrame(ocean_gdf.drop(columns='geometry'))
print(f"Finished filtering. {len(grid_df)} ocean points remaining.")
# --- END OF FIX ---

# 3. Engineer features for the remaining ocean points
print("Engineering features for the grid...")
month = 8
hour = 22

grid_df['month'] = month
grid_df['hour'] = hour
grid_df['hour_sin'] = np.sin(2 * np.pi * hour/24.0)
grid_df['hour_cos'] = np.cos(2 * np.pi * hour/24.0)
grid_df['month_sin'] = np.sin(2 * np.pi * month/12.0)
grid_df['month_cos'] = np.cos(2 * np.pi * month/12.0)

placeholder_features = [
    'speed_avg_roll', 'speed_std_roll', 'angle_avg_roll', 'angle_std_roll',
    'speed_percentile', 'is_slow', 'speed_change', 'high_turn_angle',
    'angle_consistency', 'speed_angle_ratio', 'movement_efficiency',
    'turning_intensity', 'lat_rounded', 'lon_rounded', 'distance_from_center',
    'dawn_dusk', 'night', 'speed_z_score', 'angle_z_score'
]
for feat in placeholder_features:
    grid_df[feat] = 0

grid_df['lat_rounded'] = np.round(grid_df['lat'], 2)
grid_df['lon_rounded'] = np.round(grid_df['lon'], 2)
grid_df['distance_from_center'] = np.sqrt((grid_df['lat'] - 40)**2 + (grid_df['lon'] - (-70))**2)

# 4. Scale and Predict
print("Scaling grid features...")
X_train_columns = scaler.feature_names_in_
grid_df = grid_df[X_train_columns]
grid_scaled = scaler.transform(grid_df)

print("Predicting foraging probability on the grid...")
probabilities = model.predict_proba(grid_scaled)[:, 1]
grid_df['probability'] = probabilities
print("Prediction complete.")

# 5. Export to GeoJSON
print("Exporting results to hotspots.geojson...")
features = []
for i, row in grid_df.iterrows():
    if row['probability'] > 0.5:
        point = geojson.Point((row['lon'], row['lat']))
        feature = geojson.Feature(geometry=point, properties={'probability': round(float(row['probability']), 4)})
        features.append(feature)

feature_collection = geojson.FeatureCollection(features)
with open('hotspots.geojson', 'w') as f:
    geojson.dump(feature_collection, f)

print("\n✅ Operation Complete! Your final 'hotspots.geojson' now only contains ocean points.")

Loading the saved model and scaler...
Model and scaler loaded successfully.
Loading coastline data...


  land_polygon = land_gdf.unary_union


Coastline data loaded.
Creating prediction grid...
Filtering grid points... Starting with 1600 points.
Finished filtering. 996 ocean points remaining.
Engineering features for the grid...
Scaling grid features...
Predicting foraging probability on the grid...




Prediction complete.
Exporting results to hotspots.geojson...

✅ Operation Complete! Your final 'hotspots.geojson' now only contains ocean points.
