# Something copilot created 

In [1]:
import datetime 
import numpy as np
import pandas as pd
import xarray as xr
import seaborn as sns
from pathlib import Path
import matplotlib.pyplot as plt
from utilities import plot_date_distribution, get_region_border

# Location of the semicolon-separated "Method 1" correlation dataset.
# NOTE(review): absolute, machine-specific path -- the notebook only runs where this file exists.
METHOD1_PATH = '/home/anna/msc_oppgave/fish-forecast/correlation_dataset_method1_full_wVMS.csv'

# low_memory=False: parse the file in one pass so column dtypes are inferred
# from the whole column instead of chunk by chunk.
corrrolation_dataset = pd.read_csv(
    METHOD1_PATH,
    sep=';',
    low_memory=False,
)

# Environmental predictors are recognised purely by their column-name prefix.
env_columns = [
    name
    for name in corrrolation_dataset.columns
    if name.startswith(('bio_', 'phy_'))
]

# Quick sanity summary of what was loaded (shape first, then every column name).
print(
    f"Loaded correlation dataset: {corrrolation_dataset.shape}",
    f"Columns: {list(corrrolation_dataset.columns)}",
    sep="\n",
)

  from .autonotebook import tqdm as notebook_tqdm


Loaded correlation dataset: (55580, 205)
Columns: ['Date', 'Latitude', 'Longitude', 'HasCatch', 'CatchWeight', 'bio_chl_depth_2.0', 'bio_chl_depth_3.0', 'bio_chl_depth_4.0', 'bio_chl_depth_5.0', 'bio_chl_depth_6.0', 'bio_chl_depth_8.0', 'bio_chl_depth_10.0', 'bio_chl_depth_11.0', 'bio_chl_depth_13.0', 'bio_chl_depth_16.0', 'bio_chl_depth_18.0', 'bio_chl_depth_22.0', 'bio_chl_depth_25.0', 'bio_chl_depth_29.0', 'bio_kd_depth_2.0', 'bio_kd_depth_3.0', 'bio_kd_depth_4.0', 'bio_kd_depth_5.0', 'bio_kd_depth_6.0', 'bio_kd_depth_8.0', 'bio_kd_depth_10.0', 'bio_kd_depth_11.0', 'bio_kd_depth_13.0', 'bio_kd_depth_16.0', 'bio_kd_depth_18.0', 'bio_kd_depth_22.0', 'bio_kd_depth_25.0', 'bio_kd_depth_29.0', 'bio_model_depth_depth_2.0', 'bio_model_depth_depth_3.0', 'bio_model_depth_depth_4.0', 'bio_model_depth_depth_5.0', 'bio_model_depth_depth_6.0', 'bio_model_depth_depth_8.0', 'bio_model_depth_depth_10.0', 'bio_model_depth_depth_11.0', 'bio_model_depth_depth_13.0', 'bio_model_depth_depth_16.0', 'bio_

## Method 1
For every day:\
    find the number of catches (N), then randomly pick N points within the region, making sure that they are at least a distance (d) away from any catch and not on land (i.e. they are valid points in the Copernicus dataset)\
    for every catch location, extract all variables from the phy and bio datasets

# Random Forest Classifier




In [2]:
# Machine Learning Feature Importance Analysis
#
# Ranks the environmental predictors (env_columns) by how well they separate
# the two 'HasCatch' classes, using two complementary views:
#   1. feature importances of a Random Forest classifier, and
#   2. univariate F-scores (SelectKBest with f_classif).
# Expects `corrrolation_dataset` and `env_columns` to exist already (they are
# created by the data-loading cell).
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import warnings
# NOTE(review): with no category/module filter this silences *every* warning
# for the rest of the kernel session, not just in this cell. Consider narrowing it.
warnings.filterwarnings('ignore')

print("=== MACHINE LEARNING FEATURE IMPORTANCE ===")

# Tunable settings for this analysis
missing_threshold = 0.5   # keep a predictor only if its missing share is below this
top_n = 15                # how many features to list in each ranking
min_samples = 100         # the analysis is skipped unless more rows than this remain

# Prepare data for ML analysis:
# work on a copy restricted to the predictors plus the target column
data_clean = corrrolation_dataset[env_columns + ['HasCatch']].copy()

# Remove columns whose share of missing values reaches the threshold
missing_ratios = data_clean[env_columns].isnull().mean()
cols_to_keep = [col for col in env_columns if missing_ratios[col] < missing_threshold]

print(f"Variables kept after removing >{missing_threshold:.0%} missing: {len(cols_to_keep)}/{len(env_columns)}")

# Fill remaining missing predictor values with the column median.
# NOTE(review): the medians are computed on the full dataset *before* the
# train/test split below, so held-out rows influence the imputed training
# values. Fit the imputation on the training part only if the test accuracy
# must be an unbiased estimate.
data_ml = data_clean[cols_to_keep + ['HasCatch']].copy()
for col in cols_to_keep:
    if data_ml[col].isnull().any():
        data_ml[col] = data_ml[col].fillna(data_ml[col].median())

# Remove any rows that still contain missing values (the target column
# 'HasCatch' is not imputed above)
data_ml = data_ml.dropna()
print(f"Final dataset size: {len(data_ml)} samples")

if len(data_ml) > min_samples:  # Need sufficient data for ML
    X = data_ml[cols_to_keep]
    y = data_ml['HasCatch']

    # Never ask for more features than survived the missing-value filter:
    # SelectKBest rejects (older scikit-learn) or warns about (newer
    # scikit-learn) k > n_features, and the printed headers should match
    # the number of rows actually listed.
    n_top = min(top_n, len(cols_to_keep))

    # Split the data (stratified so both parts keep the HasCatch class balance)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # Random Forest Feature Importance
    print("\n--- Random Forest Feature Importance ---")
    rf = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10)
    rf.fit(X_train, y_train)

    # Pair every predictor with its importance, most important first
    feature_importance = pd.DataFrame({
        'feature': cols_to_keep,
        'importance': rf.feature_importances_
    }).sort_values('importance', ascending=False)

    print(f"Top {n_top} Most Important Features (Random Forest):")
    for rank, row in enumerate(feature_importance.head(n_top).itertuples(index=False), start=1):
        print(f"{rank:2d}. {row.feature:<50s} {row.importance:.4f}")

    # Statistical Feature Selection (F-score)
    print("\n--- Statistical Feature Selection (F-statistic) ---")
    selector = SelectKBest(score_func=f_classif, k=n_top)
    X_selected = selector.fit_transform(X_train, y_train)

    # Names and scores of the features the selector kept
    selected_mask = selector.get_support()
    selected_features = X_train.columns[selected_mask]
    feature_scores = selector.scores_[selected_mask]

    f_score_df = pd.DataFrame({
        'feature': selected_features,
        'f_score': feature_scores
    }).sort_values('f_score', ascending=False)

    print(f"Top {n_top} Features by F-statistic:")
    for rank, row in enumerate(f_score_df.itertuples(index=False), start=1):
        print(f"{rank:2d}. {row.feature:<50s} {row.f_score:.2f}")

    # Model performance: accuracy on the data the forest was fitted on vs. held-out data
    train_score = rf.score(X_train, y_train)
    test_score = rf.score(X_test, y_test)
    print("\nRandom Forest Performance:")
    print(f"Training Accuracy: {train_score:.3f}")
    print(f"Testing Accuracy:  {test_score:.3f}")

else:
    print("Insufficient data for machine learning analysis")

=== MACHINE LEARNING FEATURE IMPORTANCE ===
Variables kept after removing >50% missing: 186/200
Final dataset size: 55580 samples

--- Random Forest Feature Importance ---
Top 15 Most Important Features (Random Forest):
 1. phy_thetao_depth_25.211410522460938                0.0284
 2. phy_thetao_depth_15.810070037841797                0.0276
 3. phy_thetao_depth_29.444730758666992                0.0276
 4. phy_thetao_depth_21.598819732666016                0.0212
 5. phy_thetao_depth_11.404999732971191                0.0201
 6. phy_thetao_depth_7.92956018447876                  0.0168
 7. phy_thetao_depth_0.49402499198913574               0.0166
 8. phy_thetao_depth_18.495559692382812                0.0164
 9. bio_o2_depth_29.0                                  0.0134
10. phy_thetao_depth_9.572997093200684                 0.0126
11. bio_o2_depth_25.0                                  0.0117
12. phy_thetao_depth_13.467140197753906                0.0115
13. phy_so_depth_25.211410522460938 