In [2]:
import pandas as pd

# Load the data from the uploaded CSV file
file_path = 'H:\oceanChallenge2\occ by time and location.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataframe
data.head()


Unnamed: 0.1,Unnamed: 0,eventID,scientificName,year,month,day,decimalLatitude,decimalLongitude,start.hour
0,0,MPO-OPANO-BOTTOMTRAWL-1970-09-15T12:25Z,Gadus morhua,1970,9,15,47.983333,-64.716667,12
1,1,MPO-OPANO-BOTTOMTRAWL-1970-09-15T12:25Z,Hippoglossoides platessoides,1970,9,15,47.983333,-64.716667,12
2,2,MPO-OPANO-BOTTOMTRAWL-1970-09-15T12:25Z,Glyptocephalus cynoglossus,1970,9,15,47.983333,-64.716667,12
3,3,MPO-OPANO-BOTTOMTRAWL-1970-09-15T12:25Z,Limanda ferruginea,1970,9,15,47.983333,-64.716667,12
4,4,MPO-OPANO-BOTTOMTRAWL-1970-09-15T12:25Z,Clupea harengus,1970,9,15,47.983333,-64.716667,12


In [3]:
# Counting the frequency of each species and sorting them to find the top 10 most frequent species
species_count = data['scientificName'].value_counts().head(10)
species_count


Hippoglossoides platessoides    5991
Gadus morhua                    5722
Chionoecetes opilio             4369
Strongylocentrotus              3534
Clupea harengus                 2778
Euryalida                       2771
Limanda ferruginea              2681
Amblyraja radiata               2606
Decapoda                        2300
Seaweed                         2278
Name: scientificName, dtype: int64

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Filtering the data for the species 'Hippoglossoides platessoides'
hippo_data = data[data['scientificName'] == 'Hippoglossoides platessoides']

# Preparing the features: year, month, decimalLatitude, decimalLongitude
features = hippo_data[['year', 'month', 'decimalLatitude', 'decimalLongitude']]

# The target variable will be the count of occurrences in each group of features
target = hippo_data.groupby(['year', 'month', 'decimalLatitude', 'decimalLongitude']).size()

# Resetting index of the target to merge with the features
target = target.reset_index(name='count')

# Merging features with the target
model_data = target.merge(features, on=['year', 'month', 'decimalLatitude', 'decimalLongitude'], how='left')

# Dropping duplicates that might have been introduced during the merge
model_data = model_data.drop_duplicates()

# Splitting the dataset into training and testing sets
X = model_data[['year', 'month', 'decimalLatitude', 'decimalLongitude']]
y = model_data['count']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest Regressor
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train, y_train)

# Predicting on the test set
y_pred = rf_regressor.predict(X_test)

# Calculating the mean squared error
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

mse, rmse



(0.04423547297297297, 0.2103223073593787)

In [7]:

# Filtering the data for the species 'Hippoglossoides platessoides'
hippo_data = data[data['scientificName'] == 'Hippoglossoides platessoides']

# The target variable will be the count of occurrences in each group of features
target = hippo_data.groupby(['year', 'month', 'decimalLatitude', 'decimalLongitude']).size()

# Calculating the range and descriptive statistics of the target variable
target_stats = target.describe()
target_min = target_stats['min']
target_max = target_stats['max']
target_mean = target_stats['mean']

target_min, target_max, target_mean, target_stats



(1.0,
 6.0,
 1.0119932432432432,
 count    5920.000000
 mean        1.011993
 std         0.137649
 min         1.000000
 25%         1.000000
 50%         1.000000
 75%         1.000000
 max         6.000000
 dtype: float64)

In [9]:
#预测
# Re-importing necessary libraries and loading the data again
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor



# Filter data for 'Hippoglossoides platessoides'
hippo_data = data[data['scientificName'] == 'Hippoglossoides platessoides']

# Preparing features and target variable
features = hippo_data[['year', 'month', 'decimalLatitude', 'decimalLongitude']]
target = hippo_data.groupby(['year', 'month', 'decimalLatitude', 'decimalLongitude']).size().reset_index(name='count')

# Merge features with target and remove duplicates
model_data = target.merge(features, on=['year', 'month', 'decimalLatitude', 'decimalLongitude'], how='left').drop_duplicates()

# Splitting the data into features (X) and target (y)
X = model_data[['year', 'month', 'decimalLatitude', 'decimalLongitude']]
y = model_data['count']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest Regressor
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train, y_train)

# Selecting a few random samples from the dataset for prediction
sample_data = X.sample(n=5, random_state=1)
sample_data



Unnamed: 0,year,month,decimalLatitude,decimalLongitude
3130,1999,9,46.85175,-60.95575
3805,2004,9,47.650833,-60.55725
2037,1992,9,47.6,-62.216667
234,1974,9,47.966667,-65.416667
1691,1990,9,46.983333,-64.4


In [10]:
# Making predictions using the selected samples
predicted_counts = rf_regressor.predict(sample_data)
predicted_counts_rounded = [round(count) for count in predicted_counts]

# Combining the samples with their predicted occurrence counts
predictions = sample_data.copy()
predictions['Predicted Count'] = predicted_counts_rounded
predictions



Unnamed: 0,year,month,decimalLatitude,decimalLongitude,Predicted Count
3130,1999,9,46.85175,-60.95575,1
3805,2004,9,47.650833,-60.55725,1
2037,1992,9,47.6,-62.216667,1
234,1974,9,47.966667,-65.416667,1
1691,1990,9,46.983333,-64.4,1


In [11]:
import random

# Randomly selecting latitude and longitude for each month of 2024
random.seed(42)  # Setting a seed for reproducibility
random_samples_2024 = []

for month in range(1, 13):  # For each month in the year 2024
    sample = data.sample(n=1, random_state=random.randint(1, 1000))  # Random latitude and longitude
    lat = sample.iloc[0]['decimalLatitude']
    lon = sample.iloc[0]['decimalLongitude']
    random_samples_2024.append([2024, month, lat, lon])

# Convert to DataFrame for prediction
df_samples_2024 = pd.DataFrame(random_samples_2024, columns=['year', 'month', 'decimalLatitude', 'decimalLongitude'])
df_samples_2024



Unnamed: 0,year,month,decimalLatitude,decimalLongitude
0,2024,1,46.633333,-62.633333
1,2024,2,47.001083,-61.911
2,2024,3,48.866667,-63.783333
3,2024,4,47.95,-65.4
4,2024,5,47.434,-64.014833
5,2024,6,48.319917,-62.578083
6,2024,7,46.48775,-62.996333
7,2024,8,45.82725,-62.312083
8,2024,9,48.166667,-63.933333
9,2024,10,47.383333,-62.25
