In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import os

# File paths
dataset_path = r"D:\workshop 2025\project\datasets\ds3\20180920_Marine_Pollution.xlsx"
save_model_path = r"D:\workshop 2025\project\models\model3\marine_model.pkl"


In [7]:
# Load dataset
df = pd.read_excel(dataset_path)
df.columns = df.columns.str.strip()  # remove extra spaces

# Identify pollution columns that exist in the dataframe
expected_cols = ['Chemicals', 'General garbage', 'Metals', 'Oil spillage and leakages', 'Old fishing gear']
pollution_cols_actual = [col for col in expected_cols if col in df.columns]

# Ensure numeric and fill NaN
df[pollution_cols_actual] = df[pollution_cols_actual].apply(pd.to_numeric, errors='coerce').fillna(0)

# Compute Pollution_Score
df['Pollution_Score'] = df[pollution_cols_actual].sum(axis=1)

# Ensure all scores are numeric
df['Pollution_Score'] = pd.to_numeric(df['Pollution_Score'], errors='coerce').fillna(0)

# Create Risk_Level safely
df['Risk_Level'] = pd.cut(df['Pollution_Score'], bins=[-1,50,150,1e9], labels=['Low','Moderate','High'])

# Drop any rows with NaN in X or y
X = df[pollution_cols_actual]
y = df['Risk_Level']
mask = X.notnull().all(axis=1) & y.notnull()
X = X[mask]
y = y[mask]


In [8]:
# Now train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate
y_pred = rf_model.predict(X_test)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Save model
os.makedirs(os.path.dirname(save_model_path), exist_ok=True)
joblib.dump(rf_model, save_model_path)
print(f"DS3 model trained and saved at: {save_model_path}")


Confusion Matrix:
 [[4]]

Classification Report:
               precision    recall  f1-score   support

         Low       1.00      1.00      1.00         4

    accuracy                           1.00         4
   macro avg       1.00      1.00      1.00         4
weighted avg       1.00      1.00      1.00         4

DS3 model trained and saved at: D:\workshop 2025\project\models\model3\marine_model.pkl


