## Load the Data

In [9]:
!pip install pandas
import pandas as pd 
import os

# function to load data
def load_data(data_path, file_name):
    csv_path = os.path.join(data_path, file_name)
    return pd.read_csv(csv_path, low_memory=False)

ocean = load_data("data", "Data_Level5_BAH_OceanCleanup.csv")

Collecting pandas
  Obtaining dependency information for pandas from https://files.pythonhosted.org/packages/99/b0/756e52f6582cade5e746f19bad0517ff27ba9c73404607c0306585c201b3/pandas-2.3.2-cp312-cp312-macosx_11_0_arm64.whl.metadata
  Downloading pandas-2.3.2-cp312-cp312-macosx_11_0_arm64.whl.metadata (91 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
[?25hCollecting numpy>=1.26.0 (from pandas)
  Obtaining dependency information for numpy>=1.26.0 from https://files.pythonhosted.org/packages/22/f2/07bb754eb2ede9073f4054f7c0286b0d9d2e23982e090a80d478b26d35ca/numpy-2.3.3-cp312-cp312-macosx_14_0_arm64.whl.metadata
  Downloading numpy-2.3.3-cp312-cp312-macosx_14_0_arm64.whl.metadata (62 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m
Collecting pytz>=2020.1 (from 

In [10]:
ocean['Cleanup Type'].value_counts()

Cleanup Type
Land (beach, shoreline and inland)                  37116
Watercraft (powerboat, sailboat, kayak or canoe)      571
Underwater                                            216
Name: count, dtype: int64

## Seperate Training and the Test Set

In [11]:
# drop nan
ocean.dropna(subset=["Cleanup Type"], inplace=True)
ocean.dropna(subset=["Zone"], inplace=True)

# seperate labels and predicators
X = ocean.drop(['Zone', 'Cleanup ID', 'State', 'Country', 'GPS', 'Cleanup Type',
                       'Cleanup Date', 'Group Name'], axis=1) 
y = ocean["Cleanup Type"].copy()

In [14]:
!pip install scikit-learn
from sklearn.model_selection import train_test_split

# test set size of 20% of the data and the random seed 42 <3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(len(X_train))
print(len(X_test))

print(len(y_train))
print(len(y_test))

Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/43/5d/779320063e88af9c4a7c2cf463ff11c21ac9c8bd730c4a294b0000b666c9/scikit_learn-1.7.2-cp312-cp312-macosx_12_0_arm64.whl.metadata
  Downloading scikit_learn-1.7.2-cp312-cp312-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Obtaining dependency information for scipy>=1.8.0 from https://files.pythonhosted.org/packages/91/4d/281fddc3d80fd738ba86fd3aed9202331180b01e2c78eaae0642f22f7e83/scipy-1.16.2-cp312-cp312-macosx_14_0_arm64.whl.metadata
  Downloading scipy-1.16.2-cp312-cp312-macosx_14_0_arm64.whl.metadata (62 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m
[?25hCollecting joblib>=1.2.0 (from scikit-learn)
  Obtaining dependency information for joblib>=1.2.0 from https://files.pythonhosted.org/packages/1e/e8/685f47e0d75432

## ML Pipeline

In [15]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# numerical values pipeline
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])


# prepare the data
X_train = num_pipeline.fit_transform(X_train)
X_test = num_pipeline.transform(X_test)

In [16]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)

## Train Models and Evaluate on the Training Set¶

In [17]:
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# function to print out classification model report
def classification_report(model_name, test, pred, label="1"):
    print(model_name, ":\n")
    print("Accuracy Score: ", '{:,.4f}'.format(accuracy_score(test, pred)))
    print("     Precision: ", '{:,.4f}'.format(precision_score(test, pred, pos_label=label, average='weighted')))
    print("        Recall: ", '{:,.4f}'.format(recall_score(test, pred, pos_label=label, average='weighted')))
    print("      F1 score: ", '{:,.4f}'.format(f1_score(test, pred, pos_label=label, average='weighted')))


## Multiclass Classification with KNN classifier

In [18]:
from sklearn.neighbors import KNeighborsClassifier

knnc = KNeighborsClassifier(weights='distance', n_neighbors=4)
knnc.fit(X_train, y_train)

0,1,2
,n_neighbors,4
,weights,'distance'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [19]:
y_knn_pred = knnc.predict(X_test)
classification_report("Test data - KNN classifier report, Clean-up Type", y_test, y_knn_pred, "Underwater")

Test data - KNN classifier report, Clean-up Type :

Accuracy Score:  0.9796
     Precision:  0.9707
        Recall:  0.9796
      F1 score:  0.9727


