## <u>Use case 1:</u> Forecasting Cleanup Types Based on Latitude, Longitude, Distance, Weight, and Trash Levels

## <u>Model 1:</u> Decision Tree

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings('ignore')

# Read the cleaned cleanup records from disk
ocean = pd.read_csv('cleaned_ocean.csv')

# Predictors are the site coordinates plus the 'Miles', 'Pounds' and
# 'Trash level' columns; the label to predict is the cleanup type
y = ocean['Cleanup Type']
X = ocean[['Latitude', 'Longitude', 'Miles', 'Pounds', 'Trash level']]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=20, test_size=0.2
)

# Learn the scaling statistics from the training rows only, then apply them
# to both partitions so no test-set information leaks into the fit
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the decision tree on the scaled training rows
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_scaled, y_train)

# Predicted labels for the held-out rows
y_pred = dt_model.predict(X_test_scaled)

# Accuracy is kept as a fraction here and converted when printed;
# the train/test accuracies below are stored directly as percentages
accuracy = accuracy_score(y_test, y_pred)
training_accuracy = accuracy_score(y_train, dt_model.predict(X_train_scaled)) * 100
validation_accuracy = accuracy_score(y_test, y_pred) * 100

# Support-weighted precision, recall and F1, each expressed as a percentage
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted') * 100
    for metric in (precision_score, recall_score, f1_score)
)

# Report everything in one go, one metric per line
print(
    f"Accuracy: {accuracy*100:.2f}%",
    f"Training Accuracy: {training_accuracy:.2f}%",
    f"Test Accuracy: {validation_accuracy:.2f}%",
    f"Precision: {precision:.2f}%",
    f"Recall: {recall:.2f}%",
    f"F1 Score: {f1:.2f}%",
    sep="\n",
)

Accuracy: 96.76%
Training Accuracy: 100.00%
Test Accuracy: 96.76%
Precision: 96.76%
Recall: 96.76%
F1 Score: 96.75%


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Read the cleaned dataset
file_path = 'cleaned_ocean.csv'
data = pd.read_csv(file_path)

# Columns used as model inputs
feature_columns = ['Latitude', 'Longitude', 'Miles', 'Pounds', 'Trash level']

# Keep only rows whose features are all present, then select the labels
# that belong to exactly those surviving rows (matched by index label)
X = data[feature_columns].dropna()
y = data['Cleanup Type'][X.index]

# Map the cleanup type names to integer codes; the fitted encoder is kept
# so predictions can be translated back to names later
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, random_state=20, test_size=0.2
)

# Fit the decision tree on the unscaled training features
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

# Score the held-out rows
y_pred = dt_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Function to predict the cleanup type based on given features
def predict_cleanup_type(features, model, label_encoder):
    """Predict the cleanup type name for a single observation.

    Parameters
    ----------
    features : sequence
        Feature values for one observation, in the same order as the
        columns the model was trained on.
    model : fitted classifier
        Object exposing ``predict``. If it carries ``feature_names_in_``
        those names are used for the input frame; otherwise the columns of
        the notebook-level ``X`` are used.
    label_encoder : fitted encoder
        Object exposing ``inverse_transform``, used to turn the predicted
        code back into the original label.

    Returns
    -------
    The decoded label for the observation.

    Raises
    ------
    ValueError
        If the number of supplied values differs from the number of
        expected feature columns.
    """
    # Prefer the names recorded on the model itself: the notebook-level `X`
    # is reassigned by other cells, so it may not describe this model.
    columns = getattr(model, "feature_names_in_", None)
    if columns is None:
        columns = X.columns
    columns = list(columns)

    # Ensure that the input features match the training data's features
    if len(features) != len(columns):
        raise ValueError(f"Expected {len(columns)} features, but received {len(features)}")

    # Single-row frame with the training column names
    input_features = pd.DataFrame([features], columns=columns)

    # Use the model to predict the cleanup type
    prediction = model.predict(input_features)

    # Convert the numerical label back to the original cleanup type name
    predicted_cleanup_type_name = label_encoder.inverse_transform(prediction)

    return predicted_cleanup_type_name[0]

# Sample observation; replace with real values. One value per training
# column, in the same order as feature_columns.
example_features = [
    47.639629,    # Latitude
    -122.333268,  # Longitude
    1,            # Miles
    26,           # Pounds
    2,            # Trash level
]

# Run the sample through the decision tree and decode the predicted label
predicted_cleanup_type = predict_cleanup_type(
    example_features,
    dt_classifier,
    label_encoder,
)

# Show the decoded prediction
print(f"Predicted cleanup type: {predicted_cleanup_type}")


Predicted cleanup type: Land (beach, shoreline and inland)


___

## <u>Model 2:</u> Random Forest

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
from sklearn.model_selection import train_test_split

# Imported here so the cell does not rely on an earlier cell's import
from sklearn.preprocessing import StandardScaler

# Scale the *current* X_train / X_test in this cell. Reusing scaled matrices
# left over from an earlier cell is only correct if that cell produced the
# very same split as the one y_train / y_test come from; deriving them here
# guarantees that features and labels refer to the same rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=20)

# Train the model
rf_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test_scaled)

# Evaluate the model; every metric is stored as a percentage
accuracy = accuracy_score(y_test, y_pred) * 100
training_accuracy = accuracy_score(y_train, rf_model.predict(X_train_scaled)) * 100
validation_accuracy = accuracy_score(y_test, y_pred) * 100
# 'weighted' averages the per-class scores by class support
precision = precision_score(y_test, y_pred, average='weighted') * 100
recall = recall_score(y_test, y_pred, average='weighted') * 100
f1 = f1_score(y_test, y_pred, average='weighted') * 100

# Print the evaluation metrics in percentage format
print(f"Accuracy (Random Forest): {accuracy:.2f}%")
print(f"Training Accuracy (Random Forest): {training_accuracy:.2f}%")
print(f"Test Accuracy (Random Forest): {validation_accuracy:.2f}%")
print(f"Precision (Random Forest): {precision:.2f}%")
print(f"Recall (Random Forest): {recall:.2f}%")
print(f"F1 Score (Random Forest): {f1:.2f}%")

Accuracy (Random Forest): 98.05%
Training Accuracy (Random Forest): 100.00%
Test Accuracy (Random Forest): 98.05%
Precision (Random Forest): 97.86%
Recall (Random Forest): 98.05%
F1 Score (Random Forest): 97.49%


In [4]:
# Prediction of cleanup type using sample data (Random Forest)

def predict_cleanup_type(features, model, label_encoder, scaler=None):
    """Predict the cleanup type name for a single observation.

    Parameters
    ----------
    features : sequence
        Five feature values, ordered like the columns of the notebook-level
        ``X``.
    model : fitted classifier
        Object exposing ``predict``.
    label_encoder : fitted encoder
        Object exposing ``inverse_transform``, used to decode the prediction.
    scaler : fitted transformer, optional
        When given, its ``transform`` is applied to the input row before
        prediction. Pass the scaler used to prepare the model's training
        features; leave as ``None`` for models trained on unscaled data.

    Returns
    -------
    The decoded label for the observation.

    Raises
    ------
    ValueError
        If ``features`` does not contain exactly five values.
    """
    # Ensure that the input features match the training data's features
    if len(features) != 5:
        raise ValueError(f"Expected 5 features, but received {len(features)}")

    # Create a DataFrame with the input features
    input_features = pd.DataFrame([features], columns=X.columns)

    # Models trained on scaled features must be queried with scaled features
    if scaler is not None:
        input_features = scaler.transform(input_features)

    # Use the model to predict the cleanup type
    prediction = model.predict(input_features)

    # Convert the numerical label back to the original cleanup type name
    predicted_cleanup_type_name = label_encoder.inverse_transform(prediction)

    return predicted_cleanup_type_name[0]

# Example features (numeric values, not strings)
# Order: Latitude, Longitude, Miles, Pounds, Trash level
example_features = [47.639629, -122.333268, 1, 26, 2]

# Predict with the Random Forest from this section. rf_model was fitted on
# scaled features, so the same scaler is applied to the sample first.
predicted_cleanup_type = predict_cleanup_type(example_features, rf_model, label_encoder, scaler=scaler)

# Print out the results
print(f"Predicted cleanup type: {predicted_cleanup_type}")

Predicted cleanup type: Land (beach, shoreline and inland)


___

## <u>Model 3:</u> Logistic Regression

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression  # Import Logistic Regression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Read the cleaned cleanup records
ocean = pd.read_csv('cleaned_ocean.csv')

# Predictors are the site coordinates plus the 'Miles', 'Pounds' and
# 'Trash level' columns; the label to predict is the cleanup type
y = ocean['Cleanup Type']
X = ocean[['Latitude', 'Longitude', 'Miles', 'Pounds', 'Trash level']]

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Scaling statistics come from the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Multinomial logistic regression solved with L-BFGS.
# NOTE(review): newer scikit-learn releases deprecate the `multi_class`
# argument - confirm against the installed version.
lr_model = LogisticRegression(
    solver='lbfgs',
    multi_class='multinomial',
    random_state=42,
)
lr_model.fit(X_train_scaled, y_train)

# Predicted labels for the held-out rows
y_pred = lr_model.predict(X_test_scaled)

# Accuracies are kept as fractions and converted to percentages when printed
accuracy = accuracy_score(y_test, y_pred)
training_accuracy = accuracy_score(y_train, lr_model.predict(X_train_scaled))
validation_accuracy = accuracy_score(y_test, y_pred)

# Support-weighted precision, recall and F1, stored as percentages
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted') * 100
    for metric in (precision_score, recall_score, f1_score)
)

# Report everything, one metric per line
print(
    f"Accuracy: {accuracy*100:.2f}%",
    f"Training Accuracy: {training_accuracy*100:.2f}%",
    f"Test Accuracy: {validation_accuracy*100:.2f}%",
    f"Precision : {precision:.2f}%",
    f"Recall  {recall:.2f}%",
    f"F1 Score : {f1:.2f}%",
    sep="\n",
)

Accuracy: 98.18%
Training Accuracy: 97.86%
Test Accuracy: 98.18%
Precision : 96.42%
Recall  98.18%
F1 Score : 97.29%


In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd

# Read the cleaned cleanup records
ocean = pd.read_csv('cleaned_ocean.csv')

# Target column and the five predictor columns
y = ocean['Cleanup Type']
X = ocean[['Latitude', 'Longitude', 'Miles', 'Pounds', 'Trash level']]

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit the scaler on the training rows, then apply it to both partitions
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Multinomial logistic regression solved with L-BFGS, trained on scaled rows
lr_model = LogisticRegression(
    solver='lbfgs',
    multi_class='multinomial',
    random_state=42,
)
lr_model.fit(X_train_scaled, y_train)

# Prediction of cleanup type using sample data
def predict_cleanup_type(features, model, feature_scaler=None, feature_names=None):
    """Predict the cleanup type for a single observation with a scaled-input model.

    Parameters
    ----------
    features : sequence
        One value per feature column, in column order. Each value must be
        convertible with ``float()`` (numbers or numeric strings).
    model : fitted classifier
        Object exposing ``predict``; it receives the scaled feature row.
    feature_scaler : fitted transformer, optional
        Object exposing ``transform``. Defaults to the notebook-level
        ``scaler``.
    feature_names : sequence of str, optional
        Column names for the input row. Defaults to the columns of the
        notebook-level ``X``.

    Returns
    -------
    The first element of ``model.predict(...)``, i.e. the predicted label in
    whatever form the model was trained on.

    Raises
    ------
    ValueError
        If the number of values does not match the number of feature
        columns, or if a value cannot be converted to ``float``.
    """
    # Fall back to the notebook globals when nothing is passed explicitly
    if feature_names is None:
        feature_names = X.columns
    if feature_scaler is None:
        feature_scaler = scaler
    feature_names = list(feature_names)

    # Ensure that the input features match the training data's features
    if len(features) != len(feature_names):
        raise ValueError(f"Expected {len(feature_names)} features, but received {len(features)}")

    # Coerce every value to float so numeric strings such as '26' are
    # accepted and non-numeric input fails early with a ValueError
    numeric_features = [float(value) for value in features]

    # Create a DataFrame with the input features
    input_features = pd.DataFrame([numeric_features], columns=feature_names)

    # Standardize the input features
    input_features_scaled = feature_scaler.transform(input_features)

    # Use the model to predict the cleanup type
    prediction = model.predict(input_features_scaled)

    return prediction[0]

# Example features; replace with actual numeric values.
# Order: Latitude, Longitude, Miles, Pounds, Trash level
example_features = [47.639629, -122.333268, 1.0, 26.0, 2.0]

# Predict the cleanup type for the provided features
predicted_cleanup_type = predict_cleanup_type(example_features, lr_model)

# lr_model was fitted on the raw 'Cleanup Type' column, so the prediction is
# the label as stored in that column rather than an encoded number
print(f"Predicted cleanup type: {predicted_cleanup_type}")


Predicted cleanup type (numerical label): Land (beach, shoreline and inland)


___

## <u>Model 4:</u> KNN

In [7]:
from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbours classifier: 5 neighbours, votes weighted by inverse distance
knn_model_multi_balanced = KNeighborsClassifier(weights='distance', n_neighbors=5)

# Fit on the scaled training rows currently held in the notebook
knn_model_multi_balanced.fit(X_train_scaled, y_train)

# Predicted labels for the held-out rows
y_pred_multi_balanced_knn = knn_model_multi_balanced.predict(X_test_scaled)

# Accuracy on the training and held-out rows (kept as fractions)
training_accuracy_multi_balanced_knn = accuracy_score(
    y_train, knn_model_multi_balanced.predict(X_train_scaled)
)
accuracy_multi_balanced_knn = accuracy_score(y_test, y_pred_multi_balanced_knn)
validation_accuracy_multi_balanced_knn = accuracy_score(y_test, y_pred_multi_balanced_knn)

# Confusion matrix and per-class report are computed but not displayed in this cell
conf_matrix_multi_balanced_knn = confusion_matrix(y_test, y_pred_multi_balanced_knn)
classification_rep_multi_balanced_knn = classification_report(y_test, y_pred_multi_balanced_knn)

# Support-weighted precision, recall and F1 (fractions)
precision_multi_balanced_knn, recall_multi_balanced_knn, f1_multi_balanced_knn = (
    metric(y_test, y_pred_multi_balanced_knn, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# Report as percentages, one metric per line
print(
    f"Training Accuracy (Multi-class balanced - KNN): {training_accuracy_multi_balanced_knn*100:.2f}%",
    f"Test Accuracy : {validation_accuracy_multi_balanced_knn*100:.2f}%",
    f"Precision : {precision_multi_balanced_knn*100:.2f}%",
    f"Recall : {recall_multi_balanced_knn*100:.2f}%",
    f"F1 Score : {f1_multi_balanced_knn*100:.2f}%",
    sep="\n",
)


Training Accuracy (Multi-class balanced - KNN): 100.00%
Test Accuracy : 98.21%
Precision : 97.77%
Recall : 98.21%
F1 Score : 97.90%


In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

# Read the cleaned cleanup records
ocean = pd.read_csv('cleaned_ocean.csv')

# Target column and the five predictor columns
y = ocean['Cleanup Type']
X = ocean[['Latitude', 'Longitude', 'Miles', 'Pounds', 'Trash level']]

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit the scaler on the training rows, then apply it to both partitions
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# KNN with 5 neighbours (the neighbour count can be tuned).
# NOTE(review): the KNN evaluated in the previous cell used
# weights='distance'; this one uses the default weighting - confirm which
# configuration the prediction should come from.
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_scaled, y_train)

# Prediction function
def predict_cleanup_type_knn(features, model, feature_scaler=None, feature_names=None):
    """Predict the cleanup type for a single observation with a KNN model.

    Parameters
    ----------
    features : sequence
        One value per feature column, in column order. Each value must be
        convertible with ``float()`` (numbers or numeric strings).
    model : fitted classifier
        Object exposing ``predict``; it receives the scaled feature row.
    feature_scaler : fitted transformer, optional
        Object exposing ``transform``. Defaults to the notebook-level
        ``scaler``.
    feature_names : sequence of str, optional
        Column names for the input row. Defaults to the columns of the
        notebook-level ``X``.

    Returns
    -------
    The first element of ``model.predict(...)``.

    Raises
    ------
    ValueError
        If the number of values does not match the number of feature
        columns, or if a value cannot be converted to ``float``.
    """
    # Fall back to the notebook globals when nothing is passed explicitly
    if feature_names is None:
        feature_names = X.columns
    if feature_scaler is None:
        feature_scaler = scaler
    feature_names = list(feature_names)

    # Ensure that the input features match the training data's features
    if len(features) != len(feature_names):
        raise ValueError(f"Expected {len(feature_names)} features, but received {len(features)}")

    # Coerce every value to float so numeric strings are accepted and
    # non-numeric input fails early with a ValueError
    numeric_features = [float(value) for value in features]

    # Create a DataFrame with the input features
    input_features = pd.DataFrame([numeric_features], columns=feature_names)

    # Standardize the input features
    input_features_scaled = feature_scaler.transform(input_features)

    # Use the model to predict the cleanup type
    prediction = model.predict(input_features_scaled)

    return prediction[0]

# Example features
# Order: Latitude, Longitude, Miles, Pounds, Trash level
example_features_knn = [47.639629, -122.333268, 1.0, 26.0, 2.0]

# Predict the cleanup type using KNN
predicted_cleanup_type_knn = predict_cleanup_type_knn(example_features_knn, knn_model)

# knn_model was fitted on the raw 'Cleanup Type' column, so the prediction is
# the label as stored in that column rather than an encoded number
print(f"Predicted cleanup type: {predicted_cleanup_type_knn}")


Predicted cleanup type (numerical label): Land (beach, shoreline and inland)


___

## <u>Model 5:</u> SVM

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC  # Import SVM Classifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Read the cleaned cleanup records
ocean = pd.read_csv('cleaned_ocean.csv')

# Predictors are the site coordinates plus the 'Miles', 'Pounds' and
# 'Trash level' columns; the label to predict is the cleanup type
y = ocean['Cleanup Type']
X = ocean[['Latitude', 'Longitude', 'Miles', 'Pounds', 'Trash level']]

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Scaling statistics come from the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Support vector classifier with a linear kernel ('rbf', 'poly', ... are alternatives)
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_scaled, y_train)

# Predicted labels for the held-out rows
y_pred = svm_model.predict(X_test_scaled)

# Accuracies are kept as fractions and converted to percentages when printed
accuracy = accuracy_score(y_test, y_pred)
training_accuracy = accuracy_score(y_train, svm_model.predict(X_train_scaled))
validation_accuracy = accuracy_score(y_test, y_pred)

# Support-weighted precision, recall and F1 (fractions)
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# Report everything, one metric per line
print(
    f"Accuracy: {accuracy*100:.2f}%",
    f"Training Accuracy: {training_accuracy*100:.2f}%",
    f"Test Accuracy: {validation_accuracy*100:.2f}%",
    f"Precision : {precision*100:.2f}%",
    f"Recall : {recall*100:.2f}%",
    f"F1 Score : {f1*100:.2f}%",
    sep="\n",
)


Accuracy: 98.19%
Training Accuracy: 97.86%
Test Accuracy: 98.19%
Precision : 96.42%
Recall : 98.19%
F1 Score : 97.30%


In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import pandas as pd

# Read the cleaned cleanup records
ocean = pd.read_csv('cleaned_ocean.csv')

# Target column and the five predictor columns
y = ocean['Cleanup Type']
X = ocean[['Latitude', 'Longitude', 'Miles', 'Pounds', 'Trash level']]

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit the scaler on the training rows, then apply it to both partitions
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Support vector classifier with an RBF kernel.
# NOTE(review): the SVM evaluated in the previous cell used kernel='linear',
# whereas this one uses 'rbf' - confirm which model the prediction should
# come from.
svm_model = SVC(random_state=42, kernel='rbf')
svm_model.fit(X_train_scaled, y_train)

# Prediction function
def predict_cleanup_type_svm(features, model, feature_scaler=None, feature_names=None):
    """Predict the cleanup type for a single observation with an SVM model.

    Parameters
    ----------
    features : sequence
        One value per feature column, in column order. Each value must be
        convertible with ``float()`` (numbers or numeric strings).
    model : fitted classifier
        Object exposing ``predict``; it receives the scaled feature row.
    feature_scaler : fitted transformer, optional
        Object exposing ``transform``. Defaults to the notebook-level
        ``scaler``.
    feature_names : sequence of str, optional
        Column names for the input row. Defaults to the columns of the
        notebook-level ``X``.

    Returns
    -------
    The first element of ``model.predict(...)``.

    Raises
    ------
    ValueError
        If the number of values does not match the number of feature
        columns, or if a value cannot be converted to ``float``.
    """
    # Fall back to the notebook globals when nothing is passed explicitly
    if feature_names is None:
        feature_names = X.columns
    if feature_scaler is None:
        feature_scaler = scaler
    feature_names = list(feature_names)

    # Ensure that the input features match the training data's features
    if len(features) != len(feature_names):
        raise ValueError(f"Expected {len(feature_names)} features, but received {len(features)}")

    # Coerce every value to float so numeric strings are accepted and
    # non-numeric input fails early with a ValueError
    numeric_features = [float(value) for value in features]

    # Create a DataFrame with the input features
    input_features = pd.DataFrame([numeric_features], columns=feature_names)

    # Standardize the input features
    input_features_scaled = feature_scaler.transform(input_features)

    # Use the model to predict the cleanup type
    prediction = model.predict(input_features_scaled)

    return prediction[0]

# Example features
# Order: Latitude, Longitude, Miles, Pounds, Trash level
example_features_svm = [47.639629, -122.333268, 1.0, 26.0, 2.0]

# Predict the cleanup type using SVM
predicted_cleanup_type_svm = predict_cleanup_type_svm(example_features_svm, svm_model)

# svm_model was fitted on the raw 'Cleanup Type' column, so the prediction is
# the label as stored in that column rather than an encoded number
print(f"Predicted cleanup type: {predicted_cleanup_type_svm}")

Predicted cleanup type (numerical label): Land (beach, shoreline and inland)
