# Whale and Dolphin Classification Project

Authors:
- Victor Möslein
- Maren Rieker
- Reed Garvin
- Dinah Rabe

This notebook is one of three core notebooks of the Whale and Dolphin Classification Project for the "Machine Learning" class at the Hertie School of Governance. It focuses on applying classic machine learning models to the classification task. The other two notebooks cover data preprocessing and the application of a deep learning model, respectively.

The code of this notebook partly follows the chapter on classification from the book "Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow" by Aurélien Géron.

In [None]:
## Setup: System settings and packages

In [None]:
# Python ≥3.5 is required

import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

import random
import numpy as np
from numpy import load
import pandas as pd
from numpy import savez_compressed
import os
import timeit
import seaborn as sns
import pickle
import PIL


# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# to make this notebook's output stable
np.random.seed(42)

In [None]:
full_data_switch_on = False  # set this switch to True to use the full data set

## Define paths to data and for output

In [None]:
# path to clean data folder
ROOT_PATH_DATA = "input/04_cleaned/"

# where to save figures
ROOT_PATH_FIG = "output/ml_models/01_figures"
# create the figure folder up front so figures can be written into it
os.makedirs(ROOT_PATH_FIG, exist_ok=True)

# where to save output

ROOT_OUTPUT = "output/ml_models/"
# pass the components separately so os.path.join actually does the joining
OUTPUT_PATH_TRAIN_EVAL = os.path.join(ROOT_OUTPUT, "02_training_set_evaluation")
OUTPUT_PATH_TEST_EVAL = os.path.join(ROOT_OUTPUT, "03_test_set_evaluation")
OUTPUT_PATH_HYPPAR_TUN = os.path.join(ROOT_OUTPUT, "04_hyperparamter_tuning")
OUTPUT_PATH_RUN_TIME = os.path.join(ROOT_OUTPUT, "05_runtime_stats")

# function to save figures

def save_fig(fig_id, SAVE_PATH=None, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure as ``<SAVE_PATH>/<fig_id>.<fig_extension>``.

    Parameters
    ----------
    fig_id : str
        File name of the figure without extension.
    SAVE_PATH : str, optional
        Target directory. When omitted, ``ROOT_PATH_FIG`` is used; it is looked
        up at call time so the definition does not depend on cell order.
    tight_layout : bool
        If true, ``plt.tight_layout()`` is called before saving.
    fig_extension : str
        File extension; also passed to ``plt.savefig`` as ``format``.
    resolution : int
        Passed to ``plt.savefig`` as ``dpi``.
    """
    if SAVE_PATH is None:
        SAVE_PATH = ROOT_PATH_FIG
    # make sure the target folder exists, also for custom SAVE_PATH values
    os.makedirs(SAVE_PATH, exist_ok=True)
    path = os.path.join(SAVE_PATH, fig_id + "." + fig_extension)
    print(">... Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

## Loading and splitting training data

In [None]:
# load npz files
# NOTE(review): the names in this cell are bare placeholders; no loading code is
# present here, so each line only evaluates the name and raises NameError
# unless it has already been defined -- TODO add the actual load calls.
img_data_train_full 
img_data_test
# load csv files and subset relevant column
labels_train_full
labels_test
pic_ids_train_full
pic_ids_test

# Split into training and validation set
from sklearn.model_selection import train_test_split
# Images, labels and picture ids are split together in one call.
# train_size=0.10 puts 10% of the samples into the training split and the
# remainder into the validation split; random_state=42 makes it reproducible.
# NOTE(review): confirm that a 10% training share is intended.
img_data_train, img_data_val, labels_train, labels_val, pic_ids_train, pic_ids_val = train_test_split(img_data_train_full , labels_train_full, pic_ids_train_full, train_size=0.10, random_state=42)


## Implementing baseline model

In [None]:
def train_clasf(classifier_x, img_data_train, labels_train):
    """Fit ``classifier_x`` on the given data and report how long training took.

    Prints a start message, calls ``classifier_x.fit(img_data_train,
    labels_train)`` and then prints a completion message containing the elapsed
    time in seconds, rounded to three decimals. Nothing is returned.
    """
    # the class name of the estimator is used as its display name
    name = classifier_x.__class__.__name__

    print(">... Starting training of", name)

    # time only the fit call itself
    t_start = timeit.default_timer()
    classifier_x.fit(img_data_train, labels_train)
    seconds = round(timeit.default_timer() - t_start, 3)

    print(">... Classifier %s sucessfully trained in %s seconds." % (name, seconds))
        

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline model: logistic regression with default settings and a fixed seed,
# fitted on the training split through the shared training helper.
classifier_LR = LogisticRegression(random_state=42)
train_clasf(
    classifier_x=classifier_LR,
    img_data_train=img_data_train,
    labels_train=labels_train,
)

In [None]:
## TODO: do we have a preference here?

# In the multiclass case, the training algorithm uses the one-vs-rest (OvR)
# scheme if the 'multi_class' option is set to 'ovr', and the cross-entropy
# loss if the 'multi_class' option is set to 'multinomial'.
# 'auto' selects 'ovr' if the data is binary or if solver='liblinear', and
# otherwise selects 'multinomial'.
# The default is 'auto'.

## Evaluating baseline model ("compute metrics on train AND dev")

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

In [None]:
# store predictions of classifier
# NOTE(review): this evaluates on the training split; use img_data_val and
# labels_val instead to compute the same metrics on the validation (dev) split.
pred = classifier_LR.predict(img_data_train)
labels = labels_train

# evaluate classifier and store metrics
# NOTE(review): with more than two classes these scorers need an explicit
# `average=` argument -- confirm against the label set.
# The built-in round() is used because it works for NumPy scalars and plain floats alike.
evaluation_scores = {}
evaluation_scores["Precision Score"] = round(precision_score(labels, pred), 3)
evaluation_scores["Recall Score"] = round(recall_score(labels, pred), 3)
evaluation_scores["F1 Score"] = round(f1_score(labels, pred), 3)

In [None]:
# save evaluation scores 
def store_eval_score(image_df, classifier_name=None):
    """Save evaluation scores as a compressed ``.npz`` file in ``OUTPUT_PATH_TRAIN_EVAL``.

    Parameters
    ----------
    image_df : object
        The scores to store; handed to ``savez_compressed`` as a single
        positional array.
    classifier_name : str, optional
        Suffix appended to the file name ``evaluation_scores``. When omitted, a
        module-level ``classifier_name`` is used if one exists; otherwise no
        suffix is added.
    """
    if classifier_name is None:
        # ``classifier_name`` is only assigned as a local inside ``train_clasf``,
        # so a plain global lookup here would raise NameError.
        classifier_name = globals().get("classifier_name", "")
    # the output folder is not created anywhere else before saving
    os.makedirs(OUTPUT_PATH_TRAIN_EVAL, exist_ok=True)
    savez_compressed(OUTPUT_PATH_TRAIN_EVAL + "/evaluation_scores"+str(classifier_name)+".npz",image_df)
    print("file successfully stored in: output/ml_models/02_training_set_evaluation")


In [None]:
# write the metrics dictionary to the training-evaluation output folder
store_eval_score(image_df=evaluation_scores)

In [None]:
# inspecting the errors
output_dict = {}
# one row per picture: [picture id, true label, predicted label]
output_array = np.c_[pic_ids, labels, pred] ## adjust name of pic_ids and labels depending on train or val

# Flag every row whose label differs from its prediction. A single vectorised
# comparison replaces the per-row np.append loop, which copied the array on
# every iteration.
err_type_arr = np.where(output_array[:, 1] != output_array[:, 2], "error", "No error")

error_table_pd = pd.DataFrame(output_array)
error_table_pd.rename(columns = {0:'Picture ID', 1:"Label", 2:"Predicted"}, inplace = True)
error_table_pd["Error Check"] = err_type_arr

# print filtered error table
# (Series.isin requires a list-like argument; a bare string raises TypeError)
print(error_table_pd.loc[error_table_pd["Error Check"].isin(["error"])].sort_values(by=["Label", "Picture ID"]))


In [None]:
# def function for saving the filtered error table

def store_error_table(image_df, classifier_name=None):
    """Save an error table as a compressed ``.npz`` file in ``OUTPUT_PATH_TRAIN_EVAL``.

    Parameters
    ----------
    image_df : object
        The data to store; handed to ``savez_compressed`` as a single
        positional array.
    classifier_name : str, optional
        Suffix appended to the file name ``error_table``. When omitted, a
        module-level ``classifier_name`` is used if one exists; otherwise no
        suffix is added.
    """
    if classifier_name is None:
        # ``classifier_name`` is only assigned as a local inside ``train_clasf``,
        # so a plain global lookup here would raise NameError.
        classifier_name = globals().get("classifier_name", "")
    # the output folder is not created anywhere else before saving
    os.makedirs(OUTPUT_PATH_TRAIN_EVAL, exist_ok=True)
    savez_compressed(OUTPUT_PATH_TRAIN_EVAL + "/error_table"+str(classifier_name)+".npz",image_df)
    print("file successfully stored in: output/ml_models/02_training_set_evaluation")


In [None]:
# transform pd frame into dictionary for saving
output_dict["error_table"] = error_table_pd

# saving the error table
store_error_table(output_dict)

## Implementing RandomForest Classifier as advanced model