# Import Libraries

In [49]:
import tensorflow
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from keras.layers import Dense, Conv2D, MaxPool2D, Dropout, Flatten, MaxPooling2D, GlobalAveragePooling2D
from keras.models import Sequential, load_model
from keras.preprocessing.image import ImageDataGenerator
from keras.losses import categorical_crossentropy
from keras.callbacks import EarlyStopping
from tensorflow.keras.applications import EfficientNetB0
#from google.colab import drive
from keras.utils.traceback_utils import include_frame

tensorflow.random.set_seed(42)



# Connect to drive

In [2]:
""" drive.mount('/content/drive/') """

" drive.mount('/content/drive/') "

# Open zip in google Drive

In [3]:
""" from zipfile import ZipFile
file_name = 'drive/MyDrive/datasets/data.zip'

with ZipFile(file_name, 'r') as zip:
  zip.extractall()
  print('Done') """

" from zipfile import ZipFile\nfile_name = 'drive/MyDrive/datasets/data.zip'\n\nwith ZipFile(file_name, 'r') as zip:\n  zip.extractall()\n  print('Done') "

# Import EDA Data

In [4]:
# Helpers

#                ['antelope_duiker', 'bird', 'blank', 'civet_genet', 'hog', 'leopard', 'monkey_prosimian', 'rodent']
# Classification:           1           2        3           4          5        6              7               8
def merge_animals(df):
    """_summary_

    Args:
        df (_type_): _description_

    Returns:
        _type_: _description_
    """
    df_copy = df.copy() # copy df
    columns = df_copy.columns # get column names
    df_copy['animal_classification'] = np.where(df_copy.values)[1]+1 # add a numeric value to each column
    df_copy.drop(columns ,axis=1, inplace=True) # drop columns that were just combined
    return df_copy

def plot_metrics(model_fit):
    metrics = ['accuracy', 'precision', 'recall']
    for i in metrics:
        plt.plot(model_fit.history[i], label='Train')
        plt.plot(model_fit.history[f'val_{i}'], label='Test')
        plt.ylabel(i)
        plt.xlabel('Epochs')
        plt.legend()
        plt.show()

In [5]:
test_features = pd.read_csv('../data/test_features.csv')
train_features = pd.read_csv('../data/train_features.csv')
train_labels = pd.read_csv('../data/train_labels.csv')

In [6]:
test_features.head(3)

Unnamed: 0,id,filepath,site
0,ZJ016488,test_features/ZJ016488.jpg,S0082
1,ZJ016489,test_features/ZJ016489.jpg,S0040
2,ZJ016490,test_features/ZJ016490.jpg,S0040


In [7]:
train_features.head(3)

Unnamed: 0,id,filepath,site
0,ZJ000000,train_features/ZJ000000.jpg,S0120
1,ZJ000001,train_features/ZJ000001.jpg,S0069
2,ZJ000002,train_features/ZJ000002.jpg,S0009


In [8]:
train_labels.head(3)

Unnamed: 0,id,antelope_duiker,bird,blank,civet_genet,hog,leopard,monkey_prosimian,rodent
0,ZJ000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ZJ000001,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,ZJ000002,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


# Make Validation And Train Data

In [9]:
# associate image directories to their respective image labels
train = pd.merge(left=train_features, right=train_labels, on='id', how='inner') 

In [10]:
train.head(3)

Unnamed: 0,id,filepath,site,antelope_duiker,bird,blank,civet_genet,hog,leopard,monkey_prosimian,rodent
0,ZJ000000,train_features/ZJ000000.jpg,S0120,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ZJ000001,train_features/ZJ000001.jpg,S0069,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,ZJ000002,train_features/ZJ000002.jpg,S0009,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# generate a column that has the actual label ID of the image
train['animal_classification'] = merge_animals(train[['antelope_duiker', 'bird', 'blank', 'civet_genet', 'hog', 'leopard', 'monkey_prosimian', 'rodent']]) 

In [12]:
train.head(3)

Unnamed: 0,id,filepath,site,antelope_duiker,bird,blank,civet_genet,hog,leopard,monkey_prosimian,rodent,animal_classification
0,ZJ000000,train_features/ZJ000000.jpg,S0120,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2
1,ZJ000001,train_features/ZJ000001.jpg,S0069,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,7
2,ZJ000002,train_features/ZJ000002.jpg,S0009,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2


In [13]:
# replace label ID with label name of each image
train['animal_classification'] = train['animal_classification'].map({1:'antelope_duiker', 2:'bird', 3:'blank', 4:'civet_genet', 5:'hog', 6:'leopard', 7:'monkey_prosimian', 8:'rodent'})

In [14]:
train.head(3)

Unnamed: 0,id,filepath,site,antelope_duiker,bird,blank,civet_genet,hog,leopard,monkey_prosimian,rodent,animal_classification
0,ZJ000000,train_features/ZJ000000.jpg,S0120,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,bird
1,ZJ000001,train_features/ZJ000001.jpg,S0069,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,monkey_prosimian
2,ZJ000002,train_features/ZJ000002.jpg,S0009,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,bird


In [15]:

# adding columns to train dataset that store image file location and file name
temp = train['filepath'].str.split(pat='/',expand=True) # split file path column to get file names
temp.rename(columns={0: 'old_folder_location', 1: 'filename'}, inplace=True) # rename split columns
train = pd.concat([train, temp], axis=1).drop(columns=['filepath'],axis=1) # concat columns and original df

# same for test (holdout set for predictions)
temp2 = test_features['filepath'].str.split(pat='/', expand=True)
temp2.rename(columns={0: 'old_folder_location', 1: 'filename'}, inplace=True) # rename split columns
test = pd.concat([test_features, temp2], axis=1).drop(columns=['filepath'],axis=1) # concat columns


In [16]:
train.head(3)

Unnamed: 0,id,site,antelope_duiker,bird,blank,civet_genet,hog,leopard,monkey_prosimian,rodent,animal_classification,old_folder_location,filename
0,ZJ000000,S0120,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,bird,train_features,ZJ000000.jpg
1,ZJ000001,S0069,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,monkey_prosimian,train_features,ZJ000001.jpg
2,ZJ000002,S0009,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,bird,train_features,ZJ000002.jpg


In [27]:
train.isna().sum() # hold-out set ha no NAs

id                       0
site                     0
antelope_duiker          0
bird                     0
blank                    0
civet_genet              0
hog                      0
leopard                  0
monkey_prosimian         0
rodent                   0
animal_classification    0
old_folder_location      0
filename                 0
dtype: int64

In [28]:
test.head(3)

Unnamed: 0,id,site,old_folder_location,filename
0,ZJ016488,S0082,test_features,ZJ016488.jpg
1,ZJ016489,S0040,test_features,ZJ016489.jpg
2,ZJ016490,S0040,test_features,ZJ016490.jpg


In [29]:
# make validation set
validation_set = train[(train['site']=='S0009') | (train['site']=='S0043')| (train['site']=='S0059') |(train['site']== 'S0026')] # get validation set for 4 sites

In [30]:
validation_set.head(3)

Unnamed: 0,id,site,antelope_duiker,bird,blank,civet_genet,hog,leopard,monkey_prosimian,rodent,animal_classification,old_folder_location,filename
2,ZJ000002,S0009,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,bird,train_features,ZJ000002.jpg
9,ZJ000009,S0059,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,bird,train_features,ZJ000009.jpg
44,ZJ000044,S0059,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,civet_genet,train_features,ZJ000044.jpg


In [31]:
# make training set
train_set = train[~train.isin(validation_set)].dropna() # remove the observations from train that are in the validation set

In [32]:
train_set.head(3)

Unnamed: 0,id,site,antelope_duiker,bird,blank,civet_genet,hog,leopard,monkey_prosimian,rodent,animal_classification,old_folder_location,filename
0,ZJ000000,S0120,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,bird,train_features,ZJ000000.jpg
1,ZJ000001,S0069,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,monkey_prosimian,train_features,ZJ000001.jpg
3,ZJ000003,S0008,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,monkey_prosimian,train_features,ZJ000003.jpg


In [33]:
validation_set['animal_classification'].value_counts(normalize=True)

monkey_prosimian    0.187879
leopard             0.168485
rodent              0.146667
bird                0.142424
civet_genet         0.122424
hog                 0.113939
antelope_duiker     0.064848
blank               0.053333
Name: animal_classification, dtype: float64

# Set up Model Data

In [34]:
train_path = '../data/train_features_img/'
test_path = '../data/test_features_img'

In [35]:
labels = list(train.columns[2:11])
labels

['antelope_duiker',
 'bird',
 'blank',
 'civet_genet',
 'hog',
 'leopard',
 'monkey_prosimian',
 'rodent',
 'animal_classification']

In [36]:
# image Gen stuff
img_gen = ImageDataGenerator(
        # brightness_range=[.4, 1.2],
)
val_generator = img_gen.flow_from_dataframe(
    validation_set, 
    directory=train_path, 
    x_col='filename', 
    y_col='animal_classification', 
    #y_col=labels,
    target_size=(256, 256), 
    class_mode='categorical',
    #class_mode='raw',
    batch_size=32,
    seed=42
)
train_generator = img_gen.flow_from_dataframe(
    train_set, 
    directory=train_path, 
    x_col='filename', 
    y_col='animal_classification', 
    #y_col=labels, 
    target_size=(256, 256), 
    class_mode='categorical',
    #class_mode='raw',
    batch_size=32,
    seed=42
)

Found 1650 validated image filenames belonging to 8 classes.
Found 14838 validated image filenames belonging to 8 classes.


In [47]:
# hold-out set of images for predictions
img_gen2 = ImageDataGenerator()
test_generator = img_gen2.flow_from_dataframe(
    test,
    directory=test_path,
    x_col='filename',
    class_mode=None,
    target_size=(256, 256),
    batch_size=32
)

Found 4464 validated image filenames.


# Model Testing

In [32]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import CategoricalAccuracy, Recall, Precision
from tensorflow.keras.losses import CategoricalCrossentropy

In [44]:
model = Sequential()
# layers
model.add(Conv2D(filters=16, kernel_size=(3,3), activation='relu'))
model.add(MaxPool2D(pool_size=(2,2)))
model.add(Flatten())

model.add(Dense(units=32, activation='relu'))
# output layer
model.add(Dense(units=8, activation='softmax'))

model.compile(loss=CategoricalCrossentropy(), optimizer=Adam(), metrics=[CategoricalAccuracy(), Recall(), Precision()])

history = model.fit(
    train_generator,
    batch_size=32,
    epochs=1,
    validation_data=val_generator
)



In [45]:
#model.save('./model/')

In [51]:
smodel = load_model('./model/')
smodel.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, None, None, 16)    448       
                                                                 
 max_pooling2d (MaxPooling2D  (None, None, None, 16)   0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, None)              0         
                                                                 
 dense (Dense)               (None, 100)               25806500  
                                                                 
 dense_1 (Dense)             (None, 8)                 808       
                                                                 
Total params: 25,807,756
Trainable params: 25,807,756
Non-trainable params: 0
____________________________________________

## Predictions

In [52]:
smodel.predict(test_generator)

array([[0.13816541, 0.11880668, 0.13331893, ..., 0.13230278, 0.13193077,
        0.12551704],
       [0.13816541, 0.11880668, 0.13331893, ..., 0.13230278, 0.13193077,
        0.12551704],
       [0.13816541, 0.11880668, 0.13331893, ..., 0.13230278, 0.13193077,
        0.12551704],
       ...,
       [0.13816541, 0.11880668, 0.13331893, ..., 0.13230278, 0.13193077,
        0.12551704],
       [0.13816541, 0.11880668, 0.13331893, ..., 0.13230278, 0.13193077,
        0.12551704],
       [0.13816541, 0.11880668, 0.13331893, ..., 0.13230278, 0.13193077,
        0.12551704]], dtype=float32)