**iWildCam 2019 EDA**

The aim of this competition is to classify animal species from the collected images.
The competition uses training data and test data from different regions, mainly the American Southwest and the American Northwest. The target variable is "category_id".

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import matplotlib.pyplot as plt
import tensorflow as tf
from keras_preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import RMSprop


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
import gc
import os
import json
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image
import matplotlib.pyplot as plt
import keras
from keras import layers
from keras.applications import DenseNet121
from keras.callbacks import Callback, ModelCheckpoint
from keras.preprocessing.image import ImageDataGenerator
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
warnings.filterwarnings('ignore')

In [None]:
# Read ../input/train.csv into the train_df DataFrame.
train_df=pd.read_csv('../input/train.csv')

In [None]:
# Read ../input/test.csv into the test_df DataFrame.
test_df=pd.read_csv('../input/test.csv')

To check how many categories (target) there are:

In [None]:
# Number of distinct category_id values in the training data.
train_df.category_id.nunique()

Selecting four photos at random for each category:

In [None]:
# Show a few randomly drawn training images per category, one category per row.
SAMPLES_PER_CATEGORY = 4
category_ids = train_df['category_id'].unique()

imgs = []
labels = []
for category_id in category_ids:
    candidates = train_df.loc[train_df['category_id'] == category_id, 'file_name']
    # np.random.choice draws with replacement, so a category with fewer than
    # SAMPLES_PER_CATEGORY images still fills its row.
    imgs.extend(np.random.choice(candidates, SAMPLES_PER_CATEGORY))
    labels.extend([category_id] * SAMPLES_PER_CATEGORY)

# Size the grid from the data instead of assuming exactly 14 categories;
# add_subplot raises if idx + 1 exceeds rows * columns.
n_rows = len(category_ids)
fig = plt.figure(figsize=(25, 60))
for idx, (img, label) in enumerate(zip(imgs, labels)):
    ax = fig.add_subplot(n_rows, SAMPLES_PER_CATEGORY, idx + 1, xticks=[], yticks=[])
    # imshow copies the pixel data, so the file can be closed right after drawing.
    with Image.open("../input/train_images/" + img) as im:
        ax.imshow(im)
    ax.set_title(f'Label: {label}')

In [None]:
# Preview the first rows of the training table.
train_df.head()

In [None]:
# Preview the first rows of the test table.
test_df.head()

In [None]:
# (rows, columns) of the training table.
train_df.shape

In [None]:
# (rows, columns) of the test table.
test_df.shape

We can see that category_id is the target variable. Each category is an animal, and category 0 is "empty", meaning there is no animal in the picture.
I think the useful variables are:
Date: we can derive day, month, hour and season to understand when it is more likely to see a particular animal.
Location: maybe animals are location-specific.
Seq_num_frames: to see whether it is more likely to have an empty category when there is only one frame.


Let's assess which variables have the highest correlation with the target.


In [None]:
# Pearson correlation of every numeric column with the target (category_id),
# plotted for the columns whose absolute correlation exceeds 0.20.
trainset_c = train_df
labels = []
values = []
for col in trainset_c.columns:
    # Only numeric columns can be correlated. Checking for a numeric dtype
    # (rather than "not object") also skips the datetime column that a later
    # cell adds, so this cell stays re-runnable.
    if col == "category_id" or not pd.api.types.is_numeric_dtype(trainset_c[col]):
        continue
    labels.append(col)
    values.append(np.corrcoef(trainset_c[col].values, trainset_c["category_id"].values)[0, 1])

corr_df = pd.DataFrame({'columns_labels': labels, 'corr_values': values})
corr_df = corr_df.sort_values(by='corr_values')

# Keep only the clearly correlated columns; NaN correlations drop out here too.
corr_df = corr_df[corr_df['corr_values'].abs() > 0.20]

ind = np.arange(corr_df.shape[0])
fig, ax = plt.subplots(figsize=(10, 6))
rects = ax.barh(ind, np.array(corr_df.corr_values.values), color='gold')
ax.set_yticks(ind)
ax.set_yticklabels(corr_df.columns_labels.values, rotation='horizontal')
ax.set_xlabel("Correlation coefficient")
ax.set_title("Correlation coefficient of the variables")
plt.show()

In [None]:
# Violin plot of seq_num_frames for each category_id.
# (The former first line was a discarded .loc lookup that only risked a KeyError.)
fig, ax = plt.subplots(figsize=(12, 10))
sns.violinplot(x='category_id', y='seq_num_frames', data=train_df, ax=ax)
ax.set_xlabel('category_id', fontsize=12)
ax.set_ylabel('seq_num_frames', fontsize=12)
plt.show()

We can see that when the category is empty, the majority of its sequences have only one frame. The rest is quite evenly distributed.


In [None]:
from datetime import date, datetime
# Parse the capture timestamp; values that cannot be parsed become NaT (errors='coerce').
train_df['date_time'] = pd.to_datetime(train_df['date_captured'], errors='coerce')

# Split the timestamp into one column per calendar component.
for part in ('year', 'month', 'day', 'hour'):
    train_df[part] = getattr(train_df['date_time'].dt, part)

# Month -> season code: 1 = spring, 2 = summer, 3 = autumn, 4 = winter.
season_by_month = {
    12: 4, 1: 4, 2: 4,
    3: 1, 4: 1, 5: 1,
    6: 2, 7: 2, 8: 2,
    9: 3, 10: 3, 11: 3,
}
train_df['season'] = train_df['month'].map(season_by_month)

In [None]:
import matplotlib.ticker as ticker
# Violin plot of the season code for each category_id.
# (The former first line was a discarded .loc lookup that only risked a KeyError.)
fig, ax = plt.subplots(figsize=(12, 10))
sns.violinplot(x='category_id', y='season', data=train_df, ax=ax)
ax.set_xlabel('category_id', fontsize=12)
ax.set_ylabel('season', fontsize=12)

# season holds integer codes, so place one major tick per integer.
ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
ax.yaxis.set_major_formatter(ticker.ScalarFormatter())
plt.show()

we can clearly see that the category_id=22 is appearing only  during summer and autumn

Rights_holder is also important because a specific photographer could be specialised in some categories more than in others. Let's transform this variable into 0 (Justin B.) or 1 (Erin B.).

In [None]:
# Encode rights_holder as an integer; names missing from the mapping become NaN.
rights_holder_codes = {'Justin Brown': 0, 'Erin Boydston': 1}
train_df['Rights_holderN'] = train_df['rights_holder'].map(rights_holder_codes)

In [None]:
# Preview the training table again, now including the derived columns.
train_df.head()

In [None]:
# Cast the target column to strings.
# NOTE(review): presumably needed because flow_from_dataframe with
# class_mode="categorical" expects string labels — confirm against the Keras docs.
train_df['category_id'] = train_df['category_id'].astype(str)

To create the best model it is important to split the training data into a training set and a validation set.

SOLUTION 1

In [None]:
from keras.models import Sequential
#Import from keras_preprocessing not from keras.preprocessing
from keras_preprocessing.image import ImageDataGenerator
from keras.layers import Dense, Activation, Flatten, Dropout, BatchNormalization
from keras.layers import Conv2D, MaxPooling2D
from keras import regularizers, optimizers
import pandas as pd
import numpy as np

In [None]:
# Hyper-parameters for the first solution.
batch_size = 32   # images per training/validation batch
img_size = 32     # images are resized to img_size x img_size
nb_epochs = 10    # number of training epochs

In [None]:
# Scale pixel values to [0, 1] and reserve 25% of the rows for validation.
train_datagen = ImageDataGenerator(validation_split=0.25, rescale=1./255)

# Training subset: file names come from `file_name`, labels from `category_id`.
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory='../input/train_images',
    x_col='file_name',
    y_col='category_id',
    subset="training",
    class_mode='categorical',
    batch_size=batch_size,
    target_size=(img_size, img_size),
)

In [None]:

# Validation subset: same dataframe and datagen as the training generator.
valid_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory="../input/train_images/",
    x_col="file_name",
    y_col="category_id",
    subset="validation",
    batch_size=batch_size,
    seed=42,
    class_mode="categorical",
    target_size=(img_size, img_size))


test_datagen = ImageDataGenerator(rescale=1./255.)

# Unlabelled test images. They must be resized to the resolution the model is
# trained on (img_size), and must NOT be shuffled, otherwise the rows of the
# prediction array do not line up with test_generator.filenames.
test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_df,
    directory="../input/test_images/",
    x_col="file_name",
    y_col=None,
    batch_size=200,
    seed=42,
    shuffle=False,
    class_mode=None,
    target_size=(img_size, img_size))

In [None]:
# Small CNN: two (conv, conv, max-pool, dropout) blocks followed by a dense head.
# The input size and the output width follow the generators instead of being
# hard-coded, so changing img_size or the set of categories cannot desynchronise them.
num_classes = len(train_generator.class_indices)

model = Sequential()
model.add(Conv2D(32, (3, 3), padding='same',
                 input_shape=(img_size, img_size, 3)))
model.add(Activation('relu'))
model.add(Conv2D(32, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Conv2D(32, (3, 3), padding='same'))
model.add(Activation('relu'))
model.add(Conv2D(32, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))
# Use the RMSprop class; the lowercase `rmsprop` alias does not exist in every Keras release.
model.compile(optimizers.RMSprop(lr=0.0001, decay=1e-6),
              loss="categorical_crossentropy",
              metrics=["accuracy"])

In [None]:
# Whole batches per epoch for training/validation (a trailing partial batch is skipped).
STEP_SIZE_TRAIN = train_generator.n // train_generator.batch_size
STEP_SIZE_VALID = valid_generator.n // valid_generator.batch_size
# Round UP for the test set: every test image needs a prediction, including the
# ones in the last partial batch, otherwise predictions and filenames differ in length.
STEP_SIZE_TEST = -(-test_generator.n // test_generator.batch_size)

model.fit_generator(generator=train_generator,
                    steps_per_epoch=STEP_SIZE_TRAIN,
                    validation_data=valid_generator,
                    validation_steps=STEP_SIZE_VALID,
                    epochs=nb_epochs
)

In [None]:
# Evaluate on the validation generator, using the validation step count
# (not the test step count, which belongs to a different generator).
model.evaluate_generator(generator=valid_generator,
                         steps=STEP_SIZE_VALID)

In [None]:
# Rewind the test generator so prediction starts from its first batch.
test_generator.reset()
pred = model.predict_generator(
    test_generator,
    steps=STEP_SIZE_TEST,
    verbose=1,
)

In [None]:
# Index of the highest-scoring class for every predicted row.
predicted_class_indices = np.argmax(a=pred, axis=1)

In [None]:
# Invert the generator's label -> index mapping, then translate each predicted index.
labels = {index: label for label, index in train_generator.class_indices.items()}
predictions = list(map(labels.__getitem__, predicted_class_indices))

In [None]:
# Pair every test file name with its predicted label and write the table to results.csv.
filenames = test_generator.filenames
results = pd.DataFrame(
    {
        "Filename": filenames,
        "Predictions": predictions,
    }
)
results.to_csv("results.csv", index=False)


SOLUTION 1 END

PYTORCH applied

In [None]:
# List the entries of the ../input directory.
print(os.listdir("../input"))

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from torchvision import datasets, transforms, models
from keras.models import Sequential
from keras_preprocessing.image import ImageDataGenerator
from keras.layers import Dense, Activation, Flatten, Dropout, BatchNormalization
from keras.layers import Conv2D, MaxPooling2D
from keras import regularizers, optimizers
import pandas as pd
import numpy as np

In [None]:
from keras.models import Sequential
#Import from keras_preprocessing not from keras.preprocessing
from keras_preprocessing.image import ImageDataGenerator
from keras.layers import Dense, Activation, Flatten, Dropout, BatchNormalization
from keras.layers import Conv2D, MaxPooling2D
from keras import regularizers, optimizers
import pandas as pd
import numpy as np
def append_ext(fn: str) -> str:
    """Return `fn` with the ".png" extension appended."""
    extension = ".png"
    return fn + extension
# NOTE(review): traindf/testdf/datagen and the two CSV files read here are not
# referenced by any other cell in this notebook; this looks like leftover
# tutorial code — confirm whether it is still needed.
# The path on the first line used typographic quotes, which is a SyntaxError.
traindf = pd.read_csv("./trainLabels.csv", dtype=str)
testdf = pd.read_csv("./sampleSubmission.csv", dtype=str)
# Turn the bare ids into image file names.
traindf["id"] = traindf["id"].apply(append_ext)
testdf["id"] = testdf["id"].apply(append_ext)
datagen = ImageDataGenerator(rescale=1./255., validation_split=0.25)

In [None]:
# Both generators scale pixel values to [0, 1]; only the training one holds out
# 20% of its rows as a validation subset.
pixel_scale = 1./255
train_datagen = ImageDataGenerator(rescale=pixel_scale, validation_split=0.2)
test_datagen = ImageDataGenerator(rescale=pixel_scale)

In [None]:
# Cast each label to str element-wise. str(series) would instead produce the
# printed representation of the whole Series and assign that single string to
# every row, collapsing all labels into one class.
train_df["category_id"] = train_df["category_id"].astype(str)

Splitting Training set into trainset and validationset

In [None]:
# Training subset of train_df, delivered as shuffled batches of 200 images
# resized to 96x96 with categorical labels.
train_flow_options = dict(
    dataframe=train_df,
    directory="../input/train_images/",
    x_col="file_name",
    y_col="category_id",
    class_mode="categorical",
    target_size=(96, 96),
    batch_size=200,
    shuffle=True,
    seed=42,
)
train_generator = train_datagen.flow_from_dataframe(subset="training", **train_flow_options)

In [None]:
# Validation subset of train_df, with the same batch size, image size and seed
# as the training generator.
valid_flow_options = dict(
    dataframe=train_df,
    directory="../input/train_images/",
    x_col="file_name",
    y_col="category_id",
    class_mode="categorical",
    target_size=(96, 96),
    batch_size=200,
    shuffle=True,
    seed=42,
)
valid_generator = train_datagen.flow_from_dataframe(subset="validation", **valid_flow_options)

In [None]:
from keras.applications.mobilenet import MobileNet
from keras.layers import Dense, Input, Dropout
from keras.models import Model

In [None]:
# Display the repr of the training generator object.
train_generator

In [None]:
# Keras Input placeholder for 96x96 images with 3 channels.
# NOTE(review): despite its name this is an Input tensor, not a shape tuple, and
# no other cell in this notebook references it — confirm whether it is needed.
IMG_SHAPE = Input(shape=(96,96, 3))

In [None]:
import tensorflow as tf

class myCallback(tf.keras.callbacks.Callback):
  """Stop training as soon as the training accuracy exceeds 99.8%."""

  def on_epoch_end(self, epoch, logs=None):
    """Check the epoch's accuracy and request a stop when it is above 0.998.

    `logs` is the metrics dict Keras passes at the end of each epoch; it may be None.
    """
    logs = logs or {}
    # The accuracy metric is logged as 'acc' or 'accuracy' depending on the
    # Keras version and the name given in compile(); accept either, and skip
    # the check when neither is present instead of comparing None to a float.
    accuracy = logs.get('acc', logs.get('accuracy'))
    if accuracy is not None and accuracy > 0.998:
      print("\nReached 99.8% accuracy so cancelling training!")
      self.model.stop_training = True

callbacks = myCallback()

# Baseline CNN fed by train_generator, which yields 96x96 RGB batches with
# one-hot labels (class_mode="categorical"). The input shape, output width and
# loss therefore have to match the generator rather than the 28x28 grayscale,
# 10-class, integer-label setup this cell originally assumed.
num_classes = len(train_generator.class_indices)

model = tf.keras.models.Sequential([
  tf.keras.layers.Conv2D(32, (3,3), activation='relu', input_shape=(96, 96, 3)),
  tf.keras.layers.MaxPooling2D(2, 2),
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(num_classes, activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# The generator already supplies the labels, so no separate label array is passed.
model.fit_generator(train_generator,
                    steps_per_epoch=train_generator.n//train_generator.batch_size + 1,
                    epochs=10,
                    callbacks=[callbacks])


In [None]:
# One output unit per category known to the training generator.
n_classes = len(train_generator.class_indices)

# Pre-trained convolutional backbone without its classification head.
# NOTE(review): base_model was never defined in this notebook; MobileNet is
# inferred from the MobileNet import above and the referenced MobileNet
# notebook — confirm the intended backbone and weights.
base_model = tf.keras.applications.MobileNet(input_shape=(96, 96, 3),
                                             include_top=False,
                                             weights='imagenet')
# Freeze the backbone so only the new head is trained.
base_model.trainable = False
model = tf.keras.Sequential([base_model,
                             tf.keras.layers.GlobalAveragePooling2D(),
                             tf.keras.layers.Dense(1024,activation='relu'),
                             tf.keras.layers.Dropout(0.2),
                             tf.keras.layers.Dense(n_classes, activation='softmax')])

In [None]:
# The metric is named 'acc' so the logged keys are 'acc' / 'val_acc'.
# (The original line was missing its closing parenthesis.)
model.compile(optimizer=RMSprop(lr=0.001), loss='categorical_crossentropy', metrics=['acc'])

In [None]:
# Print the layer-by-layer summary of the current model.
model.summary()

In [None]:
# Save the whole model (not only the weights) each time val_loss improves;
# the file name embeds the epoch number and that epoch's val_acc.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'weights-improvement.{epoch:02d}-{val_acc:.2f}.hdf5',
    monitor='val_loss',
    mode='auto',
    save_best_only=True,
    save_weights_only=False,
    period=1,
    verbose=1,
)

In [None]:
EPOCHS = 10
# Floor division plus one, so a trailing partial batch is also consumed each epoch.
STEP_SIZE_TRAIN = train_generator.n // train_generator.batch_size + 1
STEP_SIZE_VALID = valid_generator.n // valid_generator.batch_size + 1

# Train with checkpointing; the returned History object is kept in `history`.
fit_options = dict(
    generator=train_generator,
    steps_per_epoch=STEP_SIZE_TRAIN,
    validation_data=valid_generator,
    validation_steps=STEP_SIZE_VALID,
    epochs=EPOCHS,
    callbacks=[checkpoint],
    verbose=2,
)
history = model.fit_generator(**fit_options)

Now we work on the test set

In [None]:
# Unlabelled test images in their original order (shuffle=False), resized to
# 96x96. The test table in this notebook is named test_df; df_test was never defined.
test_generator = test_datagen.flow_from_dataframe(
                dataframe=test_df,
                directory="../input/test_images/",
                x_col="file_name",
                y_col=None,
                batch_size=200,
                seed=42,
                shuffle=False,
                class_mode=None,
                target_size=(96,96))
# Floor division plus one so the trailing partial batch is predicted as well.
STEP_SIZE_TEST = test_generator.n//test_generator.batch_size + 1
# Rewind so prediction starts from the first batch.
test_generator.reset()


Prediction:

In [None]:
# Run the model over the test generator for STEP_SIZE_TEST batches.
pred = model.predict_generator(
    test_generator,
    steps=STEP_SIZE_TEST,
    verbose=1,
)

In [None]:
# Index of the highest-scoring class for every predicted row.
# (A duplicate argmax whose result was discarded has been removed.)
predicted_class_indices = np.argmax(pred, axis=1)

In [None]:
# Translate this model's predicted indices back into category_id labels.
# Without this step `predictions` would still hold the labels produced by the
# first solution's model.
index_to_label = {index: label for label, index in train_generator.class_indices.items()}
# STEP_SIZE_TEST is n // batch_size + 1, which yields one surplus batch when n
# is an exact multiple of the batch size; keep only one prediction per test row.
predictions = [index_to_label[k] for k in predicted_class_indices[:len(test_df)]]

# The test table in this notebook is named test_df; df_test was never defined.
# NOTE(review): assumes test.csv has an `id` column, as the original code did — confirm.
submission = pd.DataFrame({"Id": test_df.id,
                           "Predicted": predictions})
submission.to_csv("submission.csv", index=False)

references:
https://www.kaggle.com/artgor/iwildcam-basic-eda,
https://www.kaggle.com/xhlulu/keras-cnn-starter-petfinder/,
https://www.kaggle.com/bonhart/pytorch-eda-and-resnet, 
https://www.kaggle.com/rblcoder/cnn-in-tf-coursera-course-iwildcam-2019-mobilenet