### IMPORTING LIBRARIES

In [1]:
import os
import random

import cv2
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Dense, Dropout, Flatten,Conv2D, GlobalAveragePooling2D, BatchNormalization
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from tensorflow.keras.applications import InceptionV3, VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input

ModuleNotFoundError: No module named 'cv2'

In [None]:
# Seed every random number generator used in this notebook (Python, NumPy, TensorFlow)
SEED = 42
for _seed_rng in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_rng(SEED)

### READING DATA

- `train_data` and `test_data` are loaded from the CSV files provided on Zindi
- Both train and test images are in the same folder, so `train_path` and `test_path` are the same

In [None]:
# Load the competition CSV files.
# Change the file names / folder below to wherever your CSVs and images live.
train_data, test_data = (
    pd.read_csv(csv_file) for csv_file in ('train_ids_labels.csv', 'test_ids_only.csv')
)
# Train and test images share one folder, hence the identical paths
train_path = test_path = 'all_data/'

In [None]:
# Preview the first rows of the training CSV
train_data.head()

### Creating train and test dataframes

- Here we create two dataframes so we can use Keras' `ImageDataGenerator.flow_from_dataframe` to load our images

In [None]:

# Build the dataframes that are later fed to Keras' flow_from_dataframe:
# one row per image, holding the image file path (and the label for training rows).

# train: file path + label
train_names = train_data['Image_ID'].values
train_labels = np.asarray(train_data['Label'].values)
train = pd.DataFrame({
    'filepath': train_path + train_names + '.JPG',
    'Label': train_labels,
})

# test: file path only
test_names = test_data['Image_ID'].values
test = pd.DataFrame({'filepath': test_path + test_names + '.JPG'})

In [None]:
# Preview the filepath/Label table built above
train.head()

In [None]:
# (rows, columns) of the train and test dataframes
train.shape, test.shape

### Quick EDA

In [None]:
#visualizing target distribution
f, ax = plt.subplots(1, 1, figsize=(8, 5))
# Pass the labels as `x=` (the first positional argument of countplot is `data`
# in current seaborn) and draw on the axes created above instead of an implicit one.
sns.countplot(x=train['Label'], order=train['Label'].value_counts().index, ax=ax)
ax.set_title("Target Distribution")
plt.show()

In [None]:
# Show a nrows x nrows grid of randomly picked training images with their labels
nrows = 3
n_images = nrows ** 2
rands = np.random.randint(train.shape[0], size=n_images)
fig = plt.figure(1, figsize=(12, 10))

for position, row_idx in enumerate(rands, start=1):
    bgr_img = cv2.imread(train.loc[row_idx, 'filepath'])
    ax = plt.subplot(nrows, nrows, position)
    # OpenCV loads images as BGR; matplotlib expects RGB
    rgb_img = cv2.cvtColor(bgr_img, cv2.COLOR_BGR2RGB)
    plt.imshow(rgb_img)
    plt.title(train.loc[row_idx, 'Label'])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Cast labels to str before they are used as the categorical y_col in flow_from_dataframe
train['Label'] = train['Label'].map(str)


In [None]:
#lets split our training data into train and validation set
# stratify on the label so both subsets keep the class ratio of the full training set,
# and reuse the notebook-wide SEED instead of a second hard-coded 42
df_train, df_val = train_test_split(train, test_size=0.3, random_state=SEED,
                                    stratify=train['Label'])
print(df_train.Label.value_counts())
print(df_val.Label.value_counts())

In [None]:
#data generators
batch_size = 32
image_size = (224, 224)
# 1e-3 is Adam's default step size; the previous value (0.1) was 100x larger
learning_rate = 1e-3

# Number of batches per epoch; cast to int because np.ceil returns a float
train_steps = int(np.ceil(len(df_train) / batch_size))
val_steps = int(np.ceil(len(df_val) / batch_size))

# Sort the labels so the label -> output-column mapping does not depend on the row
# order of the CSV. Later cells read column 1 of the softmax output as the score of
# the positive class, so index 1 must always be the larger label
# (presumably '1' for 0/1 labels - confirm against the CSV).
classes = sorted(train.Label.unique().tolist())


# Slight data augmentation too (training images only)
train_datagen = ImageDataGenerator(preprocessing_function=preprocess_input,horizontal_flip=True,
                             vertical_flip=True,shear_range=10,zoom_range=0.2,width_shift_range=0.1,
                             height_shift_range=0.1,channel_shift_range=10.)

# Validation and test images only get the backbone preprocessing, no augmentation
val_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

train_ds = train_datagen.flow_from_dataframe(df_train,x_col='filepath', y_col='Label',
                                        target_size=image_size,classes=classes,
                                        batch_size=batch_size,
                                        class_mode='categorical', shuffle=True, seed=SEED)

# shuffle=False keeps val_ds.classes aligned with the order of the predictions
val_ds = val_datagen.flow_from_dataframe(df_val,x_col='filepath', y_col='Label',
                                        target_size=image_size,classes=classes,
                                        batch_size=batch_size,
                                        class_mode='categorical', shuffle=False, seed=SEED)

test_ds = val_datagen.flow_from_dataframe(test, x_col='filepath',target_size=image_size, class_mode=None,
                                          shuffle=False,batch_size=batch_size)#set shuffle=False for test dataset not be shuffled

### MODELING (Using transfer learning)

In [None]:
def BuildModel(pretrained=VGG16, input_shape=(224, 224, 3), num_classes=2):
    """Build a transfer-learning classifier on top of a frozen ImageNet backbone.

    Parameters
    ----------
    pretrained : callable
        Backbone constructor accepting ``include_top``, ``weights`` and
        ``input_shape`` keyword arguments (default: VGG16).
        NOTE(review): the data generators in this notebook use the VGG16
        ``preprocess_input``; switch it as well if another backbone is used.
    input_shape : tuple
        Shape of the input images passed to the backbone, default (224, 224, 3).
    num_classes : int
        Number of units in the final softmax layer, default 2.

    Returns
    -------
    Model
        Uncompiled Keras model mapping the backbone input to softmax outputs.
    """
    # include_top=False drops the backbone's own classification head
    base_model = pretrained(include_top=False, weights='imagenet', input_shape=input_shape)

    # Freeze the backbone so only the new head below is trained
    for layer in base_model.layers:
        layer.trainable = False

    # New classification head
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    x = Dropout(0.1)(x)
    x = BatchNormalization()(x)
    x = Dense(3072, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = BatchNormalization()(x)
    output = Dense(num_classes, activation='softmax')(x)

    model = Model(base_model.input, output)

    return model

In [None]:
# Build the classifier with the default backbone (VGG16)
model = BuildModel()

In [None]:
# `lr` is a deprecated alias that current Keras releases reject; use `learning_rate`
optimizer = Adam(learning_rate=learning_rate)
# The model ends in a 2-unit softmax fed with one-hot labels (class_mode='categorical'),
# so categorical cross-entropy is the matching loss
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['AUC'])

In [None]:
# NOTE(review): with epochs=5 in the fit cell, patience=5 can never trigger an early stop;
# raise epochs or lower patience if early stopping is actually wanted.
earlystop = EarlyStopping(monitor='val_loss', patience=5, verbose=1, restore_best_weights=True)

# Checkpoint directory: defaults to the original Colab/Drive location, but can be
# redirected with the POTHOLE_MODEL_DIR environment variable when running elsewhere.
MODEL_DIR = os.environ.get("POTHOLE_MODEL_DIR",
                           "/content/drive/MyDrive/MIIA Pothole Image Classification/models/")
# Create the directory up front so a bad path fails here, before training starts
os.makedirs(MODEL_DIR, exist_ok=True)
chkpt_path = os.path.join(MODEL_DIR, "tb_model.h5")
checkpoint = ModelCheckpoint(chkpt_path, monitor='val_loss', mode='auto', verbose=1, save_best_only=True)

In [None]:
# Train the head. Step counts are cast to int because they were computed with np.ceil
# (float). The `shuffle` argument is dropped because Keras ignores it for generator
# input; shuffling is controlled by the generator itself.
history = model.fit(train_ds, epochs=5, steps_per_epoch=int(train_steps),
                    callbacks=[earlystop, checkpoint], verbose=1,
                    validation_data=val_ds, validation_steps=int(val_steps))

In [None]:
#AUC
# Column 1 of the softmax output is the score of the class with index 1 in val_ds
y_pred = model.predict(val_ds, verbose=1)[:, 1]
fpr, tpr, thresholds = roc_curve(val_ds.classes, y_pred)
# Store the score under its own name: rebinding `auc` would overwrite the imported
# sklearn function and make this cell fail when it is run a second time.
val_auc = auc(fpr, tpr)
val_auc

### SUBMISSION

In [None]:
#make predictions on test data
# keep only column 1 of the 2-column softmax output, i.e. the score of the
# second entry of the `classes` list given to the data generators
predictions = model.predict(test_ds)[:, 1]

In [None]:

# Assemble the submission table: one row per test image with its predicted score
submission = pd.DataFrame({'Image_ID': test_names, 'Label': predictions})

In [None]:
# Preview the first rows of the submission table
submission.head()

In [None]:
# Write the submission file; index=False leaves the dataframe index out of the CSV
submission.to_csv('submission.csv', index=False)

## Some useful Insights

- Try out other pretrained models (ResNet, EfficientNet, etc.)
- Here I trained for just 5 epochs; try training for more epochs and monitor the loss (you could use a lower learning rate too)
- Here I used a few random data augmentation parameters (the ones I used for a former task); try to reason about which types of data augmentation are suitable for this particular task.

#### Image Preprocessing idea
- You should notice that most of the images are taken from the dashboard of a car, and the dashboard is clearly visible in the images. Since potholes will always be on the road and not on the dashboard, is there a way to crop out that unnecessary part of the image, so the model can focus on the road alone?