****

# **Google Landmark Recognition🗽 🗼**
**This notebook covers the analysis and cleaning of the dataset, followed by model training at the end.**

#### Import and download libraries

In [None]:
import os
import random
import seaborn as sns
import cv2

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import PIL
import IPython.display as ipd
import glob
import h5py
import plotly.graph_objs as go
import plotly.express as px
from PIL import Image
from tempfile import mktemp
from bokeh.plotting import figure, output_notebook, show
from math import pi

from tqdm import tqdm
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()

output_notebook()

from IPython.display import Image, display

import warnings
warnings.filterwarnings("ignore")

#### Load dataset

In [None]:
# Root folder of the competition data; every other path below is derived from it.
DATASET_DIR = '../input/landmark-recognition-2021'

# Image folders for the train and test splits (no trailing slash).
TRAIN_IMAGE_DIR = f'{DATASET_DIR}/train'
TEST_IMAGE_DIR = f'{DATASET_DIR}/test'

# Table read from train.csv; later cells use its 'id' and 'landmark_id' columns.
train = pd.read_csv(f'{DATASET_DIR}/train.csv')

#### Explore training dataset

In [None]:
# Render the first rows with the notebook's rich table display; print() would
# flatten the frame to plain text.
display(train.head())
print("Training data shape :", train.shape)

In [None]:
# Count of missing values in each column of the training table.
train.isna().sum()

In [None]:
# Number of images per landmark (pass normalize=True for relative frequencies instead).
value_counts = train['landmark_id'].value_counts()

# Two-column table: one row per landmark_id together with its image count.
freq_df = value_counts.rename_axis('landmark_id').reset_index(name='frequency')
freq_df

#### Prepare dataset
There are **81313** different landmark classes in total. Because this is such a large number, we planned to keep only an **x percentage** of each class and to delete classes with a frequency below x. This plan won't work, because more than 41k classes have fewer than 10 images and only 7 classes have more than 1000 images. For a good model you need at least 1000 images per class. 

In [None]:
# Landmarks that appear in fewer than 10 training images.
freq_df.loc[freq_df['frequency'].lt(10)]

In [None]:
# freq_df = train[~train.isin(freq_df)].dropna(how ='all')

In [None]:
# landmark_id values of the first ten entries of value_counts.
value_counts.head(10).index.tolist()

#### Create a new column with jpg url

In [None]:
def jpgurl(df, dir_name='../input/landmark-recognition-2021/train/'):
    """Add a ``url`` column holding the file path of every image.

    Each path has the form ``<dir_name>/<c0>/<c1>/<c2>/<id>.jpg`` where
    ``c0``, ``c1`` and ``c2`` are the first three characters of the image id.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``id`` column. It is modified in place.
    dir_name : str
        Root directory that the per-character sub-directories hang off.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the ``url`` column.
    """
    # Build every path in one pass and assign the column once. Looking rows up
    # one at a time by position (0..n-1) only works when the frame has a
    # default RangeIndex, and is very slow on a frame with many rows.
    df['url'] = [
        os.path.join(dir_name, image_id[0], image_id[1], image_id[2], image_id + '.jpg')
        for image_id in df['id']
    ]
    return df

In [None]:
# jpgurl writes the 'url' column into `train` itself and returns that same frame,
# so `sample` is a second name for `train`, not a copy.
sample = jpgurl(train)
sample.head()

#### Non-landmarks should be deleted from the dataset but that's not within our scope

In [None]:
# Preview the first rows * columns training images in a grid.
fig = plt.figure(figsize=(30,30))
columns = 10
rows = 10
# Subplot positions are 1-based but the data rows are not. Enumerating the
# paths separately keeps the first image (row 0) in the preview instead of
# silently skipping it.
for position, image_path in enumerate(sample['url'].iloc[:columns*rows], start=1):
    ax = fig.add_subplot(rows, columns, position)
    # The context manager closes the file handle once the image has been drawn.
    with PIL.Image.open(image_path) as img:
        ax.imshow(img)
plt.show()

**The dataframe we use to train:** sample

## **Training** 

In [None]:
# Install the Keras_Applications and efficientnet packages from wheel files
# stored under ../input/keras-efficientnet-whl (local files, not a package index).
!pip install ../input/keras-efficientnet-whl/Keras_Applications-1.0.8-py3-none-any.whl
!pip install ../input/keras-efficientnet-whl/efficientnet-1.1.1-py3-none-any.whl

In [None]:
import tensorflow as tf
import efficientnet.keras as efn
import tensorflow.keras.layers as L
from tensorflow.keras.utils import Sequence
from tensorflow.keras.preprocessing import image
from random import shuffle
from sklearn.model_selection import train_test_split
import math

In [None]:
class DataGenerator(Sequence):
    """Keras ``Sequence`` yielding batches of resized images scaled by 1/255.

    Parameters
    ----------
    path : str
        Image root directory. It is concatenated directly with the
        ``c0/c1/c2`` sub-directory, so it must end with a path separator.
        Labels are read from ``data`` only when ``path`` contains ``'train'``.
    list_IDs : list
        Row labels of ``data`` (used with ``data.loc``) to iterate over.
    data : pandas.DataFrame
        Table with an ``id`` column and, for training data, ``landmark_id``.
    img_size : int
        Side length in pixels that every image is resized to.
    img_channel : int
        Size of the channel axis of the batch array.
    batch_size : int
        Maximum number of images per batch.
    """

    def __init__(self, path, list_IDs, data, img_size, img_channel, batch_size):
        self.path = path
        self.list_IDs = list_IDs
        self.data = data
        self.img_size = img_size
        self.img_channel = img_channel
        self.batch_size = batch_size
        # Positions into list_IDs; batches are contiguous slices of this array.
        self.indexes = np.arange(len(self.list_IDs))

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        # Integer ceiling division.
        return (len(self.list_IDs) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, index):
        """Return the ``(X, y)`` arrays for batch number ``index``."""
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        X, y = self.__data_generation(list_IDs_temp)
        return X, y

    def __data_generation(self, list_IDs_temp):
        """Load, scale and resize the images for the given row labels.

        Returns ``X`` with shape ``(n, img_size, img_size, img_channel)`` and
        ``y`` with shape ``(n, 1)``, where ``n == len(list_IDs_temp)``.

        Raises
        ------
        FileNotFoundError
            If an image file is missing or cannot be decoded.
        """
        # Size the arrays by the actual number of samples. The last batch can
        # be shorter than batch_size and must not be padded with all-zero
        # images and labels.
        n_samples = len(list_IDs_temp)
        X = np.zeros((n_samples, self.img_size, self.img_size, self.img_channel))
        y = np.zeros((n_samples, 1), dtype=int)
        has_labels = 'train' in self.path
        for i, ID in enumerate(list_IDs_temp):
            image_id = self.data.loc[ID, 'id']
            # Images live under <path>/<c0>/<c1>/<c2>/<id>.jpg.
            subpath = '/'.join(image_id[0:3])
            file_path = self.path + subpath + '/' + image_id + '.jpg'
            # NOTE(review): cv2.imread returns channels in BGR order; confirm
            # that this matches what the model was trained on.
            img = cv2.imread(file_path)
            if img is None:
                # cv2.imread signals failure by returning None instead of raising.
                raise FileNotFoundError(f'Image is missing or unreadable: {file_path}')
            img = img/255
            img = cv2.resize(img, (self.img_size, self.img_size))
            X[i, ] = img
            if has_labels:
                y[i, ] = self.data.loc[ID, 'landmark_id']
            # Otherwise y keeps the 0 it was initialised with.
        return X, y
    
    
# Side length (pixels) and channel count of the images handed to the generator.
img_size = 256
img_channel = 3

# Batch size used for the test generator.
batch_size = 1

# Reuse the dataset constants defined in the "Load dataset" cell instead of
# repeating the hard-coded competition path.
sub = pd.read_csv(f'{DATASET_DIR}/sample_submission.csv')
# The row labels of the submission table serve as the sample IDs.
list_IDs_test = list(sub.index)

# DataGenerator concatenates path + sub-directory directly, so the image root
# needs a trailing slash.
test_generator = DataGenerator(f'{TEST_IMAGE_DIR}/', list_IDs_test, sub, img_size, img_channel, batch_size)

In [None]:
# Load a saved Keras model from the `effnetb0` input folder.
# NOTE(review): the file name suggests an EfficientNet-B0 saved without its
# classification head ("notop") — confirm before using it for predictions.
model = tf.keras.models.load_model('../input/effnetb0/efficientnetb0_notop.h5')