# **L&T EduTech Hackathon at SHAASTRA IITM**

In this notebook we look for the optimum amount of training data (as a percentage of the available data), using the pretrained model that we selected earlier. The judging metric is Cohen's kappa score.

### Kappa Score:
Cohen’s kappa measures the agreement between two raters who each classify N items into C mutually exclusive categories.

More on Cohen's kappa score can be found [here](https://towardsdatascience.com/cohens-kappa-9786ceceab58).

In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from keras import layers
import keras.backend as K
from keras.models import Sequential, Model
from keras.preprocessing import image
from keras.layers import Input, Dense, Activation, Dropout
from keras.layers import Flatten, BatchNormalization, Conv2D
from keras.layers import MaxPooling2D, AveragePooling2D, GlobalAveragePooling2D 
from keras.applications.imagenet_utils import preprocess_input


from PIL import Image
from tqdm import tqdm
from keras.preprocessing.image import ImageDataGenerator
from numpy import expand_dims

!pip install opendatasets
import opendatasets as od

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [None]:
# Fetch the ship-classification data set from Kaggle via opendatasets.
DATASET_URL = 'https://www.kaggle.com/datasets/arpitjain007/game-of-deep-learning-ship-datasets/code'
od.download(DATASET_URL)

In [None]:
# Locations of the downloaded data set on disk.
dataset_path = r'/content/game-of-deep-learning-ship-datasets'  # root folder of the data set
train_path = os.path.join(dataset_path, 'train')
images_path = os.path.join(train_path, 'images')
train_csv = os.path.join(train_path, 'train.csv')
test_csv = os.path.join(dataset_path, 'test_ApKoW4T.csv')

# Read both index files into DataFrames (test first, then train).
test_df = pd.read_csv(test_csv)
train_df = pd.read_csv(train_csv)

# Row counts of the two index files, reported below.
num_test_img, num_train_img = len(test_df), len(train_df)

print(f'Total Number of test images: {num_test_img}')
print(f'Total Number of train images: {num_train_img}')

In [None]:
# Image folder (note the trailing slash: file names are appended directly).
main_path = "/content/game-of-deep-learning-ship-datasets/train/images/"
main_df = pd.read_csv(r'/content/game-of-deep-learning-ship-datasets/train/train.csv')
paths = os.listdir(main_path)

# 'path' column = image folder + file name taken from the 'image' column.
main_df['path'] = main_path + main_df['image']

# Numeric category of every row, in row order.
categories = list(main_df['category'])

# Category id -> human-readable ship type.
categorys = dict(
    enumerate(['Cargo', 'Military', 'Carrier', 'Cruise', 'Tankers'], start=1)
)

In [None]:
# Translate every numeric category into its ship-type name, preserving row order.
classes = [categorys[category_id] for category_id in categories]
main_df['classes'] = classes

# Reload the test index and attach an image path to each row.
# NOTE(review): test paths are built from the train image folder (main_path);
# presumably all images live in that one folder -- confirm against the data set.
test_df = pd.read_csv(r'/content/game-of-deep-learning-ship-datasets/test_ApKoW4T.csv')
test_df['path'] = main_path + test_df['image']

In [None]:
# Record the pixel width/height of every training image.
widths, heights = [], []

for path in tqdm(main_df["path"]):
    # Image.open keeps the underlying file open until it is closed; the
    # context manager releases the handle immediately so scanning thousands
    # of images cannot exhaust the process's file descriptors.
    with Image.open(path) as img:
        width, height = img.size
    widths.append(width)
    heights.append(height)

main_df["width"] = widths
main_df["height"] = heights
# "dimension" is the pixel count (width * height) of each image.
main_df["dimension"] = main_df["width"] * main_df["height"]

In [None]:
# Features keep both the image path and the label column (the generators read
# both from the same frame); the target is the label column alone.
y = main_df['classes']
X = main_df[['path', 'classes']]

# Step 1: hold out 10% of the rows for evaluation; the remaining 90% is the training pool.
X_data, X_test, y_data, y_test = train_test_split(
    X, y,
    test_size=0.1,
    random_state=42,
)

# Step 2: cut the held-out part in half -> 5% test data and 5% validation data.
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test,
    test_size=0.5,
    random_state=42,
)

In [None]:
# Augmenting generator: random geometric / colour perturbations.
# Use it for TRAINING data only.
datagen = ImageDataGenerator(rotation_range=20,
    zoom_range=0.10,
    brightness_range=[0.6,1.4],
    channel_shift_range=0.7,
    width_shift_range=0.15,
    height_shift_range=0.15,
    shear_range=0.15,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Validation and test images must be scored unmodified. Building them from the
# augmenting generator would randomly rotate/shift/flip the evaluation images,
# making the reported metrics noisy and different on every run.
eval_datagen = ImageDataGenerator()

val_generator = eval_datagen.flow_from_dataframe(
        X_val,  # validation rows (image path + class name)
        x_col='path',
        y_col='classes',
        target_size=(224, 224),  # all images are resized to 224x224
        batch_size=32,
        class_mode="categorical",
        shuffle=True,
)
test_generator = eval_datagen.flow_from_dataframe(
        X_test,  # test rows (image path + class name)
        x_col='path',
        y_col='classes',
        target_size=(224, 224),  # all images are resized to 224x224
        batch_size=32,
        class_mode="categorical",
        shuffle=False,  # keep row order so predictions line up with test_generator.labels
)

In [None]:
from tensorflow.keras.applications.xception import Xception
from sklearn.metrics import f1_score
from sklearn import metrics
from sklearn.metrics import cohen_kappa_score

# Input shape of the network: 224x224 RGB, matching the generators' target_size.
# (IMG_SHAPE was previously never defined, so this cell raised NameError on a fresh kernel.)
IMG_SHAPE = (224, 224, 3)

# ImageNet-pretrained Xception without its classification head.
xception_model = Xception(include_top=False, weights='imagenet', input_shape=IMG_SHAPE)

In [None]:
# Sweep over the amount of training data.
# test_size1[i] is the fraction of the training pool (X_data) that is held OUT
# in run i, so run i trains on (1 - test_size1[i]) of the pool: 0%, 10%, ..., 100%.
test_size1 = [1,0.9,0.8,0.7,0.6,0.5,0.4,0.3,0.2,0.1,0]
from keras.models import load_model

kappa_score = {}
saved_models = ['model10.h5','model20.h5','model30.h5','model40.h5','model50.h5','model60.h5','model70.h5','model80.h5','model90.h5','model100.h5']
X_train = {}
X_v = {}
y_train= {}
y_v = {}

# Run 0 uses no training data, so it has no checkpoint; runs 1..10 map onto
# model10.h5 .. model100.h5. (Indexing saved_models[i] directly was off by one
# and ran past the end of the list on the last run.)
checkpoint_names = [None] + saved_models
assert len(checkpoint_names) == len(test_size1), "one checkpoint slot per run expected"

for i, held_out in enumerate(test_size1):
  # train_test_split only accepts a float test_size strictly between 0 and 1,
  # so the 0% and 100% end points are handled without calling it.
  if held_out >= 1:
    X_train[i], X_v[i] = X_data.iloc[:0], X_data
    y_train[i], y_v[i] = y_data.iloc[:0], y_data
  elif held_out <= 0:
    X_train[i], X_v[i] = X_data, X_data.iloc[:0]
    y_train[i], y_v[i] = y_data, y_data.iloc[:0]
  else:
    X_train[i], X_v[i], y_train[i], y_v[i] = train_test_split(
        X_data, y_data, test_size=held_out, random_state=42)

  # Start every run from fresh ImageNet weights. Re-using one shared base model
  # would carry the fine-tuned weights of the previous run into the next one
  # and invalidate the comparison between data fractions.
  tf.keras.backend.clear_session()
  base_model = tf.keras.applications.Xception(
      include_top=False, weights='imagenet', input_shape=(224, 224, 3))

  model_xception = tf.keras.Sequential([
  base_model,
  tf.keras.layers.Conv2D(128, 3, activation='relu'),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.GlobalAveragePooling2D(),
  tf.keras.layers.Dense(5, activation='softmax')
  ])

  model_xception.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

  if len(X_train[i]) > 0:
    train_generator = datagen.flow_from_dataframe(
        X_train[i],  # training rows for this run (image path + class name)
        x_col='path',
        y_col='classes',
        target_size=(224, 224),  # All images will be resized to 224x224
        batch_size=32,
        class_mode="categorical",
        shuffle=True,
        )

    history_xception = model_xception.fit(
        train_generator,
        validation_data=val_generator,
        epochs=20,
        verbose=2)

    model_xception.save(checkpoint_names[i])
  # else: 0% training data -- the untrained head is evaluated as-is and serves
  # as the baseline point of the curve.

  # Rewind the test iterator so predictions line up with test_generator.labels.
  test_generator.reset()
  y_pred = model_xception.predict(test_generator)

  y_pred_classes = np.argmax(y_pred, axis = 1)
  print("\nAccuracy:",metrics.accuracy_score(test_generator.labels, y_pred_classes))
  print('\nF1 Score is',f1_score(test_generator.labels, y_pred_classes, average='weighted'))
  kappa_score[i] = cohen_kappa_score(test_generator.labels, y_pred_classes)
  print('\nCohen Kappa Score is\n ',kappa_score[i])

In [None]:
# Kappa score as a function of the training-data fraction.
train_data_per = [0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]

# Convert the {run index: kappa} dict into an array of scores in run order.
# The isinstance guard keeps the cell re-runnable: on a second execution
# kappa_score is already an array and has no .items()/.values().
if isinstance(kappa_score, dict):
    kappa_score = np.array(list(kappa_score.values()), dtype=float)

fig, ax = plt.subplots()
ax.plot(train_data_per, kappa_score, marker='o')
ax.set_xlabel('Training data fraction')
ax.set_ylabel("Cohen's kappa (test set)")
ax.set_title("Cohen's kappa vs. amount of training data")
plt.show()