# Image Based Sports Classification

## Library Imports

In [1]:
import os
import pandas as pd
import numpy as np
import random
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.image import imread
%matplotlib inline


# IMPORT THE REQUIRED KERAS LIBRARIES FOR IMAGE AUGMENTATION
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.resnet50 import preprocess_input

## import model libraries
from tensorflow import keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import SGD,Adam
from tensorflow.keras.layers import Dense,Dropout,Conv2D,MaxPool2D,AvgPool2D,GlobalMaxPool2D,Flatten,MaxPooling2D
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.applications.xception import Xception
from tensorflow.keras.callbacks import EarlyStopping,ReduceLROnPlateau
from tensorflow.keras.callbacks import CSVLogger


In [None]:
# ignore deprecated and future warnings
import warnings
warnings.filterwarnings(action='ignore',category=DeprecationWarning)
warnings.filterwarnings(action='ignore',category=FutureWarning)

## Data Path

In [2]:
# Root directory of the competition dataset; every other data path is built from it.
file_path = '/kaggle/input/logical-rythm-2k20-sports-image-classification'

In [7]:
# Split-level directories directly under the dataset root.
train_path, test_path = (os.path.join(file_path, split) for split in ('train', 'test'))
# The image files sit one level deeper, in a folder that repeats the split name.
train_images = os.path.join(train_path, 'train')
test_images = os.path.join(test_path, 'test')


In [8]:
# List each image directory once and reuse the counts for both the report
# and the consistency checks below.
train_image_count = len(os.listdir(train_images))
test_image_count = len(os.listdir(test_images))
print("size of train data:", train_image_count)
print("size of test data:", test_image_count)

# Read the CSV files that describe the images (labels for train, file list for test).
train_labels = pd.read_csv(os.path.join(file_path, 'train_labels.csv'))
test_labels = pd.read_csv(os.path.join(file_path, 'test_images_list.csv'))

# Every file on disk should have exactly one row in its CSV; fail early with a
# readable message if the download or extraction was incomplete.
assert train_image_count == len(train_labels), \
    "number of files in train_images does not match the rows of train_labels.csv"
assert test_image_count == len(test_labels), \
    "number of files in test_images does not match the rows of test_images_list.csv"

In [9]:
# Print a summary of the training label table (columns, dtypes, non-null counts).
train_labels.info()

In [13]:
# Collect the distinct sport names from the training labels, in order of first appearance.
sports_labels = pd.unique(train_labels['sports'])

In [14]:
# Display the distinct sport names extracted from the 'sports' column.
sports_labels

In [154]:
# Render one training image as a quick check that the files can be read.
sample_image_path = os.path.join(train_images, '0.jpg')
sample_image = imread(sample_image_path)
plt.imshow(sample_image);

In [15]:
# Number of training images per sport, sorted from most to least frequent.
train_labels['sports'].value_counts()

In [16]:
# Bar chart of the number of training images per sport.
# The column is passed by keyword together with the frame: newer seaborn
# releases no longer accept the data vector as the first positional argument.
sns.countplot(x='sports', data=train_labels, palette='viridis')
plt.title('Images per Sports')
plt.ylabel('Number of images')
plt.xlabel('Sports Name')
# Rotate the class names so the tick labels stay readable instead of overlapping.
plt.xticks(rotation=90)
plt.tight_layout()

Badminton and Football have the largest number of images in the dataset, and Kabaddi has the fewest. Overall, the per-class counts are close enough that we can conclude the data is not heavily biased toward any specific class.


## Data Preparation


In [18]:
# Input size used for the networks: height, width and colour channels.
image_shape = (224, 224, 3)


In [None]:
# Augmentation and normalisation pipeline used to build the image batches.
#
# Normalisation is done by `preprocess_input` alone. It expects raw 0-255
# pixels and applies the ImageNet channel-mean centring that the pretrained
# backbone was trained with. ImageDataGenerator applies `rescale` AFTER
# `preprocessing_function`, so the previous `rescale=1./255` shrank the already
# centred pixels by another factor of 255 and the pretrained weights saw inputs
# on the wrong scale. `rescale` is therefore left unset.
data_generator = ImageDataGenerator(
    rotation_range=0.1,        # degrees; NOTE(review): 0.1 deg is almost no rotation - confirm intent
    width_shift_range=0.1,     # fraction of the image width
    height_shift_range=0.1,    # fraction of the image height
    shear_range=0.5,           # shear angle in degrees
    zoom_range=[0.5, 2.0],     # random zoom factor drawn from [lower, upper]
    channel_shift_range=0.1,   # NOTE(review): applied on the raw 0-255 scale, so 0.1 is negligible - confirm intent
    fill_mode='nearest',       # pixels exposed by a transform copy their nearest neighbour
    horizontal_flip=True,
    vertical_flip=False,
    preprocessing_function=preprocess_input,
    dtype='float32'
)

In [25]:
# Batch size shared by the data iterators in this notebook.
batch = 32

# Options for the training iterator: file names come from the 'image' column
# and one-hot targets from the 'sports' column of the label table.
# NOTE(review): `data_generator` is built without `validation_split`, so the
# 'training' subset presumably covers every row - confirm.
train_flow_options = dict(
    dataframe=train_labels,
    directory=train_path,
    x_col='image',
    y_col='sports',
    subset='training',
    color_mode='rgb',
    batch_size=batch,
    seed=42,
    shuffle=True,
    class_mode='categorical',
    target_size=image_shape[:2],
)
train_generator = data_generator.flow_from_dataframe(**train_flow_options)
# Training callbacks.
# No validation data is passed to the training calls in this notebook, so a
# `val_loss` metric is never produced. Callbacks that monitor it only emit a
# warning and never stop training or lower the learning rate. They therefore
# monitor the training loss, which is always available.
# NOTE(review): if a validation split is introduced later, switch `monitor`
# back to 'val_loss' for a better generalisation signal.
early_stop = EarlyStopping(patience=9, monitor='loss', restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(patience=2, monitor='loss', factor=0.1)
# Per-epoch metrics are appended to a CSV file so they can be plotted afterwards.
csv_logger = CSVLogger('epoch_run_cnn.csv', separator=',', append=True)

**MODELS**

CNN Model

In [30]:
# Size of the softmax layer, taken from the classes the training iterator
# discovered instead of a hard-coded constant.
num_classes = len(train_generator.class_indices)

# Baseline CNN trained from scratch: four convolution + max-pooling stages
# followed by a small fully connected classifier.
model = Sequential([
    # convolutional feature extractor; each stage halves the spatial resolution
    Conv2D(filters=32, kernel_size=(7, 7), input_shape=image_shape, strides=1, padding='same', activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(filters=64, kernel_size=(3, 3), strides=1, padding='same', activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(filters=128, kernel_size=(3, 3), strides=1, padding='same', activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(filters=256, kernel_size=(3, 3), strides=1, padding='same', activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),

    # dense classifier head
    Flatten(),
    Dense(units=512, activation='relu'),
    Dense(units=128, activation='relu'),
    Dropout(0.5),
    Dense(units=num_classes, activation='softmax'),
])

# categorical cross-entropy matches the one-hot targets produced by the iterator
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# summary
print('BASIC MODEL')
model.summary()


In [31]:
# Train the baseline CNN on the augmented training batches.
# `Model.fit` accepts generators directly; `fit_generator` is deprecated and
# has been removed from recent Keras releases.
cnn_fit_history = model.fit(
    train_generator,
    # whole batches per epoch (a trailing partial batch is not counted)
    steps_per_epoch=train_generator.n // train_generator.batch_size,
    epochs=35,
    callbacks=[early_stop, reduce_lr, csv_logger],
)

In [38]:
# Persist the trained CNN (architecture + weights) to the notebook's working directory.
cnn_model_file = "/kaggle/working/" + 'cnn_model.h5'
model.save(cnn_model_file)

In [44]:
# Load the per-epoch metrics that the CSV logger wrote during CNN training.
# The logger is configured with 'epoch_run_cnn.csv'; reading any other file
# name fails on a fresh run because nothing has created it yet.
history = pd.read_csv('epoch_run_cnn.csv')

# Plot training loss and accuracy against the epoch index.
fig, ax = plt.subplots()
ax.plot(history['loss'], label='loss')
ax.plot(history['accuracy'], label='accuracy')
ax.set_title('Loss and accuracy for CNN model')
# The curves are cross-entropy loss and accuracy, not MAE.
ax.set_ylabel('Metric value')
ax.set_xlabel('No. epoch')
ax.legend(loc="upper left")
plt.show()

VGG16 Transfer Learning Model

In [165]:
# Size of the softmax layer, taken from the classes the training iterator
# discovered instead of a hard-coded constant.
num_classes = len(train_generator.class_indices)

# Pretrained VGG16 convolutional base (ImageNet weights, no classifier head).
base_model = VGG16(include_top=False, input_shape=image_shape, weights='imagenet')
# Freeze the pretrained layers so only the new classification head is trained.
for layer in base_model.layers:
    layer.trainable = False

# New classification head stacked on top of the frozen feature maps.
flat1 = Flatten()(base_model.output)
class1 = Dense(512, activation='relu', kernel_initializer='he_normal')(flat1)
class2 = Dense(256, activation='relu', kernel_initializer='he_normal')(class1)
class3 = Dense(128, activation='relu', kernel_initializer='he_normal')(class2)
output = Dense(num_classes, activation='softmax')(class3)

# Full model: VGG16 input -> new softmax output.
model = Model(inputs=base_model.inputs, outputs=output)

# `learning_rate` is the supported argument name (`lr` is a deprecated alias
# that recent Keras releases reject); metrics are passed as a list.
opt = Adam(learning_rate=0.0001)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

print('Transfer Learning based on VGG16')
model.summary()


In [170]:
# Separate log file for the transfer-learning run so its epochs are not
# appended to the baseline CNN log.
csv_logger_transfer = CSVLogger('epoch_run_transfer_.csv', separator=',', append=True)

# Train the VGG16-based model. `train_generator` is the training iterator
# defined in this notebook (no `train_generator_full` is ever created), and
# `Model.fit` replaces the deprecated `fit_generator`.
model.fit(
    train_generator,
    # whole batches per epoch (a trailing partial batch is not counted)
    steps_per_epoch=train_generator.n // train_generator.batch_size,
    epochs=30,
    callbacks=[early_stop, reduce_lr, csv_logger_transfer],
)
# Keep both a weights-only checkpoint and the complete saved model.
model.save_weights('./my_checkpoint')
model.save('final_model.h5')


# Test on test images


In [46]:
# Generator for inference: it reuses exactly the normalisation configured on
# the training generator but none of its random augmentations, so every test
# image is scored once, deterministically, on the same scale the model saw
# during training.
image_generator = ImageDataGenerator(
    rescale=data_generator.rescale,
    preprocessing_function=data_generator.preprocessing_function,
    dtype=data_generator.dtype,
)
test_generator = image_generator.flow_from_dataframe(dataframe=test_labels,
                                                     directory=test_path,
                                                     x_col='image',
                                                     y_col=None,
                                                     batch_size=batch,
                                                     color_mode='rgb',
                                                     seed=42,
                                                     # keep file order so predictions line up with filenames
                                                     shuffle=False,
                                                     class_mode=None,
                                                     target_size=(224, 224))
# Class probabilities for every test image (`predict` replaces the deprecated
# `predict_generator`).
predictions = model.predict(test_generator)
# Index of the most probable class per image.
predictions_class_index = np.argmax(predictions, axis=1)
# Invert the name -> index mapping of the training iterator to recover sport names.
labels = {index: name for name, index in train_generator.class_indices.items()}
final_predictions = [labels[k] for k in predictions_class_index]

In [47]:
# Preview the first few predicted labels; the full list is written to the
# results file and would only flood the cell output here.
final_predictions[:10]

In [48]:
# Number of predictions produced for the test images.
len(final_predictions)

In [52]:
# Pair each test file name with its predicted sport and write the table to disk.
filenames = test_generator.filenames
submission_columns = {"image": filenames, "sports": final_predictions}
results = pd.DataFrame(submission_columns)
results.to_csv("results.csv", index=False)