In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import cv2
import tqdm
from IPython.display import Image
%matplotlib inline

# Importing Training Data


In [None]:
# Load the training metadata CSV into a DataFrame; later cells read its
# 'Image' (file name) and 'target' (label) columns.
data = pd.read_csv('../input/identify-the-dance-form/train.csv')

In [None]:
# Summary statistics of the metadata table.
data.describe()

# Class Distribution

In [None]:
# Bar chart of how many training rows each dance form has.
sns.set()
class_counts = data.target.value_counts()
bar_axes = class_counts.plot(kind='bar', y='Count', colormap='Paired')
bar_axes.set_title('Dance Forms')

In [None]:
# Same per-class counts drawn with the default pandas plot on a wider figure.
data.target.value_counts().plot(figsize=(20,10))

### *Target Variable is evenly distributed, except for Manipuri having the lowest frequency*

In [None]:
# Keep the label column under the name y.
# NOTE: the image-loading cell further down rebinds y to a plain list.
y = data.target

## Reading a sample image

In [None]:
# Render training image 1.jpg inline.
Image(filename='../input/identify-the-dance-form/train/1.jpg') 

In [None]:
# Render training image 10.jpg inline.
Image(filename='../input/identify-the-dance-form/train/10.jpg') 

In [None]:
# Render training image 12.jpg inline.
Image(filename='../input/identify-the-dance-form/train/12.jpg') 

# Importing image into the dataframe

In [None]:
# Load every training image listed in the CSV into memory.
# X collects the decoded images and y the matching labels, in row order.
X = []
y = []
path='/kaggle/input/identify-the-dance-form/train'
# Pair each file name with its own label instead of tracking a manual counter,
# so the alignment does not depend on `data` having a default 0..n-1 index.
for img_name, label in tqdm.tqdm(zip(data['Image'], data['target']), total=len(data)):
    img_path = os.path.join(path, img_name)
    img = cv2.imread(img_path)
    # cv2.imread reports a missing or unreadable file by returning None, not by
    # raising; fail here with the offending path instead of later in cv2.resize.
    if img is None:
        raise FileNotFoundError(f"Could not read image: {img_path}")
    # OpenCV decodes to BGR channel order; convert to RGB so that
    # matplotlib's imshow displays the images with their true colours.
    X.append(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    y.append(label)

In [None]:
# Attach the loaded image arrays as a new column; X holds one entry per row
# of data, appended in row order by the loading loop.
data['Actual_Image'] = X

In [None]:
# Type and array shape of the value at row 1, column position 2.
# NOTE(review): assumes position 2 is the 'Actual_Image' column just added,
# i.e. train.csv has exactly two columns — confirm.
type(data.iloc[1,2]),data.iloc[1,2].shape

In [None]:
# Type and array shape of a second sample (row 18, column position 2), for
# comparison with row 1. Both values now come from the same row; the original
# mixed the type of row 1 with the shape of row 18.
# NOTE(review): assumes position 2 is the 'Actual_Image' column — confirm.
type(data.iloc[18,2]),data.iloc[18,2].shape

**We are dealing with three-channel colour images of varying sizes. To work with this dataset, we need to resize all the images to a constant dimension.**

## Plotting images from each category

In [None]:
# Group the rows by dance form so the plotting cells can iterate class by class.
data_group = data.groupby('target', as_index=False)

In [None]:
# Show one randomly chosen image per dance form on a 4x2 grid of subplots.
n_rows, n_cols = 4, 2
fig = plt.figure(figsize=(20, 20))
for slot, (dance_form, members) in enumerate(data_group, start=1):
    ax = fig.add_subplot(n_rows, n_cols, slot)
    # Draw a random position within this group and show that image.
    pick = np.random.randint(low=0, high=len(members))
    ax.imshow(members['Actual_Image'].iloc[pick])
    ax.set_title(dance_form)
plt.show()

In [None]:
# Draw the per-class grid again; each run picks a fresh random image per class.
rows, columns = (4, 2)
fig = plt.figure(figsize=(20, 20))
for i, (group_name, group_df) in enumerate(data_group, start=1):
    axis = fig.add_subplot(rows, columns, i)
    random_position = np.random.randint(low=0, high=len(group_df))
    # reset_index gives the group a 0..n-1 index so the random position is a valid label.
    sample_image = group_df.reset_index()['Actual_Image'][random_position]
    axis.imshow(sample_image)
    axis.set_title(group_name)
plt.show()

# Applying K Nearest Neighbours

## To apply these algorithms, all images must have the same dimensions

In [None]:
# Target size (in pixels) that every image is resized to in the next cell.
img_height, img_width = (224,224)

In [None]:
# Resize every image to the common target size.
# cv2.resize takes dsize as (width, height), so the width goes first; the
# original passed (height, width), which only worked because both are 224.
data['Actual_Image'] = data['Actual_Image'].apply(
    lambda image: cv2.resize(image, (img_width, img_height))
)

In [None]:
# Flatten each image into a 1-D pixel vector, then hold out 30% of the rows
# for evaluation (fixed random_state for a repeatable split).
flat_images = data['Actual_Image'].apply(lambda image: image.flatten())
X_train, X_test, y_train, y_test = train_test_split(
    flat_images,
    data['target'],
    test_size=0.3,
    random_state=42,
)

In [None]:
# Shapes straight after the split: X_train/X_test are still pandas Series
# holding one flattened array per row, so they report a 1-D shape here.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

*Need to change the shape of X_train*

In [None]:
# Stack the per-image vectors in X_train into one 2-D array (one row per image).
arr = np.array(list(X_train))


In [None]:
# Check the stacked training array: expect (number of training images, pixels per image).
arr.shape

In [None]:
# Replace the Series of arrays with the stacked 2-D array.
X_train = arr

In [None]:
# Stack the per-image vectors in X_test into one 2-D array (one row per image).
arr = np.array([pixel_vector for pixel_vector in X_test])

In [None]:
# Check the stacked test array: expect (number of test images, pixels per image).
arr.shape

In [None]:
# Replace the Series of arrays with the stacked 2-D array.
X_test = arr

In [None]:
# Shapes after stacking: X_train/X_test should now be 2-D arrays whose row
# counts match y_train/y_test.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

## Building and Training the Model

### Simplest Model Possible : K Nearest Neighbours 

In [None]:
# Baseline: 3-nearest-neighbour classifier on the flattened pixel vectors.
# n_jobs=-1 asks scikit-learn to use all available cores for neighbour search.
knn_model = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
knn_model.fit(X_train, y_train)

## Accuracy Score is 🙁

In [None]:
# Mean accuracy of the k=3 model on the held-out 30% split.
knn_model.score(X_test, y_test)

## Trying out with different values of k

In [None]:
# Sweep k from 3 to 49, refitting the classifier each time and recording its
# accuracy on the held-out split. knn_model ends up bound to the last (k=49) fit.
accuracy_score_list = []
for n_neighbors in tqdm.tqdm(range(3, 50)):
    knn_model = KNeighborsClassifier(n_jobs=-1, n_neighbors=n_neighbors).fit(X_train, y_train)
    holdout_accuracy = knn_model.score(X_test, y_test)
    accuracy_score_list.append(holdout_accuracy)


## K Vs Accuracy

In [None]:
# Plot held-out accuracy against the k values tried in the sweep above.
# x and y are passed by keyword: seaborn deprecated positional data vectors in
# 0.11 and rejects them from 0.12 on, while keywords work on older versions too.
k_values = list(range(3, 50))
sns.lineplot(x=k_values, y=accuracy_score_list)
plt.title("K Vs Accuracy")
plt.xlabel("K")
plt.ylabel("Accuracy")
plt.show()

## So, 30% is the highest accuracy with this basic model, which occurs at k=4

### Since this is an image classification problem, such simple distance-based ML models won't work well. Thus, our next approach will be a Convolutional Neural Network