# Setup

In [None]:
import numpy as np
import os
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, \
                    classification_report, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, ReLU, \
                                        Lambda, GlobalAveragePooling2D, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications import ResNet50,ResNet101,ResNet152
from tensorflow.keras.applications.resnet50 import preprocess_input
from keras.preprocessing import image
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from tensorflow.keras import layers, Model, optimizers


from tqdm.auto import tqdm

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

import pickle

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

# Utils

In [None]:
def feature_extractor(data, image_size, base_model, batch_size=32):
    """Extract flattened `base_model` activations for a collection of images.

    `base_model` is wrapped in a small Keras model that takes an
    `(image_size, image_size, 3)` input and flattens the base model's output.
    Images are loaded from disk, resized to `image_size` x `image_size`,
    passed through the `preprocess_input` imported at the top of the notebook
    and pushed through the network in batches (one `predict` call per batch
    rather than one per image).

    Args:
        data: Iterable of image file paths. It is materialised into a list,
            so any iterable is accepted.
        image_size: Side length, in pixels, that every image is resized to.
        base_model: Keras model (or layer) that is callable on the
            `(image_size, image_size, 3)` input tensor.
        batch_size: Number of images sent through the network per `predict`
            call. Defaults to 32.

    Returns:
        A list with one flattened numpy feature vector per input path, in the
        same order as `data`. An empty `data` yields an empty list.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    input_shape = (image_size, image_size, 3)
    inputs = layers.Input(shape=input_shape, name='image_input')
    flattened = layers.Flatten()(base_model(inputs))
    model = models.Model(inputs=inputs, outputs=flattened)

    image_paths = list(data)
    features_retrieved = []

    # The progress bar still counts individual images, not batches.
    with tqdm(total=len(image_paths), desc="Retrieving features") as progress:
        for start in range(0, len(image_paths), batch_size):
            batch_paths = image_paths[start:start + batch_size]

            # Every image is resized to the same target size, so the arrays
            # can be stacked into a single (batch, height, width, 3) tensor.
            batch = np.stack([
                image.img_to_array(
                    image.load_img(img_path, target_size=(image_size, image_size))
                )
                for img_path in batch_paths
            ])
            batch = preprocess_input(batch)

            batch_features = model.predict(batch, batch_size=batch_size, verbose=0)

            # Iterating the 2-D result yields one 1-D feature vector per image.
            features_retrieved.extend(batch_features)
            progress.update(len(batch_paths))

    return features_retrieved

# Hyperparameters

In [None]:
# Side length (in pixels) of the square RGB input fed to the backbone.
IMAGE_SIZE = 150

# ResNet101 backbone with ImageNet weights and without its classification
# head (include_top=False), used as the feature extractor.
BASE_MODEL = ResNet101(
    include_top=False,
    weights='imagenet',
    input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3),
)

# NOTE(review): not referenced in the cells visible here; presumably a
# similarity cut-off used further down the notebook - confirm.
SIMILARITY_THRESHOLD = 0.9

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet101_weights_tf_dim_ordering_tf_kernels_notop.h5


## Reading data

In [None]:
# Label table for the training images; the 'image' column holds the file stem
# and the 'level' column holds the class label.
labeled_data = pd.read_csv("data/ext/trainlabels.csv")

labeled_data_path = "data/ext/train"

# Full path of every labeled image, in the same order as the CSV rows.
labeled_data_images = [
    f"{labeled_data_path}/{image_id}.jpeg" for image_id in labeled_data['image']
]

labeled_data_labels = labeled_data['level']

print("labeled data images", len(labeled_data_images))
print("labeled data labels", len(labeled_data_labels))

# Spot-check one sample: its path and label must come from the same row.
nsample = 5
print(labeled_data_images[nsample], labeled_data_labels.iloc[nsample])

# NOTE(review): this is not the last expression of the cell, so the notebook
# does not render it; wrap it in display()/print() if the row should be shown.
labeled_data.iloc[nsample]

# Class distribution of the labels (rendered as the cell output).
labeled_data_labels.value_counts()

labeled data images 35122
labeled data labels 35122
data/ext/train/15_right.jpeg 2


0    25808
2     5291
1     2442
3      873
4      708
Name: level, dtype: int64

In [None]:
unlabeled_data_path = "data/ext/test"

# os.listdir returns entries in arbitrary order. Sorting makes the order of
# the paths (and therefore the row order of the saved feature matrix)
# reproducible across runs and machines.
unlabeled_data_images = [
    unlabeled_data_path + "/" + file_name
    for file_name in sorted(os.listdir(unlabeled_data_path))
]

print("Unlabeled data samples", len(unlabeled_data_images))

Unlabeled data samples 53570


# Extracting features

In [None]:
# Destination of the extracted features; defined once so the directory
# creation, the save call and the log message cannot drift apart.
labeled_features_file = 'data/processed/train/features/labeled_data_features.npy'

# One feature vector per labeled image, in the order of labeled_data_images.
labeled_data_features = feature_extractor(
    data=labeled_data_images,
    image_size=IMAGE_SIZE,
    base_model=BASE_MODEL,
)
print("Labeled features", len(labeled_data_features))

# Persist the features so later steps can reload them from disk.
os.makedirs(os.path.dirname(labeled_features_file), exist_ok=True)
np.save(labeled_features_file, labeled_data_features)
print("Features saved at", labeled_features_file)

Retrieving features:   0%|          | 0/35122 [00:00<?, ?it/s]

Labeled features 35122
Features saved at data/processed/train/features/labeled_data_features.npy


In [None]:
# One feature vector per unlabeled image, in the order of
# unlabeled_data_images.
unlabeled_data_features = feature_extractor(
    data=unlabeled_data_images,
    image_size=IMAGE_SIZE,
    base_model=BASE_MODEL)

# This count refers to the unlabeled set; the message previously said
# "Labeled features", copied over from the labeled-data cell.
print("Unlabeled features", len(unlabeled_data_features))

# Persist the features so later steps can reload them from disk.
os.makedirs('data/processed/unlabeled/features/', exist_ok=True)
np.save('data/processed/unlabeled/features/unlabeled_data_features.npy', unlabeled_data_features)
print("Features saved at data/processed/unlabeled/features/unlabeled_data_features.npy")

Retrieving features:   0%|          | 0/53570 [00:00<?, ?it/s]

Labeled features 53570
Features saved at data/processed/unlabeled/features/unlabeled_data_features.npy
