# Max Confidence Ensemble Learning on VGG16 (modified) Transfer Learning

In [None]:
!rm -r sample_data

In [None]:
# Mount Google Drive at /content/drive so the dataset archive stored under
# MyDrive can be read from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Extract the Yoga82 archive from Drive into /content (-q suppresses the per-file listing).
!unzip -q /content/drive/MyDrive/Yoga82.zip -d /content/

In [None]:
# The archive has been extracted to /content, so Drive is no longer needed:
# flush and unmount it.
from google.colab import drive
drive.flush_and_unmount()

In [None]:
# Standard library
import os
import random
import shutil
import statistics

# For data segmentation
import cv2
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.cluster import KMeans

# For Modelling
import tensorflow as tf
from keras.applications import VGG16
from keras.layers import Dense, Flatten, Dropout
from keras.models import Model
from keras.preprocessing.image import ImageDataGenerator
from keras.utils import array_to_img, img_to_array, load_img

Build a frequency map from class name to number of images

In [None]:
def count_files(path) -> int:
    """Return the total number of files under ``path``, nested subdirectories included.

    Directories themselves are not counted; a path that yields nothing from
    ``os.walk`` results in 0.
    """
    return sum(len(filenames) for _, _, filenames in os.walk(path))

def is_balanced(counts, tolerance: float = 0.2) -> bool:
    """Return True when every count lies within ``tolerance`` of the mean count.

    Args:
        counts: Sequence of per-class sample counts.
        tolerance: Allowed relative deviation from the mean; the default 0.2
            accepts counts in the inclusive range [0.8 * mean, 1.2 * mean].

    Returns:
        True if all counts fall inside the band, otherwise False. An empty
        sequence is treated as balanced instead of raising ZeroDivisionError.
    """
    # len() rather than truthiness so array-like inputs work too.
    if len(counts) == 0:
        return True
    mean = sum(counts) / len(counts)
    lower, upper = mean * (1 - tolerance), mean * (1 + tolerance)
    return all(lower <= count <= upper for count in counts)

freq_map = dict()
parent_folder = '/content/Yoga82'

# Map every directory name found under the dataset root to the number of
# entries directly inside it. Keys are bare folder names, so directories with
# the same name at different depths would overwrite each other.
for root, dirs, files in os.walk(parent_folder):
    for folder in dirs:
        loc = os.path.join(root, folder)
        num = len(os.listdir(loc))
        freq_map[folder] = num

# Column vector of the counts (one row per class), in freq_map insertion order.
freq_map_array = np.array(list(freq_map.values())).reshape(-1, 1)

# Recursive file count for each immediate subfolder of the dataset root.
subfolder_counts = [
    count_files(os.path.join(parent_folder, subfolder))
    for subfolder in os.listdir(parent_folder)
    if os.path.isdir(os.path.join(parent_folder, subfolder))
]
total_count = sum(subfolder_counts)

print(f'Total: {total_count} amongst {len(subfolder_counts)} classes')
print("Number of files in each subfolder:", subfolder_counts)

for i, count in enumerate(subfolder_counts):
    print("Class {}: {:.2f}%".format(i, 100 * count / total_count))

# Keep the trailing space inside the qualifier so the balanced case does not
# print a double space ("The classes are  balanced.").
balance_qualifier = "" if is_balanced(subfolder_counts) else "not "
print(f'The classes are {balance_qualifier}balanced.')

# Copy rather than alias: sorting num_images in place later must not reorder
# subfolder_counts, whose order matches the "Class i" listing printed above.
num_images = list(subfolder_counts)

Total: 19388 amongst 82 classes
Number of files in each subfolder: [80, 260, 133, 71, 309, 312, 313, 240, 179, 153, 312, 159, 278, 288, 54, 83, 155, 318, 224, 305, 260, 99, 680, 397, 349, 296, 73, 146, 800, 233, 414, 316, 127, 396, 333, 185, 39, 174, 86, 132, 291, 150, 212, 230, 305, 188, 337, 272, 151, 314, 254, 312, 132, 314, 394, 266, 184, 65, 82, 143, 226, 47, 225, 232, 534, 132, 184, 196, 178, 273, 323, 232, 71, 528, 281, 188, 162, 267, 180, 189, 84, 299]
Class 0: 0.41%
Class 1: 1.34%
Class 2: 0.69%
Class 3: 0.37%
Class 4: 1.59%
Class 5: 1.61%
Class 6: 1.61%
Class 7: 1.24%
Class 8: 0.92%
Class 9: 0.79%
Class 10: 1.61%
Class 11: 0.82%
Class 12: 1.43%
Class 13: 1.49%
Class 14: 0.28%
Class 15: 0.43%
Class 16: 0.80%
Class 17: 1.64%
Class 18: 1.16%
Class 19: 1.57%
Class 20: 1.34%
Class 21: 0.51%
Class 22: 3.51%
Class 23: 2.05%
Class 24: 1.80%
Class 25: 1.53%
Class 26: 0.38%
Class 27: 0.75%
Class 28: 4.13%
Class 29: 1.20%
Class 30: 2.14%
Class 31: 1.63%
Class 32: 0.66%
Class 33: 2.04%
C

Apply K-means clustering to the per-class image counts to minimize the variance within each cluster

In [None]:
# Group the classes into three clusters by image count.
# n_init is pinned because scikit-learn changed its default (10 -> 'auto'),
# which would otherwise let the cluster assignment vary between versions.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0).fit(freq_map_array)
# cluster_labels[i] is the cluster index of the i-th row of freq_map_array.
cluster_labels = kmeans.labels_

In [None]:
# Divide the frequency map into 3 dictionaries based on cluster labels.
# freq_map_array was built from freq_map.values(), so the i-th label belongs to
# the i-th (class name, count) item of freq_map; pairing them once with zip
# avoids a linear key lookup for every item.
labelled_counts = list(zip(freq_map.items(), cluster_labels))
cluster_dict_1 = {name: count for (name, count), label in labelled_counts if label == 0}
cluster_dict_2 = {name: count for (name, count), label in labelled_counts if label == 1}
cluster_dict_3 = {name: count for (name, count), label in labelled_counts if label == 2}


def _sample_stdev(values):
    """Return the sample standard deviation, or 0.0 when fewer than two values exist."""
    # statistics.stdev raises StatisticsError for a single-member cluster.
    return statistics.stdev(values) if len(values) > 1 else 0.0


cluster_1_counts = list(cluster_dict_1.values())
cluster_2_counts = list(cluster_dict_2.values())
cluster_3_counts = list(cluster_dict_3.values())

# Standard deviation of the image counts in each cluster.
cluster_1_std = _sample_stdev(cluster_1_counts)
cluster_2_std = _sample_stdev(cluster_2_counts)
cluster_3_std = _sample_stdev(cluster_3_counts)

# Mean image count of each cluster.
cluster_1_avg = sum(cluster_1_counts) / len(cluster_1_counts)
cluster_2_avg = sum(cluster_2_counts) / len(cluster_2_counts)
cluster_3_avg = sum(cluster_3_counts) / len(cluster_3_counts)

# Percentage variation from the centroid (standard deviation relative to the mean).
cluster_1_percent_var = (cluster_1_std / cluster_1_avg) * 100
cluster_2_percent_var = (cluster_2_std / cluster_2_avg) * 100
cluster_3_percent_var = (cluster_3_std / cluster_3_avg) * 100

# Report the variation and the number of classes in each cluster.
print("Percentage variation from the centroid for Cluster 1: ", cluster_1_percent_var, f'({len(cluster_dict_1)})')
print("Percentage variation from the centroid for Cluster 2: ", cluster_2_percent_var, f'({len(cluster_dict_2)})')
print("Percentage variation from the centroid for Cluster 3: ", cluster_3_percent_var, f'({len(cluster_dict_3)})')

# Rebind to a sorted copy instead of sorting in place: num_images may share its
# list object with subfolder_counts, and an in-place sort would silently
# reorder that list as well.
num_images = sorted(num_images)
print('*' * 80)
print(f'{num_images} = {sum(num_images)}')

Percentage variation from the centroid for Cluster 1:  16.80284527767622 (40)
Percentage variation from the centroid for Cluster 2:  37.28837032013109 (38)
Percentage variation from the centroid for Cluster 3:  20.496440337929727 (4)
********************************************************************************
[39, 47, 54, 65, 71, 71, 73, 80, 82, 83, 84, 86, 99, 127, 132, 132, 132, 133, 143, 146, 150, 151, 153, 155, 159, 162, 174, 178, 179, 180, 184, 184, 185, 188, 188, 189, 196, 212, 224, 225, 226, 230, 232, 232, 233, 240, 254, 260, 260, 266, 267, 272, 273, 278, 281, 288, 291, 296, 299, 305, 305, 309, 312, 312, 312, 313, 314, 314, 316, 318, 323, 333, 337, 349, 394, 396, 397, 414, 528, 534, 680, 800] = 19388


SMOTE

## Build a dict of class folders (folder name → absolute path)

In [None]:
def get_folder_dict(root_folder):
    """Map each directory name found anywhere under ``root_folder`` to its absolute path.

    The tree is walked recursively. Keys are bare directory names, so when the
    same name occurs more than once the path visited last is kept.
    """
    return {
        name: os.path.abspath(os.path.join(current_root, name))
        for current_root, subdirs, _ in os.walk(root_folder)
        for name in subdirs
    }

# Folder name -> absolute path for every directory under the dataset root.
subfolder_dict = get_folder_dict(parent_folder)
# Accumulator for (image, label) pairs; only the commented-out SMOTE
# experiment in the next cell appends to it.
image_data = []

In [None]:
# for class_label, subfolder_path in subfolder_dict.items():
#     for image_file in os.listdir(subfolder_path):
#         image = cv2.imread(os.path.join(subfolder_path, image_file))
#         image_data.append((image, class_label))

# image_data = np.array(image_data)

# images = image_data[:, 0]
# labels = image_data[:, 1]

# smote = SMOTE(sampling_strategy='minority', k_neighbors=5)
# images_resampled, labels_resampled = smote.fit_resample(images, labels)

Transformation

In [None]:
input_folder = '/content/Yoga82'
output_folder = '/content/Yoga82Mod'
train_folder = os.path.join(output_folder, 'train')
test_folder = os.path.join(output_folder, 'test')

# Every class is topped up to this many images (800 is the size of the
# largest class reported by the class-count cell).
target_num = 800
# Share of each balanced class that is copied into the training split.
train_fraction = 0.8

# Private RNG so source-image selection and the train/test shuffle are
# repeatable without touching the global `random` state.
rng = random.Random(0)

datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
    vertical_flip=False,
    fill_mode='nearest')


def _balance_class(class_name):
    """Copy a class's original images to output_folder and add augmented ones up to target_num."""
    src_dir = os.path.join(input_folder, class_name)
    dst_dir = os.path.join(output_folder, class_name)
    os.makedirs(dst_dir, exist_ok=True)

    # Sorted so the seeded RNG picks the same files regardless of listdir order.
    originals = sorted(
        name for name in os.listdir(src_dir)
        if os.path.isfile(os.path.join(src_dir, name))
    )
    if not originals:
        return  # nothing to copy and nothing to augment from

    # The originals must be part of the balanced set; without this copy a class
    # would end up with only (target_num - n) augmented images and the largest
    # class would be empty.
    for name in originals:
        shutil.copyfile(os.path.join(src_dir, name), os.path.join(dst_dir, name))

    # range() of a non-positive number is empty, so full classes are skipped.
    for i in range(target_num - len(originals)):
        source_path = os.path.join(src_dir, rng.choice(originals))
        pixels = img_to_array(load_img(source_path))
        batch = pixels.reshape((1,) + pixels.shape)
        # datagen.fit() is not needed: none of the data-dependent options
        # (featurewise centering/normalisation, ZCA whitening) are enabled.
        augmented = next(datagen.flow(batch, batch_size=1))[0]
        array_to_img(augmented).save(os.path.join(dst_dir, f'augmented_{i}.jpg'))


def _split_class(class_name):
    """Copy a balanced class's images into train/<class> and test/<class>."""
    class_dir = os.path.join(output_folder, class_name)
    images = sorted(os.listdir(class_dir))
    rng.shuffle(images)
    cut = int(train_fraction * len(images))

    for split_root, split_images in ((train_folder, images[:cut]), (test_folder, images[cut:])):
        # copyfile does not create parent directories, so make the per-class
        # destination folder first.
        split_dir = os.path.join(split_root, class_name)
        os.makedirs(split_dir, exist_ok=True)
        for name in split_images:
            shutil.copyfile(os.path.join(class_dir, name), os.path.join(split_dir, name))


# Step 1: build the balanced per-class folders.
for subfolder in sorted(os.listdir(input_folder)):
    if os.path.isdir(os.path.join(input_folder, subfolder)):
        _balance_class(subfolder)

# Step 2: split every balanced class into train and test.
# NOTE(review): augmentation happens before the split, so augmented variants of
# one source image can land in both train and test - confirm this is intended.
os.makedirs(train_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)

for subfolder in sorted(os.listdir(output_folder)):
    if subfolder in ['train', 'test']:
        continue
    if not os.path.isdir(os.path.join(output_folder, subfolder)):
        continue
    _split_class(subfolder)


NameError: ignored

In [None]:
# Number of output classes. It must equal the number of class folders the
# training generator sees; the class-count cell reports 82 for this dataset
# (the previous value, 107, did not match it).
num_classes = 82

# VGG16 convolutional base with ImageNet weights and without its original
# classifier head; frozen so only the new layers below are trainable.
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(224,224,3))
base_model.trainable = False

# New classification head: flatten -> 1024-unit ReLU -> dropout -> softmax.
x = base_model.output
x = Flatten()(x)
x = Dense(1024, activation='relu')(x)
x = Dropout(0.5)(x)
predictions = Dense(num_classes, activation='softmax')(x)

In [None]:
# model = Model(inputs=base_model.input, outputs=predictions)
# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# train_datagen = ImageDataGenerator(rescale=1./255)
# train_data = train_datagen.flow_from_directory('Yoga82', target_size=(224,224), batch_size=32, class_mode="categorical")

In [None]:
# history = model.fit_generator(train_data, steps_per_epoch=5994 // 32, epochs=7)

In [None]:
# model.save('transfer.h5')