In [1]:
# for loading/processing the images  
from keras.preprocessing.image import load_img 
from keras.preprocessing.image import img_to_array 
from keras.applications.vgg16 import preprocess_input 

# models 
from keras.applications.vgg16 import VGG16 
from keras.models import Model

# clustering and dimension reduction
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# for everything else
import os
import numpy as np
import matplotlib.pyplot as plt
from random import randint
import pandas as pd
import pickle

In [8]:
path = r"D:/trial/dataset1/flower_images/flower_images"
# Later cells open the images (and flower_labels.csv) by bare filename,
# so the working directory has to be the image folder.
os.chdir(path)

# os.scandir yields entries in arbitrary, filesystem-dependent order; sorting by
# name keeps the image order (and every result derived from it) reproducible.
with os.scandir(path) as files:
    # keep only regular files that carry the .png extension
    flowers = sorted(
        file.name for file in files
        if file.is_file() and file.name.endswith('.png')
    )

print(flowers[:10])

['0001.png', '0002.png', '0003.png', '0004.png', '0005.png', '0006.png', '0007.png', '0008.png', '0009.png', '0010.png']


In [9]:
# # load the image as a 224x224 array
# img = load_img(flowers[0], target_size=(224,224))
# img = np.array(img)

# print(img.shape)

# reshaped_img = img.reshape(1,224,224,3)
# print(reshaped_img.shape)

# x = preprocess_input(reshaped_img)

# model = VGG16()
# # remove the output layer
# model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

# features = model.predict(x)
# print(features.shape)

In [10]:
# Build the feature extractor: VGG16 cut off at its second-to-last layer,
# so predictions are that layer's activations instead of the final output.
model = VGG16()
feature_layer = model.layers[-2]
model = Model(inputs=model.inputs, outputs=feature_layer.output)

In [11]:
def extract_features(file, model):
    """Return the feature vector that `model` predicts for one image file.

    Parameters
    ----------
    file : path of the image to load; it is resized to 224x224 on load.
    model : object whose ``predict`` accepts a (1, 224, 224, 3) batch.

    Returns
    -------
    Whatever ``model.predict`` returns for the single-image batch.
    """
    # load the image at 224x224 and convert 'PIL.Image.Image' -> numpy array
    img = np.array(load_img(file, target_size=(224,224)))
    # add the leading sample axis: reshape(num_of_samples, dim 1, dim 2, channels)
    batch = img.reshape(1,224,224,3)
    # apply the VGG16 input preprocessing
    batch = preprocess_input(batch)
    # `use_multiprocessing` only applies to generator/Sequence inputs (and Keras 3
    # no longer accepts it), so it is not passed for this in-memory array.
    # verbose=0 keeps predict from printing a progress bar for every single image.
    return model.predict(batch, verbose=0)

In [12]:
data = {}
p = r"D:/trial/flower_features.pkl"
failed = []  # images whose features could not be extracted

# loop through each image in the dataset
for flower in flowers:
    try:
        data[flower] = extract_features(flower, model)
    # `except Exception` rather than a bare `except`, so a kernel interrupt
    # (KeyboardInterrupt) still stops the loop instead of being swallowed
    except Exception as err:
        failed.append(flower)
        print(f"Skipping {flower}: {err!r}")
        # checkpoint the features extracted so far as a pickle file
        with open(p, 'wb') as fh:
            pickle.dump(data, fh)

if failed:
    print(f"{len(failed)} image(s) failed feature extraction")

# filenames, in the same order as the feature rows below
filenames = np.array(list(data.keys()))

# stack the per-image feature arrays into a single array
feat = np.array(list(data.values()))
print(feat.shape)

# flatten to one 4096-dimensional row per image
feat = feat.reshape(-1,4096)
print(feat.shape)

# get the unique labels (from flower_labels.csv)
df = pd.read_csv('flower_labels.csv')
label = df['label'].tolist()
unique_labels = list(set(label))
print(unique_labels)

(210, 1, 4096)
(210, 4096)
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [13]:
# Fit a 100-component PCA on the feature matrix, then project the same
# matrix onto those components (fit returns the estimator itself).
pca = PCA(random_state=22, n_components=100)
x = pca.fit(feat).transform(feat)

In [14]:
# report the feature dimensionality before and after the PCA projection
for caption, n_dims in (
    ("Components before PCA: ", feat.shape[1]),
    ("Components after PCA: ", pca.n_components),
):
    print(caption, n_dims)

Components before PCA:  4096
Components after PCA:  100


In [15]:
# Cluster the PCA-reduced features, using one cluster per distinct label value.
n_groups = len(unique_labels)
kmeans = KMeans(random_state=22, n_clusters=n_groups).fit(x)
# cluster id assigned to each row of x
kmeans.labels_

array([6, 6, 8, 1, 1, 9, 4, 1, 0, 1, 4, 6, 6, 7, 7, 9, 6, 1, 4, 4, 8, 2,
       7, 8, 4, 5, 8, 4, 3, 6, 6, 5, 4, 4, 3, 0, 4, 6, 8, 1, 6, 6, 7, 6,
       9, 2, 7, 6, 0, 9, 3, 2, 7, 6, 6, 5, 4, 1, 4, 0, 8, 9, 7, 7, 1, 4,
       6, 6, 8, 2, 4, 7, 6, 9, 4, 6, 5, 4, 0, 3, 9, 2, 3, 6, 1, 4, 8, 2,
       5, 2, 7, 9, 4, 8, 9, 4, 6, 6, 0, 6, 9, 8, 2, 4, 9, 8, 0, 4, 0, 6,
       4, 2, 8, 6, 4, 6, 0, 5, 1, 1, 4, 8, 2, 4, 9, 6, 8, 6, 0, 5, 4, 8,
       9, 2, 4, 0, 1, 5, 8, 2, 9, 2, 6, 4, 8, 7, 6, 2, 9, 3, 8, 2, 7, 4,
       9, 5, 3, 4, 3, 6, 4, 9, 3, 8, 0, 8, 7, 9, 9, 4, 9, 9, 3, 9, 0, 4,
       9, 2, 8, 7, 6, 4, 0, 8, 6, 6, 6, 0, 3, 8, 2, 7, 9, 4, 0, 3, 5, 4,
       8, 2, 3, 6, 4, 6, 2, 3, 1, 2, 6, 3])

In [16]:
# Bucket the filenames by the cluster id KMeans assigned to them.
groups = {}
for name, cluster_id in zip(filenames, kmeans.labels_):
    # setdefault creates the bucket the first time a cluster id is seen
    groups.setdefault(cluster_id, []).append(name)

# filenames in cluster 0
groups[0]

['0009.png',
 '0036.png',
 '0049.png',
 '0060.png',
 '0079.png',
 '0099.png',
 '0107.png',
 '0109.png',
 '0117.png',
 '0129.png',
 '0136.png',
 '0165.png',
 '0175.png',
 '0183.png',
 '0188.png',
 '0195.png']

In [17]:
# filenames in cluster 2 (uses the `groups` dict built in the previous cell)
groups[2]

['0022.png',
 '0046.png',
 '0052.png',
 '0070.png',
 '0082.png',
 '0088.png',
 '0090.png',
 '0103.png',
 '0112.png',
 '0123.png',
 '0134.png',
 '0140.png',
 '0142.png',
 '0148.png',
 '0152.png',
 '0178.png',
 '0191.png',
 '0200.png',
 '0205.png',
 '0208.png']

# Feature Extraction

## For original dataset

In [18]:
# for loading/processing the images  
from keras.preprocessing.image import load_img 
from keras.preprocessing.image import img_to_array 
from keras.applications.vgg16 import preprocess_input 

# models 
from keras.applications.vgg16 import VGG16 
from keras.models import Model

# clustering and dimension reduction
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# for everything else
import os
import numpy as np
import matplotlib.pyplot as plt
from random import randint
import pandas as pd
import pickle

In [19]:
path = r"D:/trial/dataset"
# Later cells open the images by bare filename,
# so the working directory has to be the image folder.
os.chdir(path)

# os.scandir yields entries in arbitrary, filesystem-dependent order; sorting by
# name keeps the image order (and every result derived from it) reproducible.
with os.scandir(path) as files:
    # keep only regular files that carry the .JPEG extension
    imgs = sorted(
        file.name for file in files
        if file.is_file() and file.name.endswith('.JPEG')
    )

In [20]:
# Build the feature extractor: VGG16 cut off at its second-to-last layer,
# so predictions are that layer's activations instead of the final output.
model = VGG16()
feature_layer = model.layers[-2]
model = Model(inputs=model.inputs, outputs=feature_layer.output)

In [21]:
def extract_features(file, model):
    """Return the feature vector that `model` predicts for one image file.

    Parameters
    ----------
    file : path of the image to load; it is resized to 224x224 on load.
    model : object whose ``predict`` accepts a (1, 224, 224, 3) batch.

    Returns
    -------
    Whatever ``model.predict`` returns for the single-image batch.
    """
    # load the image at 224x224 and convert 'PIL.Image.Image' -> numpy array
    img = np.array(load_img(file, target_size=(224,224)))
    # add the leading sample axis: reshape(num_of_samples, dim 1, dim 2, channels)
    batch = img.reshape(1,224,224,3)
    # apply the VGG16 input preprocessing
    batch = preprocess_input(batch)
    # `use_multiprocessing` only applies to generator/Sequence inputs (and Keras 3
    # no longer accepts it), so it is not passed for this in-memory array.
    # verbose=0 keeps predict from printing a progress bar for every single image.
    return model.predict(batch, verbose=0)

In [22]:
data = {}
# NOTE(review): this is the same pickle path the flower run above uses, so a
# checkpoint written here would overwrite that file - confirm this is intended.
p = r"D:/trial/flower_features.pkl"
failed = []  # images whose features could not be extracted

# loop through each image in the dataset
for img in imgs:
    try:
        data[img] = extract_features(img, model)
    # `except Exception` rather than a bare `except`, so a kernel interrupt
    # (KeyboardInterrupt) still stops the loop instead of being swallowed
    except Exception as err:
        failed.append(img)
        print(f"Skipping {img}: {err!r}")
        # checkpoint the features extracted so far as a pickle file
        with open(p, 'wb') as fh:
            pickle.dump(data, fh)

if failed:
    print(f"{len(failed)} image(s) failed feature extraction")

# filenames, in the same order as the feature rows below
filenames = np.array(list(data.keys()))

# stack the per-image feature arrays into a single array
feat = np.array(list(data.values()))
print(feat.shape)

# flatten to one 4096-dimensional row per image
feat = feat.reshape(-1,4096)
print(feat.shape)

# get the unique labels (the distinct `unique_id` values in class_labels.csv)
df = pd.read_csv('D:/trial/class_labels.csv')
label = df['unique_id'].tolist()
unique_labels = list(set(label))
# print(unique_labels)

(1582, 1, 4096)
(1582, 4096)


In [23]:
# Fit a 100-component PCA on the feature matrix, then project the same
# matrix onto those components (fit returns the estimator itself).
pca = PCA(random_state=22, n_components=100)
x = pca.fit(feat).transform(feat)

In [24]:
# report the feature dimensionality before and after the PCA projection
for caption, n_dims in (
    ("Components before PCA: ", feat.shape[1]),
    ("Components after PCA: ", pca.n_components),
):
    print(caption, n_dims)

Components before PCA:  4096
Components after PCA:  100


In [25]:
# Cluster the PCA-reduced features, using one cluster per distinct label value.
n_groups = len(unique_labels)
kmeans = KMeans(random_state=22, n_clusters=n_groups).fit(x)
# cluster id assigned to each row of x
kmeans.labels_

array([116, 369, 695, ..., 118, 413, 251])

In [1]:
# Bucket the filenames by the cluster id KMeans assigned to them.
# Requires `filenames` and `kmeans` from the cells above to exist in the kernel.
groups = {}
for name, cluster_id in zip(filenames, kmeans.labels_):
    # setdefault creates the bucket the first time a cluster id is seen
    groups.setdefault(cluster_id, []).append(name)

# filenames in cluster 3
groups[3]

NameError: name 'filenames' is not defined

## After applying PCA

In [28]:
# for loading/processing the images  
from keras.preprocessing.image import load_img 
from keras.preprocessing.image import img_to_array 
from keras.applications.vgg16 import preprocess_input 

# models 
from keras.applications.vgg16 import VGG16 
from keras.models import Model

# clustering and dimension reduction
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# for everything else
import os
import numpy as np
import matplotlib.pyplot as plt
from random import randint
import pandas as pd
import pickle

In [31]:
path1 = r"D:/trial/OUTPUT/pca"
# Later cells open the images by bare filename,
# so the working directory has to be the image folder.
os.chdir(path1)

# os.scandir yields entries in arbitrary, filesystem-dependent order; sorting by
# name keeps the image order (and every result derived from it) reproducible.
with os.scandir(path1) as files1:
    entries1 = sorted(files1, key=lambda entry: entry.name)
    # keep only regular files that carry the .jpeg extension
    pca_imgs = [
        file1.name for file1 in entries1
        if file1.is_file() and file1.name.endswith('.jpeg')
    ]

# total number of directory entries, whether or not they are images
count = len(entries1)

print(count)
print(pca_imgs[:10])

1618
['image00000.jpeg', 'image00001.jpeg', 'image00002.jpeg', 'image00003.jpeg', 'image00004.jpeg', 'image00005.jpeg', 'image00006.jpeg', 'image00007.jpeg', 'image00008.jpeg', 'image00009.jpeg']


In [32]:
# Build the feature extractor: VGG16 cut off at its second-to-last layer,
# so predictions are that layer's activations instead of the final output.
model = VGG16()
feature_layer = model.layers[-2]
model = Model(inputs=model.inputs, outputs=feature_layer.output)

In [33]:
def extract_features(file, model):
    """Return the feature vector that `model` predicts for one image file.

    Parameters
    ----------
    file : path of the image to load; it is resized to 224x224 on load.
    model : object whose ``predict`` accepts a (1, 224, 224, 3) batch.

    Returns
    -------
    Whatever ``model.predict`` returns for the single-image batch.
    """
    # load the image at 224x224 and convert 'PIL.Image.Image' -> numpy array
    img = np.array(load_img(file, target_size=(224,224)))
    # add the leading sample axis: reshape(num_of_samples, dim 1, dim 2, channels)
    batch = img.reshape(1,224,224,3)
    # apply the VGG16 input preprocessing
    batch = preprocess_input(batch)
    # `use_multiprocessing` only applies to generator/Sequence inputs (and Keras 3
    # no longer accepts it), so it is not passed for this in-memory array.
    # verbose=0 keeps predict from printing a progress bar for every single image.
    return model.predict(batch, verbose=0)

In [35]:
data = {}
p = r"D:/trial/pca_features.pkl"
failed = []  # images whose features could not be extracted

# loop through each image in the dataset
for pca_img in pca_imgs:
    try:
        data[pca_img] = extract_features(pca_img, model)
    # `except Exception` rather than a bare `except`, so a kernel interrupt
    # (KeyboardInterrupt) still stops the loop instead of being swallowed
    except Exception as err:
        failed.append(pca_img)
        print(f"Skipping {pca_img}: {err!r}")
        # checkpoint the features extracted so far as a pickle file
        with open(p, 'wb') as fh:
            pickle.dump(data, fh)

if failed:
    print(f"{len(failed)} image(s) failed feature extraction")

# filenames, in the same order as the feature rows below
filenames = np.array(list(data.keys()))

# stack the per-image feature arrays into a single array
feat = np.array(list(data.values()))
print(feat.shape)

# flatten to one 4096-dimensional row per image
feat = feat.reshape(-1,4096)
print(feat.shape)

# get the unique labels (the distinct `unique_id` values in class_labels.csv)
df = pd.read_csv('D:/trial/class_labels.csv')
label = df['unique_id'].tolist()
unique_labels = list(set(label))
# print(unique_labels)

(1618, 1, 4096)
(1618, 4096)


In [49]:
# Fit a 60-component PCA on the feature matrix, then project the same
# matrix onto those components (fit returns the estimator itself).
pca = PCA(random_state=22, n_components=60)
x = pca.fit(feat).transform(feat)

In [50]:
# report the feature dimensionality before and after the PCA projection
for caption, n_dims in (
    ("Components before PCA: ", feat.shape[1]),
    ("Components after PCA: ", pca.n_components),
):
    print(caption, n_dims)

Components before PCA:  4096
Components after PCA:  60


In [51]:
# Cluster the PCA-reduced features, using one cluster per distinct label value.
n_groups = len(unique_labels)
kmeans = KMeans(random_state=22, n_clusters=n_groups).fit(x)
# cluster id assigned to each row of x
kmeans.labels_

array([151, 291, 191, ..., 707, 287, 856])

In [60]:
# Bucket the filenames by the cluster id KMeans assigned to them.
groups = {}
for name, cluster_id in zip(filenames, kmeans.labels_):
    # setdefault creates the bucket the first time a cluster id is seen
    groups.setdefault(cluster_id, []).append(name)

# filenames in cluster 3
groups[3]

['image00194.jpeg',
 'image00217.jpeg',
 'image00237.jpeg',
 'image00239.jpeg',
 'image00833.jpeg',
 'image01000.jpeg',
 'image01126.jpeg']

## After applying noise removal on output of PCA

In [61]:
# for loading/processing the images  
from keras.preprocessing.image import load_img 
from keras.preprocessing.image import img_to_array 
from keras.applications.vgg16 import preprocess_input 

# models 
from keras.applications.vgg16 import VGG16 
from keras.models import Model

# clustering and dimension reduction
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# for everything else
import os
import numpy as np
import matplotlib.pyplot as plt
from random import randint
import pandas as pd
import pickle

In [62]:
path2 = r"D:/trial/OUTPUT/noise-removed-pca"
# Later cells open the images by bare filename,
# so the working directory has to be the image folder.
os.chdir(path2)

# os.scandir yields entries in arbitrary, filesystem-dependent order; sorting by
# name keeps the image order (and every result derived from it) reproducible.
with os.scandir(path2) as files2:
    entries2 = sorted(files2, key=lambda entry: entry.name)
    # keep only regular files that carry the .jpeg extension
    no_noise_imgs = [
        file2.name for file2 in entries2
        if file2.is_file() and file2.name.endswith('.jpeg')
    ]

# total number of directory entries, whether or not they are images
count = len(entries2)

print(count)
print(no_noise_imgs[:10])

1618
['image01618.jpeg', 'image01619.jpeg', 'image01620.jpeg', 'image01621.jpeg', 'image01622.jpeg', 'image01623.jpeg', 'image01624.jpeg', 'image01625.jpeg', 'image01626.jpeg', 'image01627.jpeg']


In [63]:
# Build the feature extractor: VGG16 cut off at its second-to-last layer,
# so predictions are that layer's activations instead of the final output.
model = VGG16()
feature_layer = model.layers[-2]
model = Model(inputs=model.inputs, outputs=feature_layer.output)

In [64]:
def extract_features(file, model):
    """Return the feature vector that `model` predicts for one image file.

    Parameters
    ----------
    file : path of the image to load; it is resized to 224x224 on load.
    model : object whose ``predict`` accepts a (1, 224, 224, 3) batch.

    Returns
    -------
    Whatever ``model.predict`` returns for the single-image batch.
    """
    # load the image at 224x224 and convert 'PIL.Image.Image' -> numpy array
    img = np.array(load_img(file, target_size=(224,224)))
    # add the leading sample axis: reshape(num_of_samples, dim 1, dim 2, channels)
    batch = img.reshape(1,224,224,3)
    # apply the VGG16 input preprocessing
    batch = preprocess_input(batch)
    # `use_multiprocessing` only applies to generator/Sequence inputs (and Keras 3
    # no longer accepts it), so it is not passed for this in-memory array.
    # verbose=0 keeps predict from printing a progress bar for every single image.
    return model.predict(batch, verbose=0)

In [65]:
data = {}
p = r"D:/trial/no_noise_features.pkl"
failed = []  # images whose features could not be extracted

# loop through each image in the dataset
for no_noise_img in no_noise_imgs:
    try:
        data[no_noise_img] = extract_features(no_noise_img, model)
    # `except Exception` rather than a bare `except`, so a kernel interrupt
    # (KeyboardInterrupt) still stops the loop instead of being swallowed
    except Exception as err:
        failed.append(no_noise_img)
        print(f"Skipping {no_noise_img}: {err!r}")
        # checkpoint the features extracted so far as a pickle file
        with open(p, 'wb') as fh:
            pickle.dump(data, fh)

if failed:
    print(f"{len(failed)} image(s) failed feature extraction")

# filenames, in the same order as the feature rows below
filenames = np.array(list(data.keys()))

# stack the per-image feature arrays into a single array
feat = np.array(list(data.values()))
print(feat.shape)

# flatten to one 4096-dimensional row per image
feat = feat.reshape(-1,4096)
print(feat.shape)

# get the unique labels (the distinct `unique_id` values in class_labels.csv)
df = pd.read_csv('D:/trial/class_labels.csv')
label = df['unique_id'].tolist()
unique_labels = list(set(label))
# print(unique_labels)

(1618, 1, 4096)
(1618, 4096)


In [66]:
# Fit a 60-component PCA on the feature matrix, then project the same
# matrix onto those components (fit returns the estimator itself).
pca = PCA(random_state=22, n_components=60)
x = pca.fit(feat).transform(feat)

In [67]:
# report the feature dimensionality before and after the PCA projection
for caption, n_dims in (
    ("Components before PCA: ", feat.shape[1]),
    ("Components after PCA: ", pca.n_components),
):
    print(caption, n_dims)

Components before PCA:  4096
Components after PCA:  60


In [77]:
# Cluster the PCA-reduced features, using one cluster per distinct label value.
n_groups = len(unique_labels)
kmeans = KMeans(random_state=22, n_clusters=n_groups).fit(x)
# cluster id assigned to each row of x
kmeans.labels_

array([609, 207, 669, ..., 231, 610, 393])

In [78]:
# Bucket the filenames by the cluster id KMeans assigned to them.
groups = {}
for file, cluster in zip(filenames, kmeans.labels_):
    # setdefault creates the bucket the first time a cluster id is seen
    groups.setdefault(cluster, []).append(file)

# filenames in cluster 0
# NOTE(review): the cell previously ended in `groups[]`, an empty subscript that is
# a SyntaxError and stops the whole cell from running. Cluster 0 is used because
# that is what the original comment named; the saved output may have come from a
# different cluster id - confirm.
groups[0]

['image03203.jpeg',
 'image03212.jpeg',
 'image03221.jpeg',
 'image03228.jpeg',
 'image03233.jpeg']

## Only resized and grayscaled

In [90]:
path3 = r"D:/trial/OUTPUT/resized"
# Later cells open the images by bare filename,
# so the working directory has to be the image folder.
os.chdir(path3)

# os.scandir yields entries in arbitrary, filesystem-dependent order; sorting by
# name keeps the image order (and every result derived from it) reproducible.
with os.scandir(path3) as files3:
    entries3 = sorted(files3, key=lambda entry: entry.name)
    # keep only regular files that carry the .jpeg extension
    resized_imgs = [
        file3.name for file3 in entries3
        if file3.is_file() and file3.name.endswith('.jpeg')
    ]

# total number of directory entries, whether or not they are images
count = len(entries3)

print(count)
print(resized_imgs[:10])

1618
['image00000.jpeg', 'image00001.jpeg', 'image00002.jpeg', 'image00003.jpeg', 'image00004.jpeg', 'image00005.jpeg', 'image00006.jpeg', 'image00007.jpeg', 'image00008.jpeg', 'image00009.jpeg']


In [91]:
# Build the feature extractor: VGG16 cut off at its second-to-last layer,
# so predictions are that layer's activations instead of the final output.
model = VGG16()
feature_layer = model.layers[-2]
model = Model(inputs=model.inputs, outputs=feature_layer.output)

In [92]:
def extract_features(file, model):
    """Return the feature vector that `model` predicts for one image file.

    Parameters
    ----------
    file : path of the image to load; it is resized to 224x224 on load.
    model : object whose ``predict`` accepts a (1, 224, 224, 3) batch.

    Returns
    -------
    Whatever ``model.predict`` returns for the single-image batch.
    """
    # load the image at 224x224 and convert 'PIL.Image.Image' -> numpy array
    img = np.array(load_img(file, target_size=(224,224)))
    # add the leading sample axis: reshape(num_of_samples, dim 1, dim 2, channels)
    batch = img.reshape(1,224,224,3)
    # apply the VGG16 input preprocessing
    batch = preprocess_input(batch)
    # `use_multiprocessing` only applies to generator/Sequence inputs (and Keras 3
    # no longer accepts it), so it is not passed for this in-memory array.
    # verbose=0 keeps predict from printing a progress bar for every single image.
    return model.predict(batch, verbose=0)

In [93]:
data = {}
p = r"D:/trial/resized_features.pkl"
failed = []  # images whose features could not be extracted

# loop through each image in the dataset
for resized_img in resized_imgs:
    try:
        data[resized_img] = extract_features(resized_img, model)
    # `except Exception` rather than a bare `except`, so a kernel interrupt
    # (KeyboardInterrupt) still stops the loop instead of being swallowed
    except Exception as err:
        failed.append(resized_img)
        print(f"Skipping {resized_img}: {err!r}")
        # checkpoint the features extracted so far as a pickle file
        with open(p, 'wb') as fh:
            pickle.dump(data, fh)

if failed:
    print(f"{len(failed)} image(s) failed feature extraction")

# filenames, in the same order as the feature rows below
filenames = np.array(list(data.keys()))

# stack the per-image feature arrays into a single array
feat = np.array(list(data.values()))
print(feat.shape)

# flatten to one 4096-dimensional row per image
feat = feat.reshape(-1,4096)
print(feat.shape)

# get the unique labels (the distinct `unique_id` values in class_labels.csv)
df = pd.read_csv('D:/trial/class_labels.csv')
label = df['unique_id'].tolist()
unique_labels = list(set(label))
# print(unique_labels)

(1618, 1, 4096)
(1618, 4096)


In [94]:
# Fit a 60-component PCA on the feature matrix, then project the same
# matrix onto those components (fit returns the estimator itself).
pca = PCA(random_state=22, n_components=60)
x = pca.fit(feat).transform(feat)

In [95]:
# report the feature dimensionality before and after the PCA projection
for caption, n_dims in (
    ("Components before PCA: ", feat.shape[1]),
    ("Components after PCA: ", pca.n_components),
):
    print(caption, n_dims)

Components before PCA:  4096
Components after PCA:  60


In [98]:
# Cluster the PCA-reduced features, using one cluster per distinct label value.
n_groups = len(unique_labels)
kmeans = KMeans(random_state=22, n_clusters=n_groups).fit(x)
# print every assigned cluster id as a plain Python list
array = [*kmeans.labels_]
print(array)

[744, 10, 413, 10, 372, 290, 674, 290, 266, 237, 266, 295, 266, 237, 10, 730, 964, 266, 956, 675, 237, 288, 675, 674, 674, 230, 399, 945, 290, 266, 705, 10, 456, 266, 39, 461, 558, 413, 503, 503, 503, 242, 541, 242, 149, 160, 585, 39, 288, 737, 704, 161, 149, 607, 91, 860, 405, 753, 558, 780, 597, 126, 160, 763, 383, 614, 787, 700, 645, 161, 313, 937, 106, 361, 801, 372, 313, 313, 313, 313, 937, 372, 849, 106, 149, 796, 313, 196, 149, 313, 39, 23, 447, 849, 624, 709, 194, 313, 194, 313, 593, 196, 313, 372, 937, 675, 456, 326, 149, 148, 194, 38, 645, 384, 849, 80, 4, 643, 395, 849, 675, 820, 149, 83, 148, 732, 643, 643, 4, 148, 329, 332, 948, 372, 536, 129, 82, 873, 643, 494, 240, 849, 911, 148, 332, 606, 240, 148, 326, 888, 849, 26, 430, 313, 643, 26, 937, 935, 937, 675, 129, 149, 240, 129, 38, 100, 828, 795, 80, 262, 432, 935, 262, 767, 167, 332, 100, 174, 536, 106, 911, 38, 313, 911, 849, 38, 26, 911, 198, 332, 23, 592, 603, 641, 580, 435, 844, 240, 650, 498, 886, 723, 168, 791, 982,

In [105]:
# Bucket the filenames by the cluster id KMeans assigned to them.
groups = {}
for name, cluster_id in zip(filenames, kmeans.labels_):
    # setdefault creates the bucket the first time a cluster id is seen
    groups.setdefault(cluster_id, []).append(name)

# filenames in cluster 19
groups[19]

['image01585.jpeg',
 'image01587.jpeg',
 'image01592.jpeg',
 'image01594.jpeg',
 'image01598.jpeg',
 'image01602.jpeg',
 'image01603.jpeg',
 'image01604.jpeg',
 'image01609.jpeg',
 'image01610.jpeg',
 'image01613.jpeg',
 'image01615.jpeg']