In [1]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from PIL import Image
from numpy import asarray
import os
import numpy
import glob
from google.colab import files
from google.colab import drive
from sklearn.linear_model import LogisticRegression

In [3]:
#Gets Dataset Image data and classes
def loadDataSubSet(imageDirectory, imageSetPath):
    NUM_PIXELS = 49152 #128x128 pixels * 3 values for RGB
    CLASSES = ['aeroplane'] 

    #Open ImageSet File
    imageSetFile = open(imageSetPath, 'r')
    imgsInSet = [] 
    
    print("Starting to load subset of images in: ",imageSetPath)

    #Read in all Images in the ImageSet
    while (True):
        line = imageSetFile.readline().splitlines()
        #If end line - exit loop 
        if not line: 
            break

        #Convert the file name to a clean path to the associated file
        cleanLine = str(line)[1:-1].replace('\'', '')
        cleanPath = os.path.join(imageDirectory,'*/{}.png'.format(cleanLine))
        fullPath = glob.glob(cleanPath)
        cleanFullPath = str(fullPath)[1:-1].replace('\'', '')
        imgsInSet.append(cleanFullPath)
        #print(cleanFullPath)

    print("Starting to process images in: ",imageSetPath)

    #Load the images
    #Initialize the containers
    numImages = len(imgsInSet)
    x_data = numpy.empty([numImages, NUM_PIXELS])
    y_data = numpy.empty([numImages,])

    #For each image, get image in appropriate format for x_data 
        #and class for y_data
    i = 0
    for filename in imgsInSet:
        #Reshape data from image file
        x_data[i] = asarray(Image.open(filename)).flatten().reshape(1, -1)
        
        #Identify the class that the image is part of. Convert to int
        filePath = os.path.dirname(filename)
        className = os.path.basename(filePath)
        j = 0
        for c in CLASSES:
            if (c == className):
                y_data[i] = j
                break
            j = j + 1

        i = i + 1
    print("Done: ",imageSetPath)
    #print(x_data)
    y_data = y_data.astype(int)
    #print(y_data)
    return x_data, y_data

#Test loading digits dataset for comparison
def loadImagesTest():
    digits = load_digits()
    X_digits, y_digits = load_digits(return_X_y=True)
    print("[loadImagesTest]: X_digits Type: ",format(type(X_digits)))
    print("[loadImagesTest]: X_digits Shape: ",format(X_digits.shape))
    print("[loadImagesTest]: X_digit: ",format(X_digits))

    print("[loadImagesTest]: y_digits Type: ",format(type(y_digits)))
    print("[loadImagesTest]: y_digits Shape: ",format(y_digits.shape))
    print("[loadImagesTest]: y_digit: ",format(y_digits))
    #print("feature_names: ", format(digits.feature_names))
    #print("target_names: ", format(digits.target_names))
    X_train, X_test, y_train, y_test = train_test_split(X_digits, y_digits, random_state=42)
    print(X_train)
    print(X_test)
    print(y_train)
    print(y_test)

#loadImagesTest()

#Save Resized images and Imagesets to google drive
# drive.mount('https://drive.google.com/drive/shared-with-me')
# imageDirectory = r'drive/u/0/shared-with-me/ResizedPNGImagesSmall'
# imageSetsDirectory = r'drive/u/0/shared-with-me/ImageSets/Main'
#Save Resized images and Imagesets to google drive
drive.mount('/content/drive')
imageDirectory = r'drive/My Drive/ResizedPNGImagesSmall/ResizedPNGImagesSmall'
imageSetsDirectory = r'drive/My Drive/ImageSets/ImageSets/Main'


imageSetTrainPath = os.path.join(imageSetsDirectory,'train.txt')
imageSetValPath = os.path.join(imageSetsDirectory,'val.txt')

x_train, y_train = loadDataSubSet(imageDirectory, imageSetTrainPath)
x_test, y_test = loadDataSubSet(imageDirectory, imageSetValPath)

log_reg = LogisticRegression(multi_class="ovr", solver="lbfgs", max_iter=5000, random_state=42)
print("Training")
print(log_reg.fit(x_train, y_train))
print("End Training.... Starting Test")
print(log_reg.score(x_test, y_test))


Mounted at /content/drive
Starting to load subset of images in:  drive/My Drive/ImageSets/ImageSets/Main/train.txt
Starting to process images in:  drive/My Drive/ImageSets/ImageSets/Main/train.txt
Done:  drive/My Drive/ImageSets/ImageSets/Main/train.txt
Starting to load subset of images in:  drive/My Drive/ImageSets/ImageSets/Main/val.txt
Starting to process images in:  drive/My Drive/ImageSets/ImageSets/Main/val.txt
Done:  drive/My Drive/ImageSets/ImageSets/Main/val.txt
Training
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=5000,
                   multi_class='ovr', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
End Training.... Starting Test
0.5148548857976988


In [4]:
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline

In [5]:
pipeline = Pipeline([
    ("kmeans", KMeans(n_clusters=70, random_state=42)),
    ("log_reg", LogisticRegression(multi_class="ovr", solver="lbfgs", max_iter=5000, random_state=42)),
])
pipeline.fit(x_train, y_train)

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(memory=None,
         steps=[('kmeans',
                 KMeans(algorithm='auto', copy_x=True, init='k-means++',
                        max_iter=300, n_clusters=70, n_init=10, n_jobs=None,
                        precompute_distances='auto', random_state=42,
                        tol=0.0001, verbose=0)),
                ('log_reg',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=5000,
                                    multi_class='ovr', n_jobs=None,
                                    penalty='l2', random_state=42,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [6]:
pipeline.score(x_test, y_test)

0.5320281641765413