To run:

Download:
    Fruits 360 dataset from kaggle
    list1.txt
    list2.txt
    list3.txt
    list4.txt
    list5.txt
    list6.txt
    real1.txt
    realBananaTest
    
Goals:
    Find algorithms that are good at distinguishing between different 
    types of fruit based on their greyscale images. There are two 
    datasets. The first is from kaggle and the second is a hand made
    one taken from random pictures on the internet.
    
Group: Nick Weinel, Danny Harcourt

Findings: Algorithms SVM, Decision Tree, Grid Search, and GaussianNB all performed better than the Neural Network.

When the algorithms were trained with the kaggle dataset and tested against the real world banana images, only the Gaussian performed well. All of the other algorithms had little to no success.


In [9]:
import numpy as np
import pandas as pd
from scipy import misc, signal, ndimage
import matplotlib.pyplot as plt

In [3]:
# Make a 2D numpy array
# Allocate enough space for each fruit
nBanana = 490
nAvocado = 427
nPom = 492
nLemon = 492
nStrawberry = 492
nOrange = 479
n = nBanana + nAvocado + nPom + nLemon + nStrawberry + nOrange

# first 10K columns: image pixel values, last column: target (which fruit)
nCols = 10000 + 1

# declare a 2darray to hold the images and their targets
fruits = np.zeros((n, nCols))


In [98]:
print(fruits.shape)

(2872, 10001)


In [5]:
# Convert Image to greyscale
def rgb2gray(rgb):
    return np.dot(rgb[...,:3], [0.299, 0.587, 0.114])

In [99]:
# Add banana images to array
bananaList = open('list1.txt')

for r in range(nBanana):
    
        # read file data to array
        img = bananaList.readline()[:-1]
        banana = plt.imread('fruits/Training/' + img)
        
        # convert array to greyscale
        graybanana = rgb2gray(banana)
        graybanana = graybanana.reshape(10000,1)
        
        # Add to 2D array
        for c in range(10000):
            fruits[r][c] = graybanana[c][0]
        fruits[r][10000] = 0

In [100]:
# Add avacado images to array
avocadoList = open('list2.txt')

for r in range(nAvocado):
    
        # read file data to array
        img = avocadoList.readline()[:-1]
        avocado = plt.imread('fruits/Training/Avocado/' + img)
        
        # convert array to greyscale
        grayavocado = rgb2gray(avocado)
        grayavocado = grayavocado.reshape(10000, 1)
        
        # Add to 2D array
        for c in range(10000):
            fruits[r+nBanana][c] = grayavocado[c][0]
        fruits[r+nBanana][10000] = 1

In [101]:
# Add pomegranate images to array
pomegranateList = open('list3.txt')

for r in range(nPom):
    
        # read file data to array
        img = pomegranateList.readline()[:-1]
        pomegranate = plt.imread('fruits/Training/Pomegranate/' + img)
        
        # convert array to greyscale
        graypomegranate = rgb2gray(pomegranate)
        graypomegranate = graypomegranate.reshape(10000,1)
        
        # Add to 2D array
        for c in range(10000):
            fruits[r+nBanana+nAvocado][c] = graypomegranate[c][0]
        fruits[r+nBanana+nAvocado][10000] = 2

In [102]:
# Add lemon images to array
lemonList = open('list4.txt')

for r in range(nLemon):
    
        # read file data to array
        img = lemonList.readline()[:-1]
        lemon = plt.imread('fruits/Training/Lemon/' + img)
        
        # convert array to greyscale
        graylemon = rgb2gray(lemon)
        graylemon = graylemon.reshape(10000,1)
        
        # Add to 2D array
        for c in range(10000):
            fruits[r+nBanana+nAvocado+nPom][c] = graylemon[c][0]
        fruits[r+nBanana+nAvocado+nPom][10000] = 3

In [103]:
# Add strawberry images to array
strawberryList = open('list5.txt')

for r in range(nStrawberry):
    
        # read file data to array
        img = strawberryList.readline()[:-1]
        strawberry = plt.imread('fruits/Training/Strawberry/' + img)
        
        # convert array to greyscale
        graystrawberry = rgb2gray(strawberry)
        graystrawberry = graystrawberry.reshape(10000,1)

        # Add to 2D array
        for c in range(10000):
            fruits[r+nBanana+nAvocado+nPom+nLemon][c] = graystrawberry[c][0]
        fruits[r+nBanana+nAvocado+nPom+nLemon][10000] = 4

In [104]:
# Add orange images to array
orangeList = open('list6.txt')

for r in range(nOrange):
    
        # read file data to array
        img = orangeList.readline()[:-1]
        orange = plt.imread('fruits/Training/Orange/' + img)
        
        # convert array to greyscale
        grayorange = rgb2gray(orange)
        grayorange = grayorange.reshape(10000,1)
        
        # Add to 2D array
        for c in range(10000):
            fruits[r+nBanana+nAvocado+nPom+nLemon+nStrawberry][c] = grayorange[c][0]
        fruits[r+nBanana+nAvocado+nPom+nLemon+nStrawberry][10000] = 5

In [106]:
print() # End of data preprocessing




In [38]:
# SVM analysis

from sklearn.metrics import classification_report

Xfruits = fruits[:, 0:10000]
Yfruits = fruits[:, 10000]

# train test split
from sklearn.cross_validation import train_test_split
Xtrain, Xtest, Ytrain, Ytest = train_test_split(Xfruits, Yfruits, random_state=0)

from sklearn.svm import SVC

model2 = SVC(kernel='linear', C=1, gamma=1)

# train
model2.fit(Xtrain, ytrain)

Yhat_train = model2.predict(Xtrain)
Yhat_test = model2.predict(Xtest)

# analyze
print('SVC train R^2 score =', model2.score(Xtrain, Yhat_train))
print('SVC test  R^2 score =', model2.score(Xtest, Ytest))

report2test = classification_report(Ytest, Yhat_test, target_names={'banana','avocado','pomegranate','lemon','strawberry','orange'})
print(report2test)

SVC train R^2 score = 1.0
SVC test  R^2 score = 1.0
             precision    recall  f1-score   support

pomegranate       1.00      1.00      1.00       120
      lemon       1.00      1.00      1.00       118
    avocado       1.00      1.00      1.00       123
 strawberry       1.00      1.00      1.00       107
     banana       1.00      1.00      1.00       116
     orange       1.00      1.00      1.00       134

avg / total       1.00      1.00      1.00       718



In [50]:
# Gaussian Naive Bayes analysis

from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(Xtrain, ytrain)

Yhat_train = model.predict(Xtrain)
Yhat_test = model.predict(Xtest)

print('Bayes train R^2 score =', model.score(Xtrain, ytrain))
print('Bayes test  R^2 score =', model.score(Xtest, ytest))

report2test = classification_report(Ytest, Yhat_test, target_names={'banana','avocado','pomegranate','lemon','strawberry','orange'})
print(report2test)

Bayes train R^2 score = 0.9935004642525533
Bayes test  R^2 score = 0.9874651810584958
             precision    recall  f1-score   support

pomegranate       0.99      1.00      1.00       120
      lemon       1.00      0.95      0.97       118
    avocado       0.94      1.00      0.97       123
 strawberry       1.00      0.99      1.00       107
     banana       1.00      1.00      1.00       116
     orange       1.00      0.99      0.99       134

avg / total       0.99      0.99      0.99       718



In [51]:
# Neural Network analysis

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2))
mlp.fit(Xtrain, ytrain)
y_mlp = mlp.predict(Xtest)  # predict on new data
print('Accuracy Score:', accuracy_score(ytest, y_mlp))

Accuracy Score: 0.149025069637883


In [52]:
# Grid Search analysis

Xfruits = fruits[:, 0:10000]
Yfruits = fruits[:, 10000]

# train test split
Xtrain, Xtest, Ytrain, Ytest = train_test_split(Xfruits, Yfruits, random_state=0)


from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV


params = [ {'alpha': [0.001, 0.1, 1, 10, 100,1000, 10000, 100000, 1000000]} ]
rm = Ridge()
grid = GridSearchCV(rm, params, cv=4)  # cv: # of folds

grid.fit(Xtrain, Ytrain)

grid.best_estimator_
scores = grid.cv_results_
scores['mean_test_score'] 

array([0.98337559, 0.9833756 , 0.98337563, 0.9833759 , 0.98337865,
       0.98340521, 0.98360932, 0.98424056, 0.9831563 ])

In [53]:
#Decision Tree analysis

Xfruits = fruits[:, 0:10000]
Yfruits = fruits[:, 10000]

# train test split
Xtrain, Xtest, Ytrain, Ytest = train_test_split(Xfruits, Yfruits, random_state=0)

from sklearn import tree
dTree= tree.DecisionTreeClassifier()

dTree.fit(Xtrain, Ytrain)
scoreTrain = dTree.score(Xtrain, Ytrain)
scoreTest = dTree.score(Xtest, Ytest)

print('scoreTrain= ', scoreTrain)
print('scoreTest= ', scoreTest)

scoreTrain=  1.0
scoreTest=  0.9888579387186629


In [105]:
print() # end of testing with kaggle data




In [66]:
n = 15
nCols = 10000 + 1
realBans = np.zeros((n, nCols))

banList = open('real1.txt')
    
ban = plt.imread('realBananaTest/ban1.jpg') 
ban = ban[:100,:100]
        
grayban = rgb2gray(ban)
grayban = grayban.reshape(10000, 1)
        
for c in range(10000):
    realBans[0][c] = grayban[c][0]
realBans[0][10000] = 0

#-----               

ban = plt.imread('realBananaTest/ban2.jpg') 
ban = ban[:100,:100]

grayban = rgb2gray(ban)
grayban = grayban.reshape(10000, 1)

for c in range(10000):
    realBans[1][c] = grayban[c][0]
realBans[1][10000] = 0
        
#-----               
                         
ban = plt.imread('realBananaTest/ban3.jpg') 
ban = ban[:100,:100]
        
grayban = rgb2gray(ban)
grayban = grayban.reshape(10000, 1)
        
for c in range(10000):
    realBans[2][c] = grayban[c][0]
realBans[2][10000] = 0
        
#-----               
                         
ban = plt.imread('realBananaTest/ban4.jpg') 
ban = ban[:100,:100]
        
grayban = rgb2gray(ban)
grayban = grayban.reshape(10000, 1)
        
for c in range(10000):
    realBans[3][c] = grayban[c][0]
realBans[3][10000] = 0
        
#-----               
                         
ban = plt.imread('realBananaTest/ban5.jpg') 
ban = ban[:100,:100]
        
grayban = rgb2gray(ban)
grayban = grayban.reshape(10000, 1)
        
for c in range(10000):
    realBans[4][c] = grayban[c][0]
realBans[4][10000] = 0

#-----               
                         
ban = plt.imread('realBananaTest/ban6.jpg') 
ban = ban[:100,:100]
        
grayban = rgb2gray(ban)
grayban = grayban.reshape(10000, 1)
        
for c in range(10000):
    realBans[4][c] = grayban[c][0]
realBans[4][10000] = 0

#-----               
                         
ban = plt.imread('realBananaTest/ban7.jpg') 
ban = ban[:100,:100]
        
grayban = rgb2gray(ban)
grayban = grayban.reshape(10000, 1)
        
for c in range(10000):
    realBans[4][c] = grayban[c][0]
realBans[4][10000] = 0

#-----               
                         
ban = plt.imread('realBananaTest/ban8.jpg') 
ban = ban[:100,:100]
        
grayban = rgb2gray(ban)
grayban = grayban.reshape(10000, 1)
        
for c in range(10000):
    realBans[4][c] = grayban[c][0]
realBans[4][10000] = 0

#-----               
                         
ban = plt.imread('realBananaTest/ban9.jpg') 
ban = ban[:100,:100]
        
grayban = rgb2gray(ban)
grayban = grayban.reshape(10000, 1)
        
for c in range(10000):
    realBans[4][c] = grayban[c][0]
realBans[4][10000] = 0

#-----               
                         
ban = plt.imread('realBananaTest/ban10.jpg') 
ban = ban[:100,:100]
        
grayban = rgb2gray(ban)
grayban = grayban.reshape(10000, 1)
        
for c in range(10000):
    realBans[4][c] = grayban[c][0]
realBans[4][10000] = 0

#-----               
                         
ban = plt.imread('realBananaTest/ban11.jpg') 
ban = ban[:100,:100]
        
grayban = rgb2gray(ban)
grayban = grayban.reshape(10000, 1)
        
for c in range(10000):
    realBans[4][c] = grayban[c][0]
realBans[4][10000] = 0

#-----               
                         
ban = plt.imread('realBananaTest/ban12.jpg') 
ban = ban[:100,:100]
        
grayban = rgb2gray(ban)
grayban = grayban.reshape(10000, 1)
        
for c in range(10000):
    realBans[4][c] = grayban[c][0]
realBans[4][10000] = 0

#-----               
                         
ban = plt.imread('realBananaTest/ban13.jpg') 
ban = ban[:100,:100]
        
grayban = rgb2gray(ban)
grayban = grayban.reshape(10000, 1)
        
for c in range(10000):
    realBans[4][c] = grayban[c][0]
realBans[4][10000] = 0

#-----               
                         
ban = plt.imread('realBananaTest/ban14.jpg') 
ban = ban[:100,:100]
        
grayban = rgb2gray(ban)
grayban = grayban.reshape(10000, 1)
        
for c in range(10000):
    realBans[4][c] = grayban[c][0]
realBans[4][10000] = 0

#-----               
                         
ban = plt.imread('realBananaTest/ban15.jpg') 
ban = ban[:100,:100]
        
grayban = rgb2gray(ban)
grayban = grayban.reshape(10000, 1)
        
for c in range(10000):
    realBans[4][c] = grayban[c][0]
realBans[4][10000] = 0

In [68]:
realBans.shape

(15, 10001)

In [107]:
print() # End of preprocessing




In [73]:
#Decision Tree analysis with realBans

Xbans = realBans[:, 0:10000]
Ybans = realBans[:, 10000]
Xfruits = fruits[:, 0:10000]
Yfruits = fruits[:, 10000]

# train test split
Xtrain, Xtest, Ytrain, Ytest = train_test_split(Xfruits, Yfruits)

from sklearn import tree
dTree= tree.DecisionTreeClassifier()

dTree.fit(Xtrain, Ytrain)
scoreTrain = dTree.score(Xtrain, Ytrain)
scoreTest = dTree.score(Xbans, Ybans)

print('scoreTrain= ', scoreTrain)
print('scoreTest= ', scoreTest)

report2test = classification_report(Ytest, Yhat_test, target_names={'banana','avocado','pomegranate','lemon','strawberry','orange'})
print(report2test)

scoreTrain=  1.0
scoreTest=  0.06666666666666667
             precision    recall  f1-score   support

pomegranate       0.14      0.15      0.14       114
      lemon       0.13      0.15      0.14        98
    avocado       0.12      0.14      0.13       112
 strawberry       0.18      0.15      0.16       129
     banana       0.13      0.11      0.12       137
     orange       0.23      0.23      0.23       128

avg / total       0.16      0.16      0.16       718



In [92]:
# Gaussian analysis wth realBans

from sklearn.naive_bayes import GaussianNB 
Xfruits = fruits[:, 0:10000]
Yfruits = fruits[:, 10000]
Xbans = realBans[:, 0:10000] # - Xtest
Ybans = realBans[:, 10000] # - Ytest

Xtrain, Xtest, Ytrain, Ytest = train_test_split(Xfruits, Yfruits)

model = GaussianNB()
model.fit(Xtrain, ytrain)
y_model = model.predict(Xbans)

In [93]:
Yhat_train = model.predict(Xtrain)
Yhat_test = model.predict(Xbans)

print('Bayes train R^2 score =', model.score(Xtrain, Ytrain))
print('Bayes test  R^2 score =', model.score(Xbans, Ybans))

Bayes train R^2 score = 0.19498607242339833
Bayes test  R^2 score = 0.8


In [95]:
# Grid Search analysis with realBans

Xfruits = fruits[:, 0:10000]
Yfruits = fruits[:, 10000]
Xbans = realBans[:, 0:10000] # - Xtest
Ybans = realBans[:, 10000] # - Ytest

# train test split
Xtrain, Xtest, Ytrain, Ytest = train_test_split(Xfruits, Yfruits)


from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV


params = [ {'alpha': [0.001, 0.1, 1, 10, 100,1000, 10000, 100000, 1000000]} ]
rm = Ridge()
grid = GridSearchCV(rm, params, cv=4)  # cv: # of folds

grid.fit(Xtrain, Ytrain)

grid.best_estimator_
scores = grid.cv_results_
scores['mean_test_score'] 

grid.score(Xbans, Ybans)

0.0

In [96]:
# SVM analysis with realBans

from sklearn.metrics import classification_report

Xfruits = fruits[:, 0:10000]
Yfruits = fruits[:, 10000]
Xbans = realBans[:, 0:10000] # - Xtest
Ybans = realBans[:, 10000] # - Ytest

# train test split
from sklearn.cross_validation import train_test_split
Xtrain, Xtest, Ytrain, Ytest = train_test_split(Xfruits, Yfruits, random_state=0)

from sklearn.svm import SVC

model2 = SVC(kernel='linear', C=1, gamma=1)

# train
model2.fit(Xtrain, ytrain)

Yhat_train = model2.predict(Xtrain)
Yhat_test = model2.predict(Xtest)

# analyze
print('SVC train R^2 score =', model2.score(Xtrain, Yhat_train))
print('SVC test  R^2 score =', model2.score(Xbans, Ybans))

report2test = classification_report(Ytest, Yhat_test, target_names={'banana','avocado','pomegranate','lemon','strawberry','orange'})
print(report2test)

SVC train R^2 score = 1.0
SVC test  R^2 score = 0.06666666666666667
             precision    recall  f1-score   support

pomegranate       1.00      1.00      1.00       120
      lemon       1.00      1.00      1.00       118
    avocado       1.00      1.00      1.00       123
 strawberry       1.00      1.00      1.00       107
     banana       1.00      1.00      1.00       116
     orange       1.00      1.00      1.00       134

avg / total       1.00      1.00      1.00       718



In [97]:
# Neural Network analysis with realBans

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

Xfruits = fruits[:, 0:10000]
Yfruits = fruits[:, 10000]
Xbans = realBans[:, 0:10000] # - Xtest
Ybans = realBans[:, 10000] # - Ytest

mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2))
mlp.fit(Xtrain, Ytrain)
y_mlp = mlp.predict(Xbans)  # predict on new data
print('Accuracy Score:', accuracy_score(Ybans, y_mlp))

Accuracy Score: 0.0
