In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import os

In [2]:
dataframe = pd.read_csv('color_dataset.csv')
dataframe.head()

Unnamed: 0.1,Unnamed: 0,img_ids,0,1,2,3,4,5,6,7,...,246,247,248,249,250,251,252,253,254,label
0,0,0_Screen Shot 2018-06-08 at 4.59.36 PM.png,0.448382,0.065304,0.082974,0.077598,0.06575,0.048668,0.03534,0.025113,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,1,0_Screen Shot 2018-06-08 at 4.59.49 PM.png,0.533665,0.047397,0.035155,0.040051,0.042296,0.027197,0.021097,0.021464,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,2,0_Screen Shot 2018-06-08 at 4.59.57 PM.png,0.830653,0.023889,0.025203,0.017201,0.006146,0.002049,0.002087,0.002165,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,3,0_Screen Shot 2018-06-08 at 5.00.03 PM.png,0.640398,0.056542,0.048911,0.066208,0.065263,0.012985,0.009908,0.008188,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,4,0_Screen Shot 2018-06-08 at 5.00.12 PM.png,0.479378,0.035893,0.043894,0.043929,0.0616,0.057195,0.027479,0.01922,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [3]:
os.listdir(
    'segmented_dataset'
)

['fresh_apple', 'stale_apple']

In [4]:
clean_data = dataframe.drop(columns=['Unnamed: 0', 'img_ids'])
clean_data = clean_data.replace({
    'fresh_apple': 0,
    'stale_apple': 1,
})
clean_data.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,label
0,0.448382,0.065304,0.082974,0.077598,0.06575,0.048668,0.03534,0.025113,0.018911,0.014521,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.533665,0.047397,0.035155,0.040051,0.042296,0.027197,0.021097,0.021464,0.019995,0.022035,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.830653,0.023889,0.025203,0.017201,0.006146,0.002049,0.002087,0.002165,0.001855,0.002319,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.640398,0.056542,0.048911,0.066208,0.065263,0.012985,0.009908,0.008188,0.009133,0.007146,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.479378,0.035893,0.043894,0.043929,0.0616,0.057195,0.027479,0.01922,0.015779,0.013301,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [5]:
len(clean_data.columns)

256

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


x = clean_data.iloc[:,:-1]
y = clean_data.iloc[:,-1:]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

scaler = MinMaxScaler()
x_train_scalled = scaler.fit_transform(x_train)
x_test_scalled = scaler.transform(x_test)

In [7]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

names = ["Nearest Neighbors", 
         "Linear SVM", 
         "RBF SVM", 
         "Decision Tree", 
         "Random Forest", 
         "Neural Network", 
         "Naive Bayes"]

classifiers = [
    KNeighborsClassifier(5),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    MLPClassifier(max_iter=5000),
    GaussianNB(),
    ]

In [8]:
for name, model in zip(names, classifiers):
  model.fit(x_train_scalled, y_train.values.ravel())
  y_pred_model = model.predict(x_test_scalled)
  print(f'## MODEL NAME : {name}')
  print(confusion_matrix(y_test, y_pred_model))
  print(classification_report(y_test, y_pred_model))
  print('\n')

## MODEL NAME : Nearest Neighbors
[[85 10]
 [20 96]]
              precision    recall  f1-score   support

           0       0.81      0.89      0.85        95
           1       0.91      0.83      0.86       116

    accuracy                           0.86       211
   macro avg       0.86      0.86      0.86       211
weighted avg       0.86      0.86      0.86       211



## MODEL NAME : Linear SVM
[[ 78  17]
 [ 10 106]]
              precision    recall  f1-score   support

           0       0.89      0.82      0.85        95
           1       0.86      0.91      0.89       116

    accuracy                           0.87       211
   macro avg       0.87      0.87      0.87       211
weighted avg       0.87      0.87      0.87       211



## MODEL NAME : RBF SVM
[[ 90   5]
 [ 11 105]]
              precision    recall  f1-score   support

           0       0.89      0.95      0.92        95
           1       0.95      0.91      0.93       116

    accuracy                

In [9]:
correlation = {}
label = clean_data['label']
for col in clean_data.columns:
  correlation[col] = label.corr(clean_data[col])
sorted_correlation = sorted(correlation.items(), key=lambda x: x[1])
print(sorted_correlation)
len(sorted_correlation)

[('2', -0.3809667238777945), ('3', -0.3635436535459522), ('1', -0.36057726169086474), ('90', -0.34653594794286424), ('0', -0.3233586959756294), ('72', -0.3103396190253362), ('69', -0.30910530952442183), ('68', -0.30845452899271375), ('80', -0.3009834361730649), ('67', -0.2961518083421564), ('79', -0.2925875086273852), ('70', -0.2925524483261366), ('75', -0.2899672940762185), ('73', -0.2877944766864095), ('88', -0.28296515596956195), ('77', -0.2823810543545445), ('78', -0.2809214458910905), ('4', -0.2807205045796277), ('83', -0.2788332274893275), ('76', -0.2781731638483435), ('74', -0.2760503321350087), ('81', -0.2733160823561808), ('82', -0.267874792891348), ('89', -0.26761088780150427), ('66', -0.2639505394061223), ('87', -0.2574730535681402), ('71', -0.25710455172424473), ('86', -0.2531177287915807), ('84', -0.245791418690738), ('65', -0.24083707198244542), ('91', -0.2278032055073932), ('85', -0.22570105724346998), ('120', -0.21847615322498312), ('93', -0.21803110720552707), ('92', -

256

In [10]:
positive_corr = {}
negative_corr = {}
for (k,v) in sorted_correlation:
  if(v>0):
    positive_corr[k] = v
  else:
    negative_corr[k] = v
len(positive_corr)

129

In [11]:
from sklearn.decomposition import PCA

pca = PCA(n_components = len(positive_corr))
x_train_pca = pca.fit_transform(x_train_scalled)
x_test_pca = pca.transform(x_test_scalled)

In [12]:
for name, model in zip(names, classifiers):
  model.fit(x_train_pca, y_train.values.ravel())
  y_pred_model = model.predict(x_test_pca)
  try:
    print(f'## MODEL NAME : {name}')
    print(confusion_matrix(y_test, y_pred_model))
    print(classification_report(y_test, y_pred_model))
    print('\n')
  except:
    print("Hello!!!!")

## MODEL NAME : Nearest Neighbors
[[85 10]
 [19 97]]
              precision    recall  f1-score   support

           0       0.82      0.89      0.85        95
           1       0.91      0.84      0.87       116

    accuracy                           0.86       211
   macro avg       0.86      0.87      0.86       211
weighted avg       0.87      0.86      0.86       211



## MODEL NAME : Linear SVM
[[ 78  17]
 [ 10 106]]
              precision    recall  f1-score   support

           0       0.89      0.82      0.85        95
           1       0.86      0.91      0.89       116

    accuracy                           0.87       211
   macro avg       0.87      0.87      0.87       211
weighted avg       0.87      0.87      0.87       211



## MODEL NAME : RBF SVM
[[ 90   5]
 [ 11 105]]
              precision    recall  f1-score   support

           0       0.89      0.95      0.92        95
           1       0.95      0.91      0.93       116

    accuracy                