In [1]:
import numpy as np
import glob
import cv2
import os
import pandas as pd

In [2]:
# List of directories present in the dataset

print(os.listdir('dataset/train/'))

['diseased', 'healthy']


In [3]:
'''Loading x i.e. images and y i.e. labels from the dataset'''

images = []
labels = []
for dir_path in glob.glob("dataset/train/*"):
    label = dir_path.split('\\')[-1]
    print(label)
    for img_path in glob.glob(os.path.join(dir_path, '*.jpg')):
        # print(img_path)
        img = cv2.imread(img_path, cv2.IMREAD_COLOR)
        img = cv2.resize(img, (256, 256))

        images.append(img)
        labels.append(label)

images = np.array(images)
labels = np.array(labels)
print(len(images))
print(len(labels))

diseased
healthy
1709
1709


In [4]:
# LabelEncodeing Labels
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(labels)
print(y)

[0 0 0 ... 1 1 1]


In [5]:
def feature_extractor(dataset):
    img_dataset = pd.DataFrame()
    for image in range(dataset.shape[0]):
        df = pd.DataFrame()

        input_img = dataset[image, :, :, :] # image , size, size, channels
        img = input_img
        # feature 1 - pixel values
        pixel_values = img.reshape(-1)
        df['pixel_value'] = pixel_values

        # feature 2 - gabor filter
        num = 1
        kernels = []
        for theta in range(2):
            theta = theta / 4. * np.pi
            for sigma in (1, 3):
                lamda = np.pi/4
                gamma = 0.5
                gabor_label = 'Gabor' + str(num)
                ksize=9 #kernel size
                kernel = cv2.getGaborKernel((ksize, ksize), sigma, theta, lamda, gamma)
                kernels.append(kernel)
                fimg = cv2.filter2D(img, cv2.CV_8UC3, kernel)
                filtered_image = fimg.reshape(-1)
                df[gabor_label] = filtered_image
                num+=1

        img_dataset = pd.concat([img_dataset, df], ignore_index=True)
    return img_dataset

image_features = feature_extractor(images)
print(image_features.shape)

(336003072, 5)


In [6]:
image_features = np.expand_dims(image_features, axis=0)
x = np.reshape(image_features, (images.shape[0], -1))

In [None]:
# Train-Test Split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test  = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.metrics import accuracy_score

In [9]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_train, y_train)
lr_pred = lr.predict(x_test)
lr_acc = accuracy_score(y_test, lr_pred)
print("Accuracy from Logistic Regression : " + str(lr_acc))

Accuracy from Logistic Regression : 0.9093567251461988


In [10]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(x_train, y_train)
dtc_pred = dtc.predict(x_test)
print(dtc_pred)

[0 1 0 1 1 1 0 1 1 1 0 0 1 0 1 0 1 0 0 0 1 0 0 1 0 0 1 1 0 1 0 0 1 0 0 0 0
 1 1 0 1 1 1 1 0 0 0 1 0 1 0 1 0 1 1 1 1 1 0 1 1 0 0 0 1 0 1 1 0 1 1 0 1 0
 0 0 0 1 1 0 1 1 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 1 0 1 1 0 0 1 1 0 1 1 1 1 1
 1 1 1 0 1 0 1 1 1 1 1 0 0 0 0 0 1 1 0 1 1 1 0 1 0 0 1 0 1 1 0 0 1 1 1 0 0
 1 0 1 1 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 1 0 1 1 1 1 0 1 0 1 0 1 0 1 0 0 1 0
 1 1 1 0 1 1 0 1 1 1 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 0 0 1 0 1 1 0 1 0 1 1 1
 0 0 0 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 1 0 0 1 0 0 1 1
 1 0 0 0 1 1 0 1 1 1 1 1 0 1 1 0 1 1 0 1 1 1 0 1 0 1 1 0 0 1 0 0 0 0 1 1 0
 1 1 1 0 1 1 1 0 0 1 1 1 1 0 1 0 0 1 0 0 1 0 1 1 0 1 1 1 1 0 0 0 0 0 1 1 1
 1 0 0 1 0 1 0 0 0]


In [11]:
from sklearn.metrics import accuracy_score
dtc_acc = accuracy_score(y_test, dtc_pred)
print("Accuracy from Decision Tree Classifier : " + str(dtc_acc))

Accuracy from Decision Tree Classifier : 0.7222222222222222


In [12]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=50)
rfc.fit(x_train, y_train)
rfc_pred = rfc.predict(x_test)
rfc_acc = accuracy_score(y_test, rfc_pred)
print("Accuracy from Random Forest Classifier : " + str(rfc_acc))

Accuracy from Random Forest Classifier : 0.868421052631579


In [13]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(x_train, y_train)
gnb_pred = gnb.predict(x_test)
gnb_acc = accuracy_score(y_test, gnb_pred)
print("Accuracy from Gaussian Naive Bayes : " + str(gnb_acc))

Accuracy from Gaussian Naive Bayes : 0.7134502923976608


In [14]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(x_train, y_train)
knn_pred = knn.predict(x_test)
knn_acc = accuracy_score(y_test, knn_pred)
print("Accuracy from K Nearest Neighbours Classifier : " + str(knn_acc))

Accuracy from K Nearest Neighbours Classifier : 0.7105263157894737


In [16]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(x_train, y_train)
svc_pred = svc.predict(x_test)
svc_acc = accuracy_score(y_test, svc_pred)
print("Accuracy from Support Vector Classifier : " + str(svc_acc))

Accuracy from Support Vector Classifier : 0.8918128654970761


In [10]:
from sklearn.linear_model import Perceptron
p = Perceptron()
p.fit(x_train, y_train)
p_pred= p.predict(x_test)
p_acc = accuracy_score(y_test, p_pred)
print("Accuracy from Perceptron : " + str(p_acc))

Accuracy from Perceptron : 0.9035087719298246


In [9]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
logReg = LogisticRegression()
kf = KFold(n_splits=5)
score = cross_val_score(logReg, x, y, cv=kf)
print(score)
print(score.mean())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[0.79239766 0.81871345 0.92397661 0.9122807  0.88856305]
0.8671862941812009
