In [1]:
import pylab
from nt_toolbox.graph import *
from nt_toolbox.signal import *
import matplotlib.pyplot as plt
import numpy
import pandas as pd
import os
from PIL import Image

In [2]:
def binary(image, alpha=0.5):
    image_copy = np.zeros(image.shape)
    image_copy[image > alpha] = 1
    return image_copy

In [3]:
def has_points_in_images(img):
    '''
    img : image 
    
    return : True or False
             True : there are points inside the cell 
             False : there aren't points inside the cell
    '''
    img = img*img
    max_value = np.max(img)
    min_value = np.min(img)
    mean_value = np.mean(img)
    b_img1 = binary(img, mean_value)
    b_img2 = binary(img, (min_value + mean_value)/2)
    return abs(np.sum(b_img1-b_img2)) != 0

In [4]:
def black_pixels(img):
    '''
    img : image 
    
    return : The number of pixels of "black pixels", the pixels around the cell
    '''
    nb_pixb = 0
    for i in range(len(img)):
        pix_per_line = np.sum(sum(p >= 0.9 for p in img[i:i+1]))
        nb_pixb = nb_pixb + pix_per_line
    return nb_pixb

In [5]:
def extract(images_paths, label):
    p_detected = []
    pix_black = []
    var_color = []
    max_color = []
    var_gray = []
    max_gray = []
    labels = []
    for ip in images_paths:
        image = load_image(ip)
        #print(image[15])
        image_grey = np.asarray(Image.open(ip).convert('L'))
        p_detected.append(1 if has_points_in_images(image) else 0)
        pix_black.append(black_pixels(image))
        var_color.append(np.array(image)[:,0].var())
        max_color.append(np.array(image)[:,0].max())
        var_gray.append(image_grey.var())
        max_gray.append(image_grey.max())
        labels.append(label)
        
    df = pd.DataFrame()
    #df['image_path'] = images_paths
    df['has_points'] = p_detected
    df['black_pixels'] = pix_black
    df['var_color'] = var_color
    df['max_color'] = max_color
    df['var_gray'] = var_gray
    df['max_gray'] = max_gray
    df['label'] = labels
    
    return df

## Test

In [6]:
try:
    for dirname, _, filenames in os.walk('../cell_images/Uninfected'):
        paths = []
        i=0
        for filename in filenames[:]:
            i+=1
            if i%1000==0:
                print (i)
            image_path = os.path.join(dirname, filename)
            paths.append(image_path)
        df1 = extract(paths, 0)
except Exception:
    pass
    '''
    for filename in filenames[:100]:
        image_path = os.path.join(dirname, filename)
        if image_path.endswith('.db'):
            continue
        image = load_image(image_path)
        p_labels.append(True)
        if not has_points_in_images():
            p_detected.append(True)
        else:
            imageplot(image)
            p_detected.append(False)
            print(image_path)    

    '''
try:
    for dirname, _, filenames in os.walk('../cell_images/Parasitized'):
        paths = []
        i=0
        for filename in filenames[:]:
            i+=1
            if i%1000==0:
                print (i)
            image_path = os.path.join(dirname, filename)
            paths.append(image_path)
        df2 = extract(paths, 1)
except Exception:
    pass

df = df1.append(df2)

# shuffle data
df = df.sample(frac=1)

# train = 60% / test = 20% / validation = 20%
train, validate, test = np.split(df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))])

features = ["has_points", "black_pixels", "var_color", "max_color", "var_gray", "max_gray"]

train.to_csv("malaria_train.data", sep=' ', columns = features, index=False)
train.to_csv("malaria_train.solution", sep=' ', columns = ["label"], index=False)

test.to_csv("malaria_test.data", sep=' ', columns = features, index=False)
test.to_csv("malaria_test.solution", sep=' ', columns = ["label"], index=False)

validate.to_csv("malaria_valid.data", sep=' ', columns = features, index=False)
validate.to_csv("malaria_valid.solution", sep=' ', columns = ["label"], index=False)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000


NameError: name 'df1' is not defined

In [None]:
df