In [29]:
import pylab
# The star imports stay ahead of the explicit imports so that the explicit
# names below win if nt_toolbox exports a name such as np, plt or os.
from nt_toolbox.graph import *
from nt_toolbox.signal import *
import matplotlib.pyplot as plt
import numpy
import numpy as np
import pandas as pd
import os
from PIL import Image

In [30]:
def binary(image, alpha=0.5):
    '''
    image : array of intensities
    alpha : threshold (default 0.5)

    return : float array with the shape of image, holding 1 where
             image > alpha and 0 everywhere else
    '''
    # Strict comparison: entries equal to alpha map to 0.
    return np.where(image > alpha, 1.0, 0.0)

In [31]:
def has_points_in_images(img):
    '''
    img : image (array of intensities)

    return : True or False
             True : thresholding img*img at its mean and at the midpoint
                    between its minimum and its mean selects different
                    entries, i.e. some squared intensity lies between the
                    two thresholds (read by the author as "there are
                    points inside the cell")
             False : both thresholds select exactly the same entries
                     ("there aren't points inside the cell")
    '''
    squared = img*img
    min_value = np.min(squared)
    mean_value = np.mean(squared)
    low_threshold = (min_value + mean_value)/2

    # Comparing the two boolean masks directly gives the same answer as
    # summing the difference of two 0/1 float images, without allocating them.
    above_mean = squared > mean_value
    above_low = squared > low_threshold
    return np.any(above_mean != above_low)

In [32]:
def black_pixels(img):
    '''
    img : image (array of intensities)

    return : the number of entries of img whose value is <= 0.1, the
             "black" values around the cell.  Every array entry is
             counted, so an image with several channels contributes one
             count per channel value rather than one per pixel.
    '''
    # One vectorised comparison replaces the per-row Python loop.
    return np.count_nonzero(np.asarray(img) <= 0.1)

In [33]:
def nb_pixels_interval(image, interval):
    '''
    image : image
    interval : (value1, value2)

    return : how many values v of image satisfy value1 <= v < value2
    '''
    lower_bound, upper_bound = interval[0], interval[1]
    # Lower bound is inclusive, upper bound is exclusive.
    in_range = (image >= lower_bound) & (image < upper_bound)
    return np.sum(in_range)

In [48]:
def remove_black_pixels(image):
    '''
    image : array of intensities

    return : 1-D array holding the entries of image that are not equal
             to 0, in flattened (row-major) order
    '''
    # Boolean indexing already yields a flattened copy of the kept entries.
    return image[image != 0]
    
def extract(images_paths, label):
    '''
    images_paths : iterable of image file paths
    label : value written in the 'label' column of every row

    return : DataFrame with one row per path and, in this order, the columns
             nb_black_pixels : black_pixels() of the loaded image
             var_color, mean_color, min_color, max_color :
                 statistics of the non-zero entries of image[:, 0]
             var_gray, mean_gray, min_gray, max_gray :
                 statistics of the non-zero entries of the image converted
                 to mode 'L', each divided by 255
             label : the label argument
             nb_pixel_(a, b) : nb_pixels_interval() of the loaded image for
                 ten consecutive intervals of width 0.1

    raises : ValueError when an image leaves no non-zero value to compute
             the statistics on
    '''
    nb_intervals = 10
    # The bounds are shifted by epsilon: exact zeros fall below the first
    # interval and 1.0 falls inside the last one.
    epsilon = 1e-5
    invals = [(0.1*i+epsilon, 0.1*(i+1)+epsilon) for i in range(nb_intervals)]
    interval_columns = ['nb_pixel_%s' % str((round(low, 1), round(high, 1)))
                        for low, high in invals]

    stat_columns = ['nb_black_pixels',
                    'var_color', 'mean_color', 'min_color', 'max_color',
                    'var_gray', 'mean_gray', 'min_gray', 'max_gray',
                    'label']
    # One list per output column, created in the final column order.
    data = {name: [] for name in stat_columns + interval_columns}

    for ip in images_paths:
        image = load_image(ip)
        # Close the file handle as soon as the grey copy has been made.
        with Image.open(ip) as pil_image:
            image_grey = np.asarray(pil_image.convert('L'))

        grey_values = remove_black_pixels(image_grey)
        # NOTE(review): image[:, 0] keeps only index 0 along the second axis
        # of the loaded image, so the "color" statistics come from that slice
        # alone and not from the whole image -- confirm this is intended.
        color_values = remove_black_pixels(image[:, 0])
        if grey_values.size == 0 or color_values.size == 0:
            # min()/max() of an empty array would fail with an error that
            # does not say which file is responsible.
            raise ValueError("no non-black value to compute statistics on in %r" % (ip,))

        data['nb_black_pixels'].append(black_pixels(image))

        # Color
        data['var_color'].append(color_values.var())
        data['mean_color'].append(color_values.mean())
        data['min_color'].append(color_values.min())
        data['max_color'].append(color_values.max())

        # Grey
        # NOTE(review): a variance scales with the square of the values, so
        # dividing var() by 255 (instead of 255**2) does not put var_gray on
        # the same scale as the other */255 columns -- kept unchanged so that
        # previously exported features stay comparable; confirm.
        data['var_gray'].append(grey_values.var()/255)
        data['mean_gray'].append(grey_values.mean()/255)
        data['min_gray'].append(grey_values.min()/255)
        data['max_gray'].append(grey_values.max()/255)

        data['label'].append(label)

        # Nb pixels
        for column, interval in zip(interval_columns, invals):
            data[column].append(nb_pixels_interval(image, interval))

    return pd.DataFrame(data, columns=list(data))

## Test: build the feature table for both classes and export the train / validation / test splits

In [49]:
def _extract_directory(directory, label):
    '''
    directory : folder whose files are the images of one class
    label : label given to every image found under directory

    return : the DataFrame built by extract() for all the files found
             under directory (files ending in '.db' are skipped)

    raises : FileNotFoundError when directory does not exist
    '''
    if not os.path.isdir(directory):
        # os.walk() silently yields nothing for a missing folder.
        raise FileNotFoundError("image directory not found: %r" % (directory,))

    paths = []
    for dirname, _, filenames in os.walk(directory):
        for filename in filenames:
            if filename.endswith('.db'):
                # Not an image (e.g. a thumbnail cache file).
                continue
            paths.append(os.path.join(dirname, filename))
            if len(paths) % 1000 == 0:
                print(len(paths))
    return extract(paths, label)


# Seed of the shuffle, so that the exported splits are reproducible.
SPLIT_SEED = 0

df1 = _extract_directory('./cell_images/Uninfected', 0)
df2 = _extract_directory('./cell_images/Parasitized', 1)

# DataFrame.append() no longer exists in pandas 2.x; concat() is the replacement.
df = pd.concat([df1, df2])
print(df.head(10))

# shuffle data
df = df.sample(frac=1, random_state=SPLIT_SEED)

# train = 60% / test = 20% / validation = 20%
train_end = int(.6*len(df))
validate_end = int(.8*len(df))
train = df.iloc[:train_end]
validate = df.iloc[train_end:validate_end]
test = df.iloc[validate_end:]

features = [
    "nb_black_pixels",
    "var_color", "mean_color", "min_color", "max_color",
    "var_gray", "mean_gray", "min_gray", "max_gray",
    "nb_pixel_(0.0, 0.1)", "nb_pixel_(0.1, 0.2)", "nb_pixel_(0.2, 0.3)",
    "nb_pixel_(0.3, 0.4)", "nb_pixel_(0.4, 0.5)", "nb_pixel_(0.5, 0.6)",
    "nb_pixel_(0.6, 0.7)", "nb_pixel_(0.7, 0.8)", "nb_pixel_(0.8, 0.9)",
    "nb_pixel_(0.9, 1.0)",
]

# Each split is written as a pair of space-separated files:
# the feature columns (.data) and the label column (.solution).
for split_name, split_df in (("train", train), ("test", test), ("valid", validate)):
    split_df.to_csv("malaria_%s.data" % split_name, sep=' ', columns=features, index=False)
    split_df.to_csv("malaria_%s.solution" % split_name, sep=' ', columns=["label"], index=False)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
       nb_black_pixels  var_color  mean_color  min_color  max_color  var_gray  \
0                 7308   0.000017    0.875000   0.870242   0.880623  0.325256   
1                 4734   0.000094    0.947941   0.929160   0.963756  0.129539   
2                 4194   0.000114    0.962820   0.950739   0.990148  0.128522   
3                 4140   0.000550    0.952568   0.921676   0.998179  0.087990   
4                 4473   0.000112    0.968503   0.953376   1.000000  0.190434   
5                 4371   0.000153    0.947769   0.929648   0.976549  0.208406   
6                 5046   0.000043    0.975932   0.965580   0.985507  0.263097   
7                 3753   0.000047    0.968035   0.957516   0.980392  0.254291   
8                 3264   0.000028    0.934305   0.926421   0.941472  0.090673   
9                 7494   0.000000    0.979058   0.97

In [None]:
# Preview only the first rows of the feature table rather than the whole frame.
print(df.head())