In [23]:
import pylab
from nt_toolbox.graph import *
from nt_toolbox.signal import *
import matplotlib.pyplot as plt
import numpy
import pandas as pd
import os
from PIL import Image

In [24]:
def binary(image, alpha=0.5):
    image_copy = np.zeros(image.shape)
    image_copy[image > alpha] = 1
    return image_copy

In [25]:
def has_points_in_images(img):
    '''
    img : image 
    
    return : True or False
             True : there are points inside the cell 
             False : there aren't points inside the cell
    '''
    img = img*img
    max_value = np.max(img)
    min_value = np.min(img)
    mean_value = np.mean(img)
    b_img1 = binary(img, mean_value)
    b_img2 = binary(img, (min_value + mean_value)/2)
    return abs(np.sum(b_img1-b_img2)) != 0

In [26]:
def black_pixels(img):
    '''
    img : image 
    
    return : The number of pixels of "black pixels", the pixels around the cell
    '''
    nb_pixb = 0
    for i in range(len(img)):
        pix_per_line = np.sum(sum(p >= 0.9 for p in img[i:i+1]))
        nb_pixb = nb_pixb + pix_per_line
    return nb_pixb

In [27]:
def nb_pixels_interval(image, interval):
    '''
    image : image
    interval : (value1, value2)
    
    test number of pixel whose value v is value1 <= v < value2
    '''
    seuil_left = interval[0]
    seuil_right = interval[1]
    return np.sum([image[image >= seuil_left] < seuil_right])

In [63]:
def extract(images_paths, label):
    nb_intervals = 10
    epsilon = 1e-5
    invals = [(0.1*i+epsilon, 0.1*(i+1)+epsilon)for i in range(nb_intervals)]
    p_detected = []
    pix_black = []
    var_color = []
    max_color = []
    var_gray = []
    max_gray = []
    labels = []
    
    nb_pixel_in_intervals=[[]for i in range(nb_intervals)]
    
    for ip in images_paths:
        image = load_image(ip)
        #print(image[15])
        image_grey = np.asarray(Image.open(ip).convert('L'))
        p_detected.append(1 if has_points_in_images(image) else 0)
        pix_black.append(black_pixels(image))
        var_color.append(np.array(image)[:,0].var())
        max_color.append(np.array(image)[:,0].max())
        var_gray.append(image_grey.var())
        max_gray.append(image_grey.max())
        labels.append(label)
        
        for i in range(nb_intervals):
            nb_pixel_in_intervals[i].append(nb_pixels_interval(image, invals[i]))
        
    df = pd.DataFrame()
    #df['image_path'] = images_paths
    #df['has_points'] = p_detected
    df['black_pixels'] = pix_black
    df['var_color'] = var_color
    df['max_color'] = max_color
    df['var_gray'] = var_gray
    df['max_gray'] = max_gray
    df['label'] = labels
    for i in range(nb_intervals):
        column = str((round(invals[i][0],1), round(invals[i][1],1)))
        df['nb_pixel_%s'%(column)] = nb_pixel_in_intervals[i]
    
    print(df)
    return df

## Test

In [65]:
try:
    for dirname, _, filenames in os.walk('../cell_images/Uninfected'):
        paths = []
        i=0
        for filename in filenames[:10]:
            i+=1
            if i%1000==0:
                print (i)
            image_path = os.path.join(dirname, filename)
            paths.append(image_path)
        df1 = extract(paths, 0)
except Exception:
    pass
    '''
    for filename in filenames[:100]:
        image_path = os.path.join(dirname, filename)
        if image_path.endswith('.db'):
            continue
        image = load_image(image_path)
        p_labels.append(True)
        if not has_points_in_images():
            p_detected.append(True)
        else:
            imageplot(image)
            p_detected.append(False)
            print(image_path)    

    '''
try:
    for dirname, _, filenames in os.walk('../cell_images/Parasitized'):
        paths = []
        i=0
        for filename in filenames[:10]:
            i+=1
            if i%1000==0:
                print (i)
            image_path = os.path.join(dirname, filename)
            paths.append(image_path)
        df2 = extract(paths, 1)
        
except Exception:
    pass

df = df1.append(df2)

# shuffle data
df = df.sample(frac=1)

# train = 60% / test = 20% / validation = 20%
train, validate, test = np.split(df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))])

features = ["has_points", "black_pixels", "var_color", "max_color", "var_gray", "max_gray"]

train.to_csv("malaria_train.data", sep=' ', columns = features, index=False)
train.to_csv("malaria_train.solution", sep=' ', columns = ["label"], index=False)

test.to_csv("malaria_test.data", sep=' ', columns = features, index=False)
test.to_csv("malaria_test.solution", sep=' ', columns = ["label"], index=False)

validate.to_csv("malaria_valid.data", sep=' ', columns = features, index=False)
validate.to_csv("malaria_valid.solution", sep=' ', columns = ["label"], index=False)

   black_pixels  var_color  max_color     var_gray  max_gray  label  \
0         14636   0.081997   0.991182  6687.291868       190      0   
1         12099   0.046926   0.989437  6184.705342       191      0   
2         14692   0.111829   0.991320  6409.304156       193      0   
3          8121   0.135396   0.989399  6216.876228       190      0   
4         10856   0.045640   0.984348  7029.260820       193      0   
5          6795   0.077428   0.883186  5519.140945       189      0   
6         11848   0.006648   0.933216  7047.477177       190      0   
7          9396   0.048130   0.933913  5674.270851       193      0   
8         15673   0.038187   0.994624  6659.647369       187      0   
9         15628   0.102757   0.989474  5507.906932       191      0   

   nb_pixel_(0.0, 0.1)  nb_pixel_(0.1, 0.2)  nb_pixel_(0.2, 0.3)  \
0                    0                    0                    0   
1                    0                    0                    0   
2             

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [66]:
df

Unnamed: 0,black_pixels,var_color,max_color,var_gray,max_gray,label,"nb_pixel_(0.0, 0.1)","nb_pixel_(0.1, 0.2)","nb_pixel_(0.2, 0.3)","nb_pixel_(0.3, 0.4)","nb_pixel_(0.4, 0.5)","nb_pixel_(0.5, 0.6)","nb_pixel_(0.6, 0.7)","nb_pixel_(0.7, 0.8)","nb_pixel_(0.8, 0.9)","nb_pixel_(0.9, 1.0)"
5,6795,0.077428,0.883186,5519.140945,189,0,0,0,0,0,0,0,0,0,3595,6795
1,7847,0.020631,0.946367,6771.751387,194,1,0,0,0,0,7,50,65,133,11747,7847
4,10856,0.04564,0.984348,7029.26082,193,0,0,0,0,0,0,0,0,0,734,10856
8,15673,0.038187,0.994624,6659.647369,187,0,0,0,0,0,0,0,0,0,792,15673
6,11848,0.006648,0.933216,7047.477177,190,0,0,0,0,0,0,0,0,0,1278,11848
6,11394,0.128842,0.977153,5855.360476,191,1,0,0,0,0,0,16,60,42,2610,11394
8,12625,0.059213,0.977312,6435.019525,190,1,0,0,0,0,1,15,39,56,258,12625
0,13881,0.087862,0.967185,5877.587663,194,1,0,0,0,1,12,37,40,94,1668,13881
7,6591,0.081668,0.963542,6041.258535,193,1,0,0,0,0,4,25,57,58,6961,6591
3,9144,0.039102,0.952462,7223.587356,197,1,0,0,0,0,1,47,152,356,4638,9144
