In [1]:
import os
import mahotas as mh
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import pickle
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # 10 x 8 inches
plt.gray()

<matplotlib.figure.Figure at 0x7f46f1d88d90>

###Build training set

In [84]:
def create_cluster_script_build_training_set(species_names, sizes):

    new_script_file = '/cellar/users/ramarty/Projects/ants/scripts/cluster/build_training.binary.sh'

    with open(new_script_file, 'w') as out_file:
        out_file.write("#! /bin/csh\n")
        out_file.write("#$ -V\n")
        out_file.write("#$ -S /bin/csh\n")
        out_file.write("#$ -o /cellar/users/ramarty/Data/ants/sge-system_files\n")
        out_file.write("#$ -e /cellar/users/ramarty/Data/ants/sge-system_files\n")
        out_file.write("#$ -cwd\n")
        out_file.write("#$ -t 1-{0}\n".format(len(species_names)))
        out_file.write("#$ -l h_vmem=1G\n")
        out_file.write("#$ -tc 4\n")
        out_file.write("#$ -l long")
        out_file.write("\n")

        out_file.write("set species_names=({0})\n".format(" ".join(species_names)))
        out_file.write("set sizes=({0})\n".format(" ".join(sizes)))
        out_file.write("\n")

        out_file.write("set species_name=$species_names[$SGE_TASK_ID]\n")
        out_file.write("set size=$sizes[$SGE_TASK_ID]\n")
        out_file.write("\n")

        out_file.write("date\n")
        out_file.write("hostname\n")
        out_file.write("python /cellar/users/ramarty/Projects/ants/scripts/python/version4.0/build_training.py $species_name $size\n")
        
        out_file.write("date\n")

In [85]:
species_names = ['cfellah', 'leptothorax']
sizes = ['50', '50']

In [86]:
create_cluster_script_build_training_set(species_names, sizes)

###Train classifier

In [22]:
def create_cluster_script_train_classifier(species_names, sizes, batch_sizes, learning_rates):

    new_script_file = '/cellar/users/ramarty/Projects/ants/scripts/cluster/version4.0/train_classifier.sh'

    with open(new_script_file, 'w') as out_file:
        out_file.write("#! /bin/csh\n")
        out_file.write("#$ -V\n")
        out_file.write("#$ -S /bin/csh\n")
        out_file.write("#$ -o /cellar/users/ramarty/Data/ants/sge-system_files\n")
        out_file.write("#$ -e /cellar/users/ramarty/Data/ants/sge-system_files\n")
        out_file.write("#$ -cwd\n")
        out_file.write("#$ -t 1-{0}\n".format(len(species_names)))
        out_file.write("#$ -l h_vmem=40G\n")
        out_file.write("#$ -tc 50\n")
        out_file.write("#$ -l long")
        out_file.write("\n")

        out_file.write("set species_names=({0})\n".format(" ".join(species_names)))
        out_file.write("set sizes=({0})\n".format(" ".join(sizes)))
        out_file.write("set batch_sizes=({0})\n".format(" ".join(batch_sizes)))
        out_file.write("set learning_rates=({0})\n".format(" ".join(learning_rates)))
        out_file.write("\n")

        out_file.write("set species_name=$species_names[$SGE_TASK_ID]\n")
        out_file.write("set size=$sizes[$SGE_TASK_ID]\n")
        out_file.write("set batch_size=$batch_sizes[$SGE_TASK_ID]\n")
        out_file.write("set learning_rate=$learning_rates[$SGE_TASK_ID]\n")
        out_file.write("\n")

        out_file.write("date\n")
        out_file.write("hostname\n")
        out_file.write("python /cellar/users/ramarty/Projects/ants/scripts/python/version4.0/train_classifier.py $species_name $size $batch_size $learning_rate\n")
        
        out_file.write("date\n")

In [23]:
species_names = (['cfellah']*6 + ['leptothorax']*6)*4
sizes = ['50']*48
batch_sizes = ['10']*12 + ['50']*12 + ['100']*12 + ['1000']*12
learning_rates = ['0.001', '0.01', '0.1', '1', '10', '100']*8

In [24]:
len(species_names), len(sizes), len(batch_sizes), len(learning_rates)

(48, 48, 48, 48)

In [25]:
create_cluster_script_train_classifier(species_names, sizes, batch_sizes, learning_rates)

###Run the classifiers on several photos

In [8]:
def create_cluster_script_run_classifier(test_photos, species_names, image_sizes, steps, recombinations):

    new_script_file = '/cellar/users/ramarty/Projects/ants/scripts/cluster/version4.0/run_classifier.sh'

    with open(new_script_file, 'w') as out_file:
        out_file.write("#! /bin/csh\n")
        out_file.write("#$ -V\n")
        out_file.write("#$ -S /bin/csh\n")
        out_file.write("#$ -o /cellar/users/ramarty/Data/ants/sge-system_files\n")
        out_file.write("#$ -e /cellar/users/ramarty/Data/ants/sge-system_files\n")
        out_file.write("#$ -cwd\n")
        out_file.write("#$ -t 1-{0}\n".format(len(species_names)))
        out_file.write("#$ -l h_vmem=10G\n")
        out_file.write("#$ -tc 40\n")
        out_file.write("#$ -l long")
        out_file.write("\n")

        out_file.write("set test_photos=({0})\n".format(" ".join(test_photos)))
        out_file.write("set species_names=({0})\n".format(" ".join(species_names)))
        out_file.write("set sizes=({0})\n".format(" ".join(image_sizes)))
        out_file.write("set steps=({0})\n".format(" ".join(steps)))
        out_file.write("set recombinations=({0})\n".format(" ".join(recombinations)))
        out_file.write("\n")

        out_file.write("set test_photo=$test_photos[$SGE_TASK_ID]\n")
        out_file.write("set species_name=$species_names[$SGE_TASK_ID]\n")
        out_file.write("set size=$sizes[$SGE_TASK_ID]\n")
        out_file.write("set step=$steps[$SGE_TASK_ID]\n")
        out_file.write("set recombination=$recombinations[$SGE_TASK_ID]\n")
        out_file.write("\n")

        out_file.write("date\n")
        out_file.write("hostname\n")
        out_file.write("python /cellar/users/ramarty/Projects/ants/scripts/python/version4.0/run_classifier.py $test_photo $species_name $size $step $recombination\n")
        
        out_file.write("date\n")

In [9]:
# test_photo, species, image_size, step, recombination
# cfellah
cfellah_test_files = list(set([x.split('.')[0] for x in os.listdir('/cellar/users/ramarty/Data/ants/photos/') if ('2017' in x) or ('2016' in x)]))[-5:]
# leptothorax
leptothorax_test_files = list(set([x.split('.')[0] for x in os.listdir('/cellar/users/ramarty/Data/ants/photos/') if ('2014' in x) or ('box101' in x)]))[-5:]

In [10]:
test_photos = cfellah_test_files*2 + leptothorax_test_files*2
species = ['cfellah' for x in range(10)] + ['leptothorax' for x in range(10)]
image_sizes = ['50' for x in range(10)] + ['50' for x in range(10)]
step = ['20' for x in range(20)]
recombination = ['median' for x in range(5)] + ['mean' for x in range(5)] + ['median' for x in range(5)] + ['mean' for x in range(5)]

In [11]:
len(test_photos), len(species), len(image_sizes), len(step), len(recombination)

(20, 20, 20, 20, 20)

In [12]:
create_cluster_script_run_classifier(test_photos, species, image_sizes, step, recombination)

###Assess the accuracy of the classifier for the photos

In [48]:
def create_cluster_script_assess_classifier(test_photos, species_names, image_sizes, steps, recombinations):

    new_script_file = '/cellar/users/ramarty/Projects/ants/scripts/cluster/assess_classifier.binary.sh'

    with open(new_script_file, 'w') as out_file:
        out_file.write("#! /bin/csh\n")
        out_file.write("#$ -V\n")
        out_file.write("#$ -S /bin/csh\n")
        out_file.write("#$ -o /cellar/users/ramarty/Data/ants/sge-system_files\n")
        out_file.write("#$ -e /cellar/users/ramarty/Data/ants/sge-system_files\n")
        out_file.write("#$ -cwd\n")
        out_file.write("#$ -t 1-{0}\n".format(len(species_names)))
        out_file.write("#$ -l h_vmem=10G\n")
        out_file.write("#$ -tc 50\n")
        out_file.write("#$ -l long")
        out_file.write("\n")

        out_file.write("set test_photos=({0})\n".format(" ".join(test_photos)))
        out_file.write("set species_names=({0})\n".format(" ".join(species_names)))
        out_file.write("set sizes=({0})\n".format(" ".join(image_sizes)))
        out_file.write("set steps=({0})\n".format(" ".join(steps)))
        out_file.write("set recombinations=({0})\n".format(" ".join(recombinations)))
        out_file.write("\n")

        out_file.write("set test_photo=$test_photos[$SGE_TASK_ID]\n")
        out_file.write("set species_name=$species_names[$SGE_TASK_ID]\n")
        out_file.write("set size=$sizes[$SGE_TASK_ID]\n")
        out_file.write("set step=$steps[$SGE_TASK_ID]\n")
        out_file.write("set recombination=$recombinations[$SGE_TASK_ID]\n")
        out_file.write("\n")

        out_file.write("date\n")
        out_file.write("hostname\n")
        out_file.write("python /cellar/users/ramarty/Projects/ants/scripts/python/assess_classifier.py $test_photo $species_name $size $step $recombination\n")
        
        out_file.write("date\n")

In [None]:
# or maybe it can just be done by calling the name of the model??

In [49]:
test_photos = cfellah_test_files*4 + leptothorax_test_files*6
species = ['cfellah' for x in range(20)] + ['leptothorax' for x in range(30)]
image_sizes = ['30' for x in range(10)] + ['50' for x in range(10)] + ['30' for x in range(10)] + ['50' for x in range(10)] + ['100' for x in range(10)]
step = ['20' for x in range(50)]
recombination = ['median' for x in range(5)] + ['mean' for x in range(5)] + ['median' for x in range(5)] + ['mean' for x in range(5)] + ['median' for x in range(5)] + ['mean' for x in range(5)] + ['median' for x in range(5)] + ['mean' for x in range(5)] + ['median' for x in range(5)] + ['mean' for x in range(5)]
len(test_photos), len(species), len(image_sizes), len(step), len(recombination)

(50, 50, 50, 50, 50)

In [50]:
create_cluster_script_assess_classifier(test_photos, species, image_sizes, step, recombination)

###Applying filters

In [61]:
def create_cluster_script_apply_smoothing(test_photos, species_names, image_sizes, steps, recombinations):

    new_script_file = '/cellar/users/ramarty/Projects/ants/scripts/cluster/apply_smoothing.binary.sh'

    with open(new_script_file, 'w') as out_file:
        out_file.write("#! /bin/csh\n")
        out_file.write("#$ -V\n")
        out_file.write("#$ -S /bin/csh\n")
        out_file.write("#$ -o /cellar/users/ramarty/Data/ants/sge-system_files\n")
        out_file.write("#$ -e /cellar/users/ramarty/Data/ants/sge-system_files\n")
        out_file.write("#$ -cwd\n")
        out_file.write("#$ -t 1-{0}\n".format(len(species_names)))
        out_file.write("#$ -l h_vmem=10G\n")
        out_file.write("#$ -tc 50\n")
        out_file.write("#$ -l long")
        out_file.write("\n")

        out_file.write("set test_photos=({0})\n".format(" ".join(test_photos)))
        out_file.write("set species_names=({0})\n".format(" ".join(species_names)))
        out_file.write("set sizes=({0})\n".format(" ".join(image_sizes)))
        out_file.write("set steps=({0})\n".format(" ".join(steps)))
        out_file.write("set recombinations=({0})\n".format(" ".join(recombinations)))
        out_file.write("\n")

        out_file.write("set test_photo=$test_photos[$SGE_TASK_ID]\n")
        out_file.write("set species_name=$species_names[$SGE_TASK_ID]\n")
        out_file.write("set size=$sizes[$SGE_TASK_ID]\n")
        out_file.write("set step=$steps[$SGE_TASK_ID]\n")
        out_file.write("set recombination=$recombinations[$SGE_TASK_ID]\n")
        out_file.write("\n")

        out_file.write("date\n")
        out_file.write("hostname\n")
        out_file.write("python /cellar/users/ramarty/Projects/ants/scripts/python/apply_smoothing.py $test_photo $species_name $size $step $recombination\n")
        
        out_file.write("date\n")

In [62]:
test_photos = cfellah_test_files*4 + leptothorax_test_files*6
species = ['cfellah' for x in range(20)] + ['leptothorax' for x in range(30)]
image_sizes = ['30' for x in range(10)] + ['50' for x in range(10)] + ['30' for x in range(10)] + ['50' for x in range(10)] + ['100' for x in range(10)]
step = ['20' for x in range(50)]
recombination = ['median' for x in range(5)] + ['mean' for x in range(5)] + ['median' for x in range(5)] + ['mean' for x in range(5)] + ['median' for x in range(5)] + ['mean' for x in range(5)] + ['median' for x in range(5)] + ['mean' for x in range(5)] + ['median' for x in range(5)] + ['mean' for x in range(5)]
len(test_photos), len(species), len(image_sizes), len(step), len(recombination)

(50, 50, 50, 50, 50)

In [63]:
create_cluster_script_apply_smoothing(test_photos, species, image_sizes, step, recombination)

Debugging

In [76]:
species = 'cfellah'
test_photo = cfellah_test_files[0]
image_size = '30'
step = '20'
recombination = 'mean'
prediction_matrix = pickle.load(open("/cellar/users/ramarty/Data/ants/gold_standard/predictions/{0}/{1}.{2}.{3}.{4}.p".format(species, test_photo, image_size, step, recombination), "rb"))
gold_standard = mh.imread('/cellar/users/ramarty/Data/ants/gold_standard/photos/{0}.png'.format(test_photo))

In [81]:
def f(x):
    if ((x[0] == 246) & (x[1] == 255) & (x[2] == 0)) | ((x[0] == 169) & (x[1] == 206) & (x[2] == 114)):
        return 1
    else:
        return 0

In [82]:
# make gold standard binary for comaprison
gs_binary = np.arange(len(gold_standard)*len(gold_standard[0])).reshape(len(gold_standard), len(gold_standard[0]))
for row in range(len(gold_standard)):
    for col in range(len(gold_standard[0])):
        gs_binary[row][col] = f(gold_standard[row][col])

In [83]:
diff = gs_binary - prediction_matrix
total_dim = gs_binary.shape[0] * gs_binary.shape[1]
tpr = diff[(diff > 0)].sum() / gs_binary.sum()
fpr = abs(diff[(diff < 0)].sum()) / (total_dim - gs_binary.sum())

In [95]:
diff

matrix([[  0.,   0.,   0., ...,   0.,   0.,   0.],
        [  0.,   0.,   0., ...,   0.,   0.,   0.],
        [  0.,   0.,   0., ...,   0.,   0.,   0.],
        ..., 
        [ nan,  nan,  nan, ...,  nan,  nan,  nan],
        [ nan,  nan,  nan, ...,  nan,  nan,  nan],
        [ nan,  nan,  nan, ...,  nan,  nan,  nan]])

In [97]:
prediction_matrix

matrix([[  0.,   0.,   0., ...,   0.,   0.,   0.],
        [  0.,   0.,   0., ...,   0.,   0.,   0.],
        [  0.,   0.,   0., ...,   0.,   0.,   0.],
        ..., 
        [ nan,  nan,  nan, ...,  nan,  nan,  nan],
        [ nan,  nan,  nan, ...,  nan,  nan,  nan],
        [ nan,  nan,  nan, ...,  nan,  nan,  nan]])

In [98]:
np.nan_to_num(prediction_matrix)

matrix([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]])