# Measuring sensitivity and specificity into the Categorize Set

The purpose of this operational research activity was to measure the **specificity** (% of **good samples** identified as good) and **sensitivity** (% of **bad samples** identified as bad) of the PAD test.  There were similar numbers of good and bad samples, so **accuracy** is the average of specificity and sensitivity. 

## Definition of bad samples
### Bad samples

```
SELECT * FROM `Analysis` WHERE `sample_name`!=`sample_actual` 

```
### Bad samples considering one class

```
SELECT * FROM `Analysis` WHERE `sample_name`!=`sample_actual` AND `sample_name`="Amoxicillin"
```



## Sample List of the categorize set 

```
./categorize_good_bad_samples.csv
```
| good/bad | id_actual | id_written | sample_actual | sample_name | proc_file_location |
| -------- | --------- | ---------- | ------------- | ----------- | ------------------ |
| good | 9 | 9 | Paracetamol| Paracetamol | /var/www/html/joomla/images/padimages/msh/processed/Analyst_30-12LanePADKenya2015-1-11771.processed.png |
| bad | 2 | 1 | Starch | Penicillin Procaine | /var/www/html/joomla/images/padimages/msh/processed/35612.processed.png |
| good | 8 | 8 | Benzyl <br>Penicillin | Benzyl <br>Penicillin | /var/www/html/joomla/images/padimages/msh/processed/Analyst_30-12LanePADKenya2015-1-3812.processed.png |



In [1]:
def get_sample_labels(fn):
    try:
        d_file = open(fn, "r")
        d_lines = d_file.readlines()
    except:
        print("Missing file.")
        exit(-1)

    sample_labels = {} 
    for dl in d_lines:
        dl = dl.rstrip('\n')
        data = dl.split(",")
        
        try:
            proc_file_location = data[-1]
            sample_labels[proc_file_location] =  data[0:]
        except:
            continue
    return sample_labels


## Group samples by sample_name

In [2]:
dict_sample_name = {}
samples_by_id_name = {}
img_locations = {}

sample_labels = get_sample_labels("groundtruth_good_bad_cards/categorize_good_bad_samples.csv")

#fill in dictionary
for sample in sample_labels :
    #is_good = 1 if(sample_labels[sample][0]=="good") else 0
    is_good = sample_labels[sample][0]
    id_actual = int(sample_labels[sample][1])
    id_name = int(sample_labels[sample][2])
    sample_actual = sample_labels[sample][3]
    sample_name = sample_labels[sample][4] 
    proc_file_location = sample_labels[sample][5]

    # print(is_good,sample_labels[sample][0], id_name, sample_name, id_actual, sample_actual )
    
    # Full samples_by_id_name
    if int(id_name) not in dict_sample_name:
        samples_by_id_name[id_name] = {}
        img_locations[id_name] = {}
        img_locations[id_name]['good']= [] 
        img_locations[id_name]['bad']= [] 
    else:
        samples_by_id_name[id_name][proc_file_location] = [ id_actual , is_good ]
        img_locations[id_name][is_good].append(proc_file_location)
                
    # Full dict_sample_name
    dict_sample_name[id_name] = sample_name


## Read - drug or distractor 

In [3]:
def read_drug_labels(drug_label_fname):
    try:
        d_file = open(drug_label_fname, "r") #open(sys.argv[1], "r")
        d_lines = d_file.readlines()
    except:
        print("Missing drug label file.")
        exit(-1)

    drugs = {}
    is_distractor = {}

    #loop
    for dl in d_lines:
        data = dl.split(",")
        try:
            drugs[int(data[1])] = data[0]
            is_distractor[int(data[1])] = False if(int(data[3])) else True 
        except:
            continue
    return drugs, is_distractor

drug_label_fname = "../../datasets/msh_tanzania_blank_drugs.csv"
drugs, is_distractor = read_drug_labels(drug_label_fname)

## Prediction 

## Mapping prediction categorize image name to proc_file_location

In [4]:
def read_image_name_and_location(fn):
    try:
        d_file = open(fn, "r")
        d_lines = d_file.readlines()
    except:
        print("Missing file.")
        exit(-1)

    dict_sample_location = {}
    for dl in d_lines:
        dl = dl.rstrip('\n')
        data = dl.split(",")
        try:
            im_name = data[1].split("/")[-1]
            proc_file_location = data[2]
            dict_sample_location[im_name] = proc_file_location
        except:
            continue
    return dict_sample_location


def read_prediction(fn):
    try:
        d_file = open(fn, "r")
        d_lines = d_file.readlines()
    except:
        print("Missing file.")
        exit(-1)

    dict_prediction = {}
    for dl in d_lines:
        dl = dl.rstrip('\n')
        data = dl.split(",")
        try:
            im_name = data[1].split("/")[-1]
            dict_prediction[im_name] = int(data[0])
        except:
            continue
    return dict_prediction


def get_prediction(prediction_fn, categorize_src_labels_fn): 
    
    cat_image_name_loc = read_image_name_and_location(categorize_src_labels_fn)
    prediction = read_prediction(prediction_fn)
    
    prediction_by_img_location={}
    for img_name in cat_image_name_loc:
        #print(img_name, prediction[img_name], cat_image_name_loc[img_name])
        prediction_by_img_location[cat_image_name_loc[img_name]]=prediction[img_name]    
    return prediction_by_img_location

## Calculate  specificity and sensitivity

In [5]:
# dict_sample_name
# img_locations
# samples_by_id_name => preciso apenas se quiser saber o sample_actual por localizacao da imagem   


# Change here for different models

# model G1-dt250-seed27, acc1 93.03, acc2 98.31 (only drugs)
#categorize_src_labels_fn = "../../datasets/1/msh_tanzania_bal-1-250/categorize/src_labels.csv" # "categorize_bal_1_250_src_labels.csv"
#prediction_fn ="prediction_msh_tanzania_bal-1-250_27.csv"

## model G2-dt375-seed15, acc1 94.35, acc2 98.31 (only drugs) 
categorize_src_labels_fn = "../../datasets/2/msh_tanzania_bal-2-375/categorize/src_labels.csv"
prediction_fn ="prediction_msh_tanzania_bal-2-375_15.csv"

# model G2-dt400-seed10, acc1 94.54, acc2 99.32 (only drugs)
#categorize_src_labels_fn = "../../datasets/2/msh_tanzania_bal-2-400/categorize/src_labels.csv"
#prediction_fn ="prediction_msh_tanzania_bal-2-400_10.csv"




## Mapping prediction categorize image name to proc_file_location
prediction_by_img_location=get_prediction(prediction_fn, categorize_src_labels_fn)


res_good_cards = {}
res_bad_cards = {}

for id_name in dict_sample_name:
    if (not is_distractor[id_name]) & (dict_sample_name[id_name]==drugs[id_name]):
        good_cards = img_locations[id_name]['good']
        bad_cards = img_locations[id_name]['bad']

        res_good_cards[id_name] = {}
        res_bad_cards[id_name] = {}

        # Sensitivity (% good as good)
        sum_correct = 0
        for g_card in good_cards:            
            predicted_id = prediction_by_img_location[g_card]
            res = 1 if (predicted_id==id_name) else 0
            sum_correct+=res
            
        res_good_cards[id_name]["total"] = len(good_cards)
        res_good_cards[id_name]["correct"] = sum_correct
        
        # Specificity (% bad as bad)
        sum_correct=0
        for g_card in bad_cards:
            predicted_id = prediction_by_img_location[g_card]
            res = 1 if (predicted_id!=id_name) else 0
            sum_correct+=res
            
        res_bad_cards[id_name]["total"] = len(bad_cards)            
        res_bad_cards[id_name]["correct"] = sum_correct
        

In [6]:
sensitiv_total=0
specific_total=0
num_good_cards=0
num_bad_cards=0

for r in res_good_cards:
    sensitivity = 100*res_good_cards[r]['correct']/res_good_cards[r]['total']
    specificity = 100*res_bad_cards[r]['correct']/res_bad_cards[r]['total']
    

    sensitiv_total+=res_good_cards[r]['correct']
    specific_total+=res_bad_cards[r]['correct']
    
    num_good_cards+=res_good_cards[r]['total']
    num_bad_cards+= res_bad_cards[r]['total']
    
    print(dict_sample_name[r])
    print("\tSensitivity %s %.2f%s" % (res_good_cards[r], sensitivity, "%"))
    print("\tSpecificity %s %.2f%s" % (res_bad_cards[r], specificity, "%"))
    print("\tAccuracy %.2f%s" % ((sensitivity+specificity)/2, "%")) 

s1 = 100*sensitiv_total/num_good_cards
s2 = 100*specific_total/num_bad_cards

print("Sensitivity Total %d/%d  %.2f%s" % (sensitiv_total, num_good_cards,s1, "%"))
print("Specificity Total %d/%d  %.2f%s" % (specific_total, num_bad_cards,s2, "%"))
print("Accuracy Total %.2f%s" % ((s1 + s2)/2, "%"))

Benzyl Penicillin
	Sensitivity {'total': 56, 'correct': 55} 98.21%
	Specificity {'total': 46, 'correct': 46} 100.00%
	Accuracy 99.11%
Paracetamol
	Sensitivity {'total': 60, 'correct': 60} 100.00%
	Specificity {'total': 46, 'correct': 44} 95.65%
	Accuracy 97.83%
Amoxicillin
	Sensitivity {'total': 45, 'correct': 45} 100.00%
	Specificity {'total': 47, 'correct': 47} 100.00%
	Accuracy 100.00%
Quinine
	Sensitivity {'total': 54, 'correct': 54} 100.00%
	Specificity {'total': 42, 'correct': 42} 100.00%
	Accuracy 100.00%
Penicillin Procaine
	Sensitivity {'total': 49, 'correct': 48} 97.96%
	Specificity {'total': 39, 'correct': 39} 100.00%
	Accuracy 98.98%
Sensitivity Total 262/264  99.24%
Specificity Total 218/220  99.09%
Accuracy Total 99.17%
