# IDEA
-----------

We have seen that the data in outdated and mislabeled. Fitting a model to mislabeled and outdated data would not be appropriate to compare the model performance to human level. Here we employ a Bootstrapping mechanism to correct mislabels.

* We manually collect different variety of land and house images (1000 each) and make sure that the labels for collected images are consistent with the images.
* We start the regular bootstrap process.
    * We fit a model with these 2000 images and save the checkpoints to the disk.
    * We test the model on our large test corpus.
    * We place a rule based filtering criteria, such as, 
        * Find all misclassified images where the model prediction is 100% for 2 or more checkpoints 
        * Flag if the parcel was cropped based on bounding box.
        * And few other varying criteria.
        This is preferable because, let say that 2 checkpoints predicted an images with 100% house and we were able to flag it at a cropped region. Given the fact that 95% of images that were cropped based on building polygon were images of house. We can with a very good certainity say that the images is actually a house and was mislabeled as land.
    * Items that made it thought the filtering process are relabeled and the images are placed into the new labeled directories.
* Iterate: over and over

In [1]:
from __future__ import division, print_function, absolute_import
import logging
import time
import tensorflow as tf
import os
tf.logging.set_verbosity(tf.logging.ERROR)
import numpy as np
from config import pathDict
import pandas as pd
import seaborn as sns
%matplotlib inline
sns.set_style("whitegrid")


from conv_net.train import Train
from conv_net.test import Test
from viz_analysis.mislabels_correction import GetMislabels
from data_transformation.data_prep import get_intersecting_images_pin, DumpBatches
from plot import Plot



which_net = 'resnet'
batch_prepare = True
train = True
test = True

INPUT the RUN NAME: Options : ("A new run name" or "Any Previous Run Name"
 WHICH_RUN = mislabel_c
INPUT: Image-type OPTIONS: (assessor, assessor_code, aerial, overlayed, aerial_cropped, streetside and ensemble 
 IMAGE_TYPE = aerial_cropped


In [2]:
# from clean_directories import clean
# clean(dict(overlayed='summary,batch'),which_vendor='google', which_model='resnet')

## RESTART WITH NEW_TEST: PREPARE/DUMP BATCHES:

In [3]:
start_time = time.time()

max_batches = None

if batch_prepare:
    start_time = time.time()
    
    cmn_land_pins, cmn_house_pins = get_intersecting_images_pin(is_assessor=False, is_aerial=False,
                                                                is_streetside=False, is_overlayed=False,
                                                                is_aerial_cropped=True, equal_proportion=True)
    
    print(len(cmn_land_pins), len(cmn_house_pins))
    
    tr_batch_size = 128
    ts_batch_size = (len(cmn_land_pins) + len(cmn_house_pins)) // 10
    cv_batch_size = (len(cmn_land_pins) + len(cmn_house_pins)) // 10
    
    params = dict(
            image_type='aerial_cropped',
            img_in_shape=[400, 400, 3],
            img_out_shape=[224, 224, 3],
            img_resize_shape=[128, 128, 3],
            img_crop_shape=[128, 128, 3],
            tr_batch_size = tr_batch_size,
            cv_batch_size = cv_batch_size,
            ts_batch_size = ts_batch_size,
            enable_rotation=True,
            shuffle_seed=913,
            get_stats=True,
            max_batches=max_batches)
    
    obj_cb = DumpBatches(params)
    obj_cb.dumpStratifiedBatches_balanced_class(cmn_land_pins, cmn_house_pins, is_cvalid_test=False)

print ('--------------- %s seconds ------------------'%(time.time() - start_time))


5688 5688
11376 11376 11376 11376 113767
--------------- 429.9079489707947 seconds ------------------


# TEST
-------------------

# RESTART Notebook and point it to run mislabel_c
#### We Test on the Batches present in Mislabel_c

In [2]:
which_data = 'test'
checkpoint_name = "all"
tsoj = Test(params=dict(pprocessor_inp_img_shape=[224,224,3],
                        pprocessor_inp_crop_shape=[],
                        model_inp_img_shape=[224, 224, 3]),
                    device_type = 'gpu',
                    which_net='resnet')
fnl_tst_metric_stack = tsoj.run(use_checkpoint_for_run='new_test',
                                use_checkpoint_for_imageType='aerial_cropped',
                                which_checkpoint=checkpoint_name,
                                optional_batch_name=None,
                                which_data=which_data,
                                dump_stats=True)

Test Graphs: RESNET
Learning Rate: Initial:  0.0005
Batch path C:\Users\newline\Documents\ImageClassification\data\batch_data\mislabel_c\aerial_cropped, batch_names: ['batch_0', 'batch_1', 'batch_10', 'batch_11', 'batch_12', 'batch_13', 'batch_14', 'batch_15', 'batch_16', 'batch_17', 'batch_18', 'batch_19', 'batch_2', 'batch_20', 'batch_21', 'batch_22', 'batch_23', 'batch_24', 'batch_25', 'batch_26', 'batch_27', 'batch_28', 'batch_29', 'batch_3', 'batch_30', 'batch_31', 'batch_32', 'batch_33', 'batch_34', 'batch_35', 'batch_36', 'batch_37', 'batch_38', 'batch_39', 'batch_4', 'batch_40', 'batch_41', 'batch_42', 'batch_43', 'batch_44', 'batch_45', 'batch_46', 'batch_47', 'batch_48', 'batch_49', 'batch_5', 'batch_50', 'batch_51', 'batch_52', 'batch_53', 'batch_54', 'batch_55', 'batch_56', 'batch_57', 'batch_58', 'batch_59', 'batch_6', 'batch_60', 'batch_61', 'batch_62', 'batch_63', 'batch_64', 'batch_65', 'batch_66', 'batch_67', 'batch_68', 'batch_69', 'batch_7', 'batch_70', 'batch_71', '

In [3]:
prediction_outcomes_path = os.path.join(pathDict['statistics_path'], 'prediction_stats', 'test_pred_outcomes.csv')
prediction_metrics_path = os.path.join(pathDict['statistics_path'], 'prediction_stats', 'test_pred_metrics.csv')
prediction_outcomes = pd.read_csv(prediction_outcomes_path)
prediction_metrics = pd.read_csv(prediction_metrics_path)
print (prediction_metrics.shape)
prediction_metrics.head()

(435, 6)


Unnamed: 0,checkpoint,dataset_type,test_loss,test_acc,test_precsion,test_recall
0,epoch_25_batch_17,batch_0,1.373,0.883,0.866,0.906
1,epoch_25_batch_17,batch_1,0.724,0.93,0.91,0.953
2,epoch_25_batch_17,batch_10,1.43,0.875,0.853,0.906
3,epoch_25_batch_17,batch_11,1.389,0.852,0.857,0.844
4,epoch_25_batch_17,batch_12,1.65,0.883,0.866,0.906


In [4]:
np.unique(prediction_metrics["checkpoint"])

array(['epoch_25_batch_17', 'epoch_26_batch_17', 'epoch_27_batch_17',
       'epoch_28_batch_17', 'epoch_29_batch_17'], dtype=object)

In [5]:
print (prediction_outcomes.shape)
prediction_outcomes.tail()

(55680, 6)


Unnamed: 0,checkpoint,rownum,dataset_type,true_label,pred_label,pred_prob
55675,epoch_29_batch_17,123,batch_9,1.0,1,1.0
55676,epoch_29_batch_17,124,batch_9,1.0,1,1.0
55677,epoch_29_batch_17,125,batch_9,1.0,1,1.0
55678,epoch_29_batch_17,126,batch_9,1.0,1,1.0
55679,epoch_29_batch_17,127,batch_9,1.0,1,1.0


# MISLABEL STATISTICS

In [6]:
min_pred_prob = 1
chkpnt_dict = {"epoch_28_batch_17":1}
bbox_cropped = True
(mislabeled_data, land_mis_pins_path, house_mis_pins_path, 
 land_title_arr, house_title_arr) = GetMislabels(which_data=None).main(checkpoint_min_prob_dict = chkpnt_dict,
                                                                          bbox_cropped=bbox_cropped)
pd.concat([mislabeled_data[mislabeled_data["property_type"] == 'land'].head(),
           mislabeled_data[mislabeled_data["property_type"] == 'house'].head()])

(55680, 6) (11216, 5)
epoch_28_batch_17_pred_prob >= 1 & true_label-epoch_28_batch_17_pred_label!=0 & ((property_type=='land' & bbox_cropped==1) | (property_type=='house' & bbox_cropped==0))
34 104
34 104


Unnamed: 0,property_pins,property_type,bbox_cropped,true_label,epoch_28_batch_17_pred_label,epoch_28_batch_17_pred_prob
166,14-32-411-081-0000,land,1,0,1.0,1.0
424,16-08-204-013-0000,land,1,0,1.0,1.0
945,21-32-209-034-0000,land,1,0,1.0,1.0
1161,16-16-215-079-1007,land,1,0,1.0,1.0
1324,21-31-129-013-0000,land,1,0,1.0,1.0
65,20-02-401-022-0000,house,0,1,0.0,1.0
246,17-10-214-016-1217,house,0,1,0.0,1.0
347,17-09-329-021-1123,house,0,1,0.0,1.0
493,21-30-114-029-1147,house,0,1,0.0,1.0
511,16-04-416-002-0000,house,0,1,0.0,1.0


## Store the mislabel data into disk

In [7]:
data_path = r"C:\Users\newline\Documents\ImageClassification\data\statistics\mislabel_c\aerial_cropped\mislabeled_correction"
files = os.listdir(data_path)
if len(files) == 0:
    file_name = "1"
    mislabeled_data.to_csv(os.path.join(data_path, file_name+'.csv'))
else:
    file_name = int([f.split('.')[0] for f in files][0])+1
    mislabeled_data.to_csv(os.path.join(data_path, str(file_name)+'.csv'))

In [8]:
actually_land_pins = [pins
                       for pins in np.array(
                           mislabeled_data[mislabeled_data['property_type'] == 'house']["property_pins"]
                       )
                      ]
actually_house_pins = [pins
                       for pins in np.array(
                           mislabeled_data[mislabeled_data['property_type'] == 'land']["property_pins"]
                       )
                      ]

In [9]:
source_land_path = r"C:\Users\newline\Documents\ImageClassification\data\input_images\sam_new\aerial_cropped\land"
source_house_path = r"C:\Users\newline\Documents\ImageClassification\data\input_images\sam_new\aerial_cropped\house"

dest_land_path = r"C:\Users\newline\Documents\ImageClassification\data\input_images\new_test\aerial_cropped\land"
dest_house_path = r"C:\Users\newline\Documents\ImageClassification\data\input_images\new_test\aerial_cropped\house"

print ('source_land_path \n', source_land_path)
print ('dest_land_path \n', dest_land_path)
print ('source_house_path \n', source_house_path)
print ('dest_house_path \n', dest_house_path)

print ('Total New Land = %s, Total New House = %s'%(str(len(actually_land_pins)), len(actually_house_pins)))
should_move = input('Are you sure you want to move file : yes/no')

import shutil

if should_move == 'yes':
    total_copied_house = 0
    total_copied_land = 0
    for num, pins in enumerate(actually_house_pins):
        source = os.path.join(source_land_path, pins+'.jpg')
        destination = os.path.join(dest_house_path, pins+'.jpg')
        if os.path.exists(source):
            total_copied_house += 1
            shutil.copyfile(source, destination)
            b = "TOTAL IMAGE COPIED: ======== %s"
            print(b % (num), end="\r")
    
    for num, pins in enumerate(actually_land_pins):
        source = os.path.join(source_house_path, pins+'.jpg')
        destination = os.path.join(dest_land_path, pins+'.jpg')
        if os.path.exists(source):
            total_copied_land += 1
            shutil.copyfile(source, destination)
            b = "TOTAL IMAGE COPIED: ======== %s"
            print(b % (num), end="\r")
    print ('Total Copied house=%s, land=%s'%(str(total_copied_house), str(total_copied_land)))

source_land_path 
 C:\Users\newline\Documents\ImageClassification\data\input_images\sam_new\aerial_cropped\land
dest_land_path 
 C:\Users\newline\Documents\ImageClassification\data\input_images\new_test\aerial_cropped\land
source_house_path 
 C:\Users\newline\Documents\ImageClassification\data\input_images\sam_new\aerial_cropped\house
dest_house_path 
 C:\Users\newline\Documents\ImageClassification\data\input_images\new_test\aerial_cropped\house
Total New Land = 104, Total New House = 34
Are you sure you want to move file : yes/noyes
Total Copied house=34, land=1043


## When things dont work as expected. We may need to remove images that we copied using bootstraping

In [13]:
import pandas as pd
import os
import numpy as np
bstrap_image_path = r'C:\Users\newline\Documents\ImageClassification\data\statistics\mislabel_c\aerial_cropped\mislabeled_correction'
file_1 = pd.read_csv(os.path.join(bstrap_image_path, '1.csv'))
file_2 = pd.read_csv(os.path.join(bstrap_image_path, '2.csv'))
pins = np.array(file_1["property_pins"])
pins = np.append(pins, np.array(file_2["property_pins"]))

array(['20-07-210-007-0000', '17-10-111-014-1756', '20-35-303-084-0000',
       '20-21-420-013-0000', '20-14-207-015-0000', '21-31-110-016-0000',
       '14-32-411-081-0000', '13-12-232-030-0000', '11-31-401-104-1036',
       '21-31-408-011-0000', '16-26-216-030-0000', '20-04-328-013-0000',
       '20-20-111-047-0000', '21-30-114-028-1044', '20-18-313-027-0000',
       '20-23-416-023-0000', '20-18-217-015-0000', '14-21-110-048-1713',
       '16-16-219-072-0000', '20-07-409-045-0000', '20-16-212-012-0000',
       '16-09-114-044-0000', '20-35-113-048-0000', '16-04-416-002-0000',
       '19-22-202-043-0000', '13-25-115-015-0000', '20-17-404-025-0000',
       '14-05-103-041-0000', '16-08-210-020-0000', '21-31-217-031-0000',
       '19-18-122-023-0000', '25-28-205-021-0000', '17-10-111-014-1666',
       '16-08-420-044-0000', '16-13-311-039-0000', '21-31-303-032-0000',
       '21-32-103-005-0000', '14-32-109-014-0000', '13-32-211-038-0000',
       '20-29-221-026-0000', '16-23-408-035-0000', 

In [16]:
for i in pins:
    path = r'C:\Users\newline\Documents\ImageClassification\data\input_images\new_test\aerial_cropped\house'
    path2 = r'C:\Users\newline\Documents\ImageClassification\data\input_images\new_test\aerial_cropped\land'
    fullpath = os.path.join(path, str(i)+'.jpg')
    fullpath2 = os.path.join(path2, str(i)+'.jpg')
    a = 0
    if os.path.exists(fullpath):
        a+=1
        os.remove(fullpath)
    elif os.path.exists(fullpath2):
        a+=1
        os.remove(fullpath2)   
    else:
        pass

In [17]:
a

1