In [34]:
import os
import pandas as pd
import numpy as np
import pickle
import random
import shutil

### Build the location hashmap from the landfill CSV for the train-test split

In [2]:
landfil_locations_filename = 'waste_atlas_data/waste_atlas_latLongs.csv'
main_directory_path = 'D:/MIDS/CAPSTONE/UCB-Capstone-Methane-Tracker'

# Maps hash((longitude, latitude)) -> (longitude, latitude) for every usable CSV row.
hashed_locations = {}

filepath = os.path.join(main_directory_path, landfil_locations_filename)  # path of the location data file
df = pd.read_csv(filepath)

# The "Latitude" and "Longitude" column names are hard-coded.
for lat, long in zip(df["Latitude"], df["Longitude"]):
    lat, long = float(lat), float(long)

    # Skip rows without a usable coordinate. NaN is truthy, so the plain
    # `not lat` test only catches 0.0; blank CSV cells (read as NaN) need the
    # explicit isna check or they would be hashed and stored as locations.
    if pd.isna(lat) or pd.isna(long) or (not lat) or (not long):
        continue

    _tup = (long, lat)
    # NOTE(review): the key is Python's built-in hash of the float tuple; its
    # value is an interpreter implementation detail - confirm the same Python
    # build is used wherever these keys are compared with file names.
    hashmap = hash(_tup)

    hashed_locations[hashmap] = _tup

In [3]:
# Disabled alternative setup: names for loading a previously pickled hash
# dictionary with get_hashdict instead of rebuilding it from the CSV.
# path = 'D:/MIDS/CAPSTONE/UCB-Capstone-Methane-Tracker'
# hashdict_filename = 'landfill_hashdict.txt'

# Directory the pickled train/test dictionaries are written to and read from.
path = 'D:/MIDS/CAPSTONE/UCB-Capstone-Methane-Tracker'

In [7]:
def get_hashdict(path, hashdict_filename):
    """
    Load a pickled hash dictionary from disk.

    path: str (directory that contains the pickle file)
    hashdict_filename: str (name of the pickle file inside `path`)

    return the unpickled object, or None when the file cannot be opened or
    unpickled (a message is printed in that case)

    NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
    """
    try:
        with open(os.path.join(path, hashdict_filename), "rb") as hashed_l:
            return pickle.load(hashed_l)  # load dictionary object
    except OSError:
        # Missing file, wrong directory, permission problem, ...
        print("Check the filename and filepath: not found")
    except Exception as err:
        # The file exists but is not a readable pickle (or the arguments are
        # malformed). Catching Exception instead of using a bare `except`
        # lets KeyboardInterrupt / SystemExit propagate.
        print("Could not load {!r} from {!r}: {!r}".format(hashdict_filename, path, err))
    return None

In [4]:
# Use the dictionary built from the CSV above. The commented line is the
# alternative that loads a pickled dictionary from disk. Note that hash_dict is
# another name for the same dict object, not a copy.
# hash_dict = get_hashdict(path=path, hashdict_filename=hashdict_filename)
hash_dict = hashed_locations 

In [5]:
# Number of distinct location hashes available for the split.
len(hash_dict)

507

### Split the hashmap into train and test sets

In [6]:
def create_train_test_sets(d, test_size=0.05, seed=None):
    """
    Randomly split a dictionary into a training and a test dictionary.

    d: dict (items to split; test keys are sampled without replacement)
    test_size: float (fraction of len(d), truncated towards zero) or
               int (absolute number of test items)
    seed: optional seed for a private random generator so the split can be
          reproduced; when None the module-level `random` state is used

    return (train_sample, test_sample): two dicts that together hold every
    item of `d` exactly once. Returns None (after printing a message) when
    `test_size` is neither a float nor an int.
    raises ValueError (from random.sample) when the requested size is negative
    or larger than len(d)
    """
    total_size = len(d)

    # bool is a subclass of int but is never a meaningful size, so it is
    # rejected together with every other unsupported type.
    if isinstance(test_size, bool) or not isinstance(test_size, (int, float)):
        print("please check the test size parameter")
        return

    if isinstance(test_size, float):
        size = int(total_size * test_size)
    else:
        size = test_size

    sampler = random if seed is None else random.Random(seed)
    test_keys = sampler.sample(list(d.keys()), size)
    # A set makes the membership test below O(1) instead of scanning a list.
    test_key_set = set(test_keys)

    test_sample = {k: d[k] for k in test_keys}
    train_sample = {k: v for k, v in d.items() if k not in test_key_set}

    return train_sample, test_sample

In [7]:
# Hold out 10% of the locations as the test set.
# NOTE(review): no random seed is set in the visible cells, so this split is
# different on every run - confirm that is acceptable.
train_hashmap, test_hashmap = create_train_test_sets(hash_dict, test_size=0.1)

In [8]:
# Report how many locations ended up in each split.
for message, split in (("Training set size: {}", train_hashmap),
                       ("Test set size: {}", test_hashmap)):
    print(message.format(len(split)))

Training set size: 457
Test set size: 50


### Saving the train and test hashmap for future reference

In [26]:
def save_train_test_dict(path, datadict, filenames=('training_datadict.pkl', 'test_datadict.pkl')):
    """
    Pickle each dictionary in `datadict` to the file name at the same position
    in `filenames`.

    path: str (directory where the dictionaries are saved; it is not created here)
    datadict: list(dict) (dictionaries to save, e.g. [training dict, test dict])
    filenames: list(str) (one file name per dictionary, same order as `datadict`)

    return None
    raises ValueError when `datadict` has fewer entries than `filenames`
    """
    filenames = list(filenames)

    # Validate before touching the disk: opening a file in "wb" mode truncates
    # it, so failing half-way through the loop would leave a mix of new,
    # stale and empty files behind.
    if len(datadict) < len(filenames):
        raise ValueError(
            "expected {} dictionaries (one per file name), got {}".format(len(filenames), len(datadict))
        )

    for itr, filename in enumerate(filenames):
        with open(os.path.join(path, filename), "wb") as fw:
            pickle.dump(datadict[itr], fw, protocol=pickle.HIGHEST_PROTOCOL)  # overwrite any existing file
    return

In [43]:
# Persist the current split under the default file names
# ('training_datadict.pkl' / 'test_datadict.pkl') in `path`.
# NOTE(review): the next section reads files with these same names from `path`
# as the "old" hash maps; on a top-to-bottom run this call would replace them
# before they are merged - confirm the intended execution order.
save_train_test_dict(path = path, datadict = [train_hashmap, test_hashmap])

#### Merging the additional data into the old train and test hashmaps and saving the result

In [9]:
# File names of the pickled training / test dictionaries stored in `path`.
training_data_dict_name = 'training_datadict.pkl'
test_data_dict_name = 'test_datadict.pkl'

# Reload both dictionaries from disk (presumably the earlier split - confirm).
# pickle.load can execute arbitrary code, so only read trusted files.
with open(os.path.join(path, training_data_dict_name), "rb") as training_loc:  
    train_locs = pickle.load(training_loc)
    
with open(os.path.join(path, test_data_dict_name), "rb") as test_loc:  
    test_locs = pickle.load(test_loc)

In [22]:
# Merge the reloaded dictionaries with the current split. On a key collision
# the value from the current split (the right-hand operand) wins.
# NOTE(review): nothing here checks that a key is absent from the other merged
# dict - a location present in both train_locs and test_hashmap (or test_locs
# and train_hashmap) would end up in both sets; verify there is no overlap.
full_training_dict = {**train_locs, **train_hashmap}
full_test_dict = {**test_locs, **test_hashmap}

In [24]:
# Number of keys in the merged training dictionary.
len(full_training_dict)

2521

In [27]:
# Persist the merged dictionaries under separate 'full_*' file names, so the
# files holding the individual splits are left untouched by this call.
save_train_test_dict(path = path, 
                     datadict = [full_training_dict, full_test_dict], 
                     filenames = ['full_training_datadict.pkl', 'full_test_datadict.pkl'])

In [30]:
# Assign every training key to one of five taggers, in dictionary order.
# Each tuple is (last running index handled by the tagger, tagger name); keys
# past the last bound go to the default tagger.
# The names must match the ones tested in get_destination_folder. The third
# one was previously spelled 'brian' here, which that function does not
# recognise, so those keys silently fell through to its fallback folder.
tagger_bounds = (
    (101, 'lalit'),
    (203, 'sonya'),
    (305, 'bryan'),
    (407, 'michael'),
)
default_tagger = 'prakhar'

d = {}
for c, k in enumerate(train_hashmap):
    d[k] = next((name for bound, name in tagger_bounds if c <= bound), default_tagger)

In [31]:
# One tagger per training key, so this should equal len(train_hashmap).
len(d)

457

### Moving training and test files into the appropriate folders

In [28]:
# Destination directory for images whose hash is not in the test dictionary.
train_data_path = 'D:/MIDS/CAPSTONE/UCB-Capstone-Methane-Tracker/training_data/'
# Destination directory for images whose hash is in the test dictionary.
test_data_path = 'D:/MIDS/CAPSTONE/UCB-Capstone-Methane-Tracker/test_data/'
# Source directory; the move functions below list it and rename every file out of it.
base_image_repo = 'D:/MIDS/CAPSTONE/UCB-Capstone-Methane-Tracker/complete_image_dataset/'

In [33]:
def get_hash(img_name):
    """
    Extract the integer location hash embedded in an image file name.

    The name is split on '_' and the second piece is parsed as an int:
      * two pieces  ('<prefix>_<hash><.ext>'): the file extension is removed first
      * more pieces ('<prefix>_<hash>_<...>'): the second piece is used as is

    img_name: str (file name, without directory)

    return int
    raises ValueError when the name has no '_' or the hash piece is not an integer
    """
    elems = img_name.split('_')
    if len(elems) < 2:
        raise ValueError("no hash found in image name: {!r}".format(img_name))

    hash_part = elems[1]
    if len(elems) == 2:
        # Strip the real extension rather than a fixed four characters, which
        # cut digits off the hash when the extension was missing or had a
        # different length.
        hash_part = os.path.splitext(hash_part)[0]
    return int(hash_part)

In [35]:
def get_destination_folder(name):
    """
    Return the tagging 'training' folder that belongs to a tagger.

    name: tagger name; 'lalit', 'sonya', 'bryan' and 'michael' each map to
          their own folder

    return str (folder path); every other value, including unrecognised
    names, gets the 'prakhar' folder
    """
    known_folders = (
        ('lalit', 'D:/MIDS/CAPSTONE/UCB-Capstone-Methane-Tracker/data_tagging/lalit/training'),
        ('sonya', 'D:/MIDS/CAPSTONE/UCB-Capstone-Methane-Tracker/data_tagging/sonya/training'),
        ('bryan', 'D:/MIDS/CAPSTONE/UCB-Capstone-Methane-Tracker/data_tagging/bryan/training'),
        ('michael', 'D:/MIDS/CAPSTONE/UCB-Capstone-Methane-Tracker/data_tagging/michael/training'),
    )
    for tagger, folder in known_folders:
        if name == tagger:
            return folder
    # Fallback for anything not listed above.
    return 'D:/MIDS/CAPSTONE/UCB-Capstone-Methane-Tracker/data_tagging/prakhar/training'

In [37]:
def move_train_test_tag_data(base_image_repo, train_data_path, test_data_path, test_datadict, map_dict,
                             test_tag_path='D:/MIDS/CAPSTONE/UCB-Capstone-Methane-Tracker/data_tagging/prakhar/test'):
    """
    Move every file of `base_image_repo` into the train or test directory and
    drop a copy into the matching tagging folder.

    base_image_repo: str (source directory; each entry must be a file name get_hash can parse)
    train_data_path: str (destination for files whose hash is not in `test_datadict`)
    test_data_path: str (destination for files whose hash is in `test_datadict`)
    test_datadict: dict keyed by location hash (only the keys are used)
    map_dict: dict mapping every training hash to a tagger name
    test_tag_path: str (tagging folder that receives a copy of every test file)

    Test files are always copied to `test_tag_path`. Training files are copied
    to their tagger's folder only when the file name starts with 'BASE'.

    return None
    raises KeyError when a non-test hash is missing from `map_dict`
    """
    for itr, fname in enumerate(os.listdir(base_image_repo)):

        if itr % 10 == 0:
            print('{} files complete'.format(itr))

        hashmap = get_hash(fname)
        # os.path.join works with or without a trailing separator on the
        # directory arguments; plain string concatenation silently produced
        # wrong paths when the separator was missing.
        src = os.path.join(base_image_repo, fname)

        if hashmap in test_datadict:
            shutil.copy(src=src, dst=test_tag_path)
            dest = os.path.join(test_data_path, fname)
        else:
            # Looked up for every training file, so an unknown hash fails
            # here even when no copy would be made.
            copy_dest = get_destination_folder(map_dict[hashmap])

            if fname.startswith('BASE'):
                shutil.copy(src=src, dst=copy_dest)

            dest = os.path.join(train_data_path, fname)

        # Copy first, then move: after the rename `src` no longer exists.
        os.rename(src, dest)
    return

In [38]:
# Distribute the image repository: test membership is decided with the merged
# test dictionary, tagger folders with the per-key assignment in `d`.
# This renames files out of base_image_repo, so it cannot simply be re-run.
move_train_test_tag_data(base_image_repo=base_image_repo, 
                         train_data_path=train_data_path, 
                         test_data_path=test_data_path, 
                         test_datadict=full_test_dict, 
                         map_dict=d)

0 files complete
10 files complete
20 files complete
30 files complete
40 files complete
50 files complete
60 files complete
70 files complete
80 files complete
90 files complete
100 files complete
110 files complete
120 files complete
130 files complete
140 files complete
150 files complete
160 files complete
170 files complete
180 files complete
190 files complete
200 files complete
210 files complete
220 files complete
230 files complete
240 files complete
250 files complete
260 files complete
270 files complete
280 files complete
290 files complete
300 files complete
310 files complete
320 files complete
330 files complete
340 files complete
350 files complete
360 files complete
370 files complete
380 files complete
390 files complete
400 files complete
410 files complete
420 files complete
430 files complete
440 files complete
450 files complete
460 files complete
470 files complete
480 files complete
490 files complete
500 files complete
510 files complete
520 files complete
530 

### Old pipeline

In [62]:
def move_train_test_data(base_image_repo, train_data_path, test_data_path, test_datadict, correct_filename=False):
    """
    Move every file of `base_image_repo` into the train or the test directory.

    base_image_repo: str (source directory; each entry must be a file name get_hash can parse)
    train_data_path: str (destination for files whose hash is not in `test_datadict`)
    test_data_path: str (destination for files whose hash is in `test_datadict`)
    test_datadict: dict keyed by location hash (only the keys are used)
    correct_filename: bool (when True, the last four characters of the file
                      name are dropped from the destination name)

    return None
    """
    for itr, fname in enumerate(os.listdir(base_image_repo)):

        if itr % 100 == 0:
            print('{} files complete'.format(itr))

        # Always define the destination name. It used to be assigned only when
        # correct_filename was True, so the default call failed with an
        # unbound-variable error on the first file.
        corrected_fname = fname[:-4] if correct_filename else fname

        # The hash is parsed from the original, uncorrected name.
        hashmap = get_hash(fname)
        src = os.path.join(base_image_repo, fname)

        target_dir = test_data_path if hashmap in test_datadict else train_data_path
        os.rename(src, os.path.join(target_dir, corrected_fname))
    return

In [63]:
# Old pipeline run: test membership is decided with test_hashmap only, and the
# last four characters of every file name are dropped on the way.
# This renames files out of base_image_repo, so it cannot simply be re-run.
move_train_test_data(base_image_repo=base_image_repo, 
                     train_data_path=train_data_path, 
                     test_data_path=test_data_path, 
                     test_datadict=test_hashmap, 
                     correct_filename=True)

0 files complete
100 files complete
200 files complete
300 files complete
400 files complete
500 files complete
600 files complete
700 files complete
800 files complete
900 files complete
1000 files complete
1100 files complete
1200 files complete
1300 files complete
1400 files complete
1500 files complete
1600 files complete
1700 files complete
1800 files complete
1900 files complete
2000 files complete
2100 files complete
2200 files complete
2300 files complete
2400 files complete
2500 files complete
2600 files complete
2700 files complete
2800 files complete
2900 files complete
3000 files complete
3100 files complete
3200 files complete
3300 files complete
3400 files complete
3500 files complete
3600 files complete
3700 files complete
3800 files complete
3900 files complete
4000 files complete
4100 files complete
4200 files complete
4300 files complete
4400 files complete
4500 files complete
4600 files complete
4700 files complete
4800 files complete
4900 files complete
5000 files c