In [24]:
import os
import pandas as pd
import numpy as np
import pickle
import random

### Open the hashmap file for train-test split

In [6]:
# Directory that holds the pickled hash dictionary (and later receives the split files).
path = "D:/MIDS/CAPSTONE/UCB-Capstone-Methane-Tracker"
# Name of the pickled hash dictionary file inside `path`.
hashdict_filename = "landfill_hashdict.txt"

In [7]:
def get_hashdict(path, hashdict_filename):
    """Load the pickled object stored at ``path/hashdict_filename``.

    Parameters
    ----------
    path : str
        Directory that contains the pickle file.
    hashdict_filename : str
        Name of the pickle file inside ``path``.

    Returns
    -------
    object or None
        The unpickled object on success. ``None`` when the file does not
        exist or cannot be read/unpickled; in both cases a message describing
        the failure is printed instead of raising.
    """
    try:
        with open(os.path.join(path, hashdict_filename), "rb") as hashed_l:
            # NOTE: pickle.load can execute arbitrary code embedded in the
            # file; only load hash dictionaries from a trusted source.
            return pickle.load(hashed_l)
    except FileNotFoundError:
        print("Check the filename and filepath: not found")
    except Exception as err:
        # The file exists but could not be opened or decoded (permissions,
        # truncated/corrupt pickle, ...). Report the real cause rather than
        # the misleading "not found". KeyboardInterrupt/SystemExit are not
        # subclasses of Exception and therefore still propagate.
        print("Could not load {!r} from {!r}: {!r}".format(hashdict_filename, path, err))
    return None

In [8]:
# Load the pickled hash dictionary from the configured directory and file name.
hash_dict = get_hashdict(path, hashdict_filename)

In [10]:
# Number of entries in the loaded hash dictionary.
# Raises TypeError if loading failed and hash_dict is None.
len(hash_dict)

2293

### Split the hashmap into train and test sets

In [25]:
def create_train_test_sets(d, test_size=0.05, seed=None):
    """Randomly partition the entries of ``d`` into a train and a test dict.

    Parameters
    ----------
    d : dict
        Mapping whose entries are to be split.
    test_size : float or int, default 0.05
        A float is interpreted as the fraction of ``len(d)`` to place in the
        test set (the product is truncated with ``int``). An int is used
        directly as the number of test entries.
    seed : int or None, default None
        When given, sampling uses a dedicated ``random.Random(seed)`` so the
        split is reproducible. ``None`` keeps using the module-level
        ``random`` generator.

    Returns
    -------
    tuple of (dict, dict) or None
        ``(train_sample, test_sample)``. ``None`` is returned, after printing
        a message, when ``test_size`` is neither a float nor an int.

    Raises
    ------
    ValueError
        Propagated from ``random.sample`` when the resulting test size is
        negative or larger than ``len(d)``.
    """
    if isinstance(test_size, float):
        size = int(len(d) * test_size)
    elif isinstance(test_size, int) and not isinstance(test_size, bool):
        # bool is an int subclass; keep rejecting it as an invalid size.
        size = test_size
    else:
        print("please check the test size parameter")
        return None

    sampler = random if seed is None else random.Random(seed)
    test_keys = sampler.sample(list(d.keys()), size)
    # A set gives O(1) membership checks while filtering the train entries.
    test_key_set = set(test_keys)

    test_sample = {k: d[k] for k in test_keys}
    train_sample = {k: v for k, v in d.items() if k not in test_key_set}

    return train_sample, test_sample

In [35]:
# Seed the module-level generator first so that re-running the notebook
# reproduces the same train/test split instead of drawing a new one.
random.seed(42)
train_hashmap, test_hashmap = create_train_test_sets(hash_dict, test_size=0.1)

In [38]:
# Report how many entries ended up in each split.
train_count, test_count = len(train_hashmap), len(test_hashmap)
print("Training set size: {}".format(train_count))
print("Test set size: {}".format(test_count))

Training set size: 2064
Test set size: 229


### Saving the train and test hashmaps for future reference

In [42]:
def save_train_test_dict(path, datadict, filenames=['training_datadict.pkl', 'test_datadict.pkl']):
    """Pickle each dictionary in ``datadict`` to the matching name in ``filenames``.

    path: str (directory in which the dictionaries are saved)
    datadict: list(dict) (dictionaries to save, e.g. the training and test
        dict; ``datadict[i]`` is written to ``filenames[i]``)
    filenames: list(str) (target file names; entries of ``datadict`` beyond
        ``len(filenames)`` are not written)

    return None

    raises IndexError when ``datadict`` has fewer entries than ``filenames``.
    The check happens before any file is opened, so existing files are left
    untouched in that case.
    """
    # Opening with "wb" truncates the target immediately, so a missing
    # dictionary must be detected before the first open() call.
    if len(datadict) < len(filenames):
        raise IndexError(
            "datadict has {} entries but {} filenames were given".format(
                len(datadict), len(filenames)
            )
        )

    for itr, filename in enumerate(filenames):
        with open(os.path.join(path, filename), "wb") as fw:
            # Overwrite any previous file with the latest dictionary.
            pickle.dump(datadict[itr], fw, protocol=pickle.HIGHEST_PROTOCOL)
    return

In [43]:
# Persist both splits next to the original hash dictionary, using the default file names.
save_train_test_dict(path, [train_hashmap, test_hashmap])

### Moving training and test files into the appropriate folders

In [59]:
# Destination folder for images whose hash is not in the test dictionary.
train_data_path = "D:/MIDS/CAPSTONE/UCB-Capstone-Methane-Tracker/training_data/"
# Destination folder for images whose hash is in the test dictionary.
test_data_path = "D:/MIDS/CAPSTONE/UCB-Capstone-Methane-Tracker/test_data/"
# Source folder whose files are distributed between the two folders above.
base_image_repo = "D:/MIDS/CAPSTONE/UCB-Capstone-Methane-Tracker/complete_image_dataset/"

In [55]:
def get_hash(img_name):
    """Return the integer held in the second ``_``-separated field of ``img_name``.

    When the name consists of exactly two fields, the last eight characters
    of the second field are dropped before conversion (presumably a trailing
    extension suffix; confirm against the image naming scheme). With any
    other number of fields the second field is converted as-is.

    Raises IndexError when ``img_name`` contains no underscore and ValueError
    when the selected text is not a valid integer.
    """
    fields = img_name.split('_')
    hash_text = fields[1]
    if len(fields) == 2:
        hash_text = hash_text[:-8]
    return int(hash_text)

In [62]:
def move_train_test_data(base_image_repo, train_data_path, test_data_path, test_datadict, correct_filename=False):
    """Move every entry of ``base_image_repo`` into the train or test folder.

    For each name returned by ``os.listdir(base_image_repo)`` the hash is
    extracted with ``get_hash``; the file is renamed into ``test_data_path``
    when that hash is a key of ``test_datadict`` and into ``train_data_path``
    otherwise. Progress is printed every 100 files.

    Parameters
    ----------
    base_image_repo : str
        Directory whose entries are moved.
    train_data_path : str
        Destination directory for entries not in ``test_datadict``.
    test_data_path : str
        Destination directory for entries in ``test_datadict``.
    test_datadict : container
        Anything supporting ``in`` with the value returned by ``get_hash``.
    correct_filename : bool, default False
        When true, the last four characters of each file name are dropped
        from the destination name; otherwise the name is kept unchanged.

    Returns
    -------
    None

    Notes
    -----
    Errors from ``get_hash`` (unexpected file names) and ``os.rename``
    (missing destination directory, cross-device move) propagate.
    """
    for itr, fname in enumerate(os.listdir(base_image_repo)):

        if itr % 100 == 0:
            print('{} files complete'.format(itr))

        # Always define the destination name; leaving it unset when
        # correct_filename is false would raise UnboundLocalError below.
        dest_name = fname[:-4] if correct_filename else fname

        if get_hash(fname) in test_datadict:
            target_dir = test_data_path
        else:
            target_dir = train_data_path

        # os.path.join works whether or not the directory arguments end
        # with a path separator.
        os.rename(os.path.join(base_image_repo, fname),
                  os.path.join(target_dir, dest_name))
    return

In [63]:
# Distribute every file of the complete image set between the train and test
# folders; the last four characters of each file name are dropped on the way.
move_train_test_data(
    base_image_repo,
    train_data_path,
    test_data_path,
    test_hashmap,
    correct_filename=True,
)

0 files complete
100 files complete
200 files complete
300 files complete
400 files complete
500 files complete
600 files complete
700 files complete
800 files complete
900 files complete
1000 files complete
1100 files complete
1200 files complete
1300 files complete
1400 files complete
1500 files complete
1600 files complete
1700 files complete
1800 files complete
1900 files complete
2000 files complete
2100 files complete
2200 files complete
2300 files complete
2400 files complete
2500 files complete
2600 files complete
2700 files complete
2800 files complete
2900 files complete
3000 files complete
3100 files complete
3200 files complete
3300 files complete
3400 files complete
3500 files complete
3600 files complete
3700 files complete
3800 files complete
3900 files complete
4000 files complete
4100 files complete
4200 files complete
4300 files complete
4400 files complete
4500 files complete
4600 files complete
4700 files complete
4800 files complete
4900 files complete
5000 files c