In [27]:
import glob, os
import pandas as pd
import numpy as np
import sqlite3

In [47]:
# Root directory of the tile set; the pre-assigned and overlay directories sit directly beneath it.
TILE_BASE = '/Users/darylwilding-mcbride/Downloads/yolo-train-rt-3000-3600-23-apr'
PRE_ASSIGNED_FILES_DIR = TILE_BASE + '/pre-assigned'
OVERLAY_FILES_DIR = TILE_BASE + '/overlay'

# SQLite database that holds the frame_properties table.
CONVERTED_DATABASE_NAME = '/Users/darylwilding-mcbride/Downloads/HeLa_20KInt-rt-3000-3600/HeLa_20KInt.sqlite'

# Frame selection criteria: retention-time window (seconds) and the collision energy to match.
RT_LIMIT_LOWER, RT_LIMIT_UPPER = 3000, 3600
MS1_CE = 10


In [68]:
# Number of frames left between training sets, so a feature cannot appear in more than one set.
SET_GAP = 10
# Number of frames left between bands.
BAND_GAP = 10
# Number of periods over the run from which mini sets will be taken.
NUMBER_OF_BANDS = 4


In [69]:
# Collect the base name (directory and extension stripped) of every PNG file in the
# pre-assigned directory. glob.glob() returns its matches in arbitrary, filesystem-dependent
# order, so the paths are sorted to give the same list on every run.
filenames = [
    os.path.basename(os.path.splitext(png_path)[0])
    for png_path in sorted(glob.glob("{}/*.png".format(PRE_ASSIGNED_FILES_DIR)))
]

In [70]:
# wrap the collected file names in a dataframe with a single 'name' column
fn_df = pd.DataFrame(data=filenames, columns=['name'])

In [71]:
# Fetch the frame_id and retention time of every frame whose retention time lies within
# [RT_LIMIT_LOWER, RT_LIMIT_UPPER] and whose collision energy equals MS1_CE.
# The limits are bound as query parameters instead of being formatted into the SQL text, and
# the connection is closed in a finally clause so it is released even if the query raises.
db_conn = sqlite3.connect(CONVERTED_DATABASE_NAME)
try:
    ms1_frame_properties_df = pd.read_sql_query(
        "select frame_id,retention_time_secs from frame_properties "
        "where retention_time_secs >= ? and retention_time_secs <= ? and collision_energy == ?",
        db_conn,
        params=(RT_LIMIT_LOWER, RT_LIMIT_UPPER, MS1_CE),
    )
finally:
    db_conn.close()


In [72]:
# preview the first five rows returned by the frame properties query
ms1_frame_properties_df.head(n=5)

Unnamed: 0,frame_id,retention_time_secs
0,27937,3000.034855
1,27948,3001.215839
2,27959,3002.398227
3,27970,3003.582723
4,27981,3004.765302


In [73]:
# Divide the ms1 frames into NUMBER_OF_BANDS contiguous bands of (near-)equal size.
# The row positions are split rather than the dataframe itself: handing a DataFrame directly to
# np.array_split goes through DataFrame.swapaxes, which pandas has deprecated. Splitting the
# positions yields the same partition sizes, and .iloc keeps each band a DataFrame that retains
# its original index labels.
band_row_positions = np.array_split(np.arange(len(ms1_frame_properties_df)), NUMBER_OF_BANDS)
bands = [ms1_frame_properties_df.iloc[positions] for positions in band_row_positions]

In [74]:
# Proportion of each band allocated to the training and validation sets.
train_percent = 0.8
valid_percent = 0.1
# The test set takes the remaining proportion. The subtraction is rounded to strip binary
# floating-point residue: 1.0 - (0.8 + 0.1) evaluates to 0.09999999999999998, which makes
# int(test_percent * 130) truncate to 12 instead of the intended 13.
test_percent = round(1.0 - (train_percent + valid_percent), 10)

In [79]:
# Build the lists of 'frame-<frame_id>' terms for the training, validation and test sets.
# Every band is cut into a training section, a validation section and a test section according
# to train_percent / valid_percent / test_percent, and the last SET_GAP frames of each section
# are left out so that neighbouring sets are separated by a gap.
train_terms = []
valid_terms = []
test_terms = []

for band_idx, band_df in enumerate(bands):
    band_length = len(band_df)

    # Nominal number of frames in each section. The product is rounded before truncating so
    # that floating-point residue (e.g. 0.09999999999999998 * 130 == 12.999999999999996)
    # cannot silently drop a frame.
    train_length = int(round(train_percent * band_length, 6))
    valid_length = int(round(valid_percent * band_length, 6))
    test_length = int(round(test_percent * band_length, 6))

    # Each set stops SET_GAP frames short of the end of its section. The stop is clamped to the
    # start so that a section shorter than SET_GAP yields an empty set; without the clamp a
    # negative train_stop would wrap around and select almost the whole band.
    # NOTE(review): BAND_GAP is not used here - the frames after the test set are also trimmed
    # by SET_GAP. Confirm whether BAND_GAP was intended for the final gap.
    train_start = 0
    train_stop = max(train_start, train_start + train_length - SET_GAP)

    valid_start = train_start + train_length
    valid_stop = max(valid_start, valid_start + valid_length - SET_GAP)

    test_start = valid_start + valid_length
    test_stop = max(test_start, test_start + test_length - SET_GAP)

    # Positional slices; the rows between one set's stop and the next set's start are the gaps.
    # .iloc is used instead of np.split because np.split on a DataFrame relies on the
    # deprecated DataFrame.swapaxes.
    train_ids_df = band_df.iloc[train_start:train_stop]
    valid_ids_df = band_df.iloc[valid_start:valid_stop]
    test_ids_df = band_df.iloc[test_start:test_stop]

    print("band {}".format(band_idx))
    print("training set: {:.1f} to {:.1f} secs ({} frames)".format(train_ids_df.retention_time_secs.min(), train_ids_df.retention_time_secs.max(), len(train_ids_df)))
    print("validation set: {:.1f} to {:.1f} secs ({} frames)".format(valid_ids_df.retention_time_secs.min(), valid_ids_df.retention_time_secs.max(), len(valid_ids_df)))
    print("test set: {:.1f} to {:.1f} secs ({} frames)".format(test_ids_df.retention_time_secs.min(), test_ids_df.retention_time_secs.max(), len(test_ids_df)))

    train_terms.extend('frame-' + str(frame_id) for frame_id in train_ids_df.frame_id)
    valid_terms.extend('frame-' + str(frame_id) for frame_id in valid_ids_df.frame_id)
    test_terms.extend('frame-' + str(frame_id) for frame_id in test_ids_df.frame_id)
    

band 0
training set: 3000.0 to 3106.4 secs (91 frames)
validation set: 3119.4 to 3120.6 secs (2 frames)
test set: 3133.6 to 3134.8 secs (2 frames)
band 1
training set: 3150.1 to 3256.5 secs (91 frames)
validation set: 3269.5 to 3270.7 secs (2 frames)
test set: 3283.7 to 3284.9 secs (2 frames)
band 2
training set: 3300.2 to 3406.6 secs (91 frames)
validation set: 3419.6 to 3420.8 secs (2 frames)
test set: 3433.8 to 3435.0 secs (2 frames)
band 3
training set: 3450.3 to 3556.7 secs (91 frames)
validation set: 3569.7 to 3570.9 secs (2 frames)
test set: 3583.9 to 3585.1 secs (2 frames)


In [81]:
# total number of validation terms accumulated across all bands
len(valid_terms)

8