In [None]:
"""
Developed by: Vishnu Swaminathan

"""
import numpy as np
import pandas as pd

In [None]:
"""
The tube_selector function takes two arguments: a sample-tube-volume dataframe ('stv_df') and an assay volume dictionary ('av_dict').
It returns a nested dictionary keyed by the sample ID, whose value maps each assay to its assigned tube_id (or 'no tube' when no tube fits).
"""
def tube_selector(stv_df, av_dict):
  """Assign one tube to each assay for a single sample.

  Assays are served in ascending order of required volume, and each assay
  receives the smallest still-unassigned tube whose volume is at least the
  requirement. A tube is used for at most one assay.

  Args:
    stv_df: sample-tube-volume DataFrame. Column 0 is read as the sample ID
      (taken from the first row only), column 1 as the tube ID, and the last
      column as the tube volume. All rows are assumed to belong to ONE sample.
    av_dict: mapping of assay name -> required volume.

  Returns:
    ``{sample_id: {assay: tube_id}}`` with the assays ordered by ascending
    required volume. An assay for which no tube is large enough maps to the
    string ``"no tube"``. Returns ``{}`` when ``stv_df`` has no rows, since
    there is then no sample ID to key the result by.
  """
  if len(stv_df) == 0:
    return {}

  # Given the single-sample assumption, the first cell holds the sample ID.
  sample_id = stv_df.iat[0, 0]

  # Extract the columns one at a time rather than converting the whole frame:
  # DataFrame.to_numpy() coerces every column to a common dtype, which would
  # turn integer tube IDs into floats whenever the volume column is float.
  tube_ids = stv_df.iloc[:, 1].to_numpy()
  volumes = stv_df.iloc[:, -1].to_numpy()

  # A tube with a missing volume cannot be shown to fit any assay. Left in
  # place it would sort last and be picked for the largest requirements.
  known = pd.notna(volumes)
  tube_ids, volumes = tube_ids[known], volumes[known]

  # Stable sort by volume so searchsorted can be used and ties keep row order.
  order = np.argsort(volumes, kind="stable")
  tube_ids, volumes = tube_ids[order], volumes[order]

  # Serve the lowest-volume assays first (sorted() is stable for ties).
  by_volume = sorted(av_dict.items(), key=lambda item: item[1])
  required = [volume for _, volume in by_volume]

  # For every assay at once: index of the first tube that is large enough.
  first_fit = np.searchsorted(volumes, required, side="left")

  assays = {}
  # Requirements are non-decreasing, so any tube skipped or used for an
  # earlier assay lies strictly before `next_free`; tracking that index gives
  # the same assignment as deleting rows, without copying the array each time.
  next_free = 0
  for (assay, _), fit in zip(by_volume, first_fit):
    pick = max(int(fit), next_free)
    if pick >= len(volumes):
      assays[assay] = "no tube"
      continue
    assays[assay] = tube_ids[pick]
    next_free = pick + 1

  return {sample_id: assays}

In [None]:
"""Can uncomment and use 'df' and 'av_dict' as the sample-tube-volume dataframe and the assay volume dictionary (respectively)."""
# Example run: define the volume each assay requires, load the
# sample-tube-volume table from CSV, then display the resulting assignment.
av_dict = {
    'A': 30,
    'B': 600,
    'C': 1000,
    'D': 1000,
    'E': 1000,
}
df = pd.read_csv("sample_data_(1).csv")
tube_selector(df, av_dict)

{68499: {'A': 269259, 'B': 271268, 'C': 269312, 'D': 269309, 'E': 269219}}