# GMM Test Data Mockup Notebook

This notebook produces the test files for the GMM dashboard. All output files are stored in TSV format.

For more information please refer to https://github.com/WEHI-ResearchComputing/Genomics-Metadata-Multiplexing/tree/shiny-r-dev/test.

## Operation 1: Generate Sample Sheet based on given Plate Layout Template

In [1]:
from operations import *

In [2]:
# Build the sample sheet from the provided plate_spreadsheet_template.xlsx.
sample_sheet_df = plate_to_samplesheet('./plate_spreadsheet_template.xlsx')

# The cell displays a 2-tuple: the preview (taken before the file is written)
# followed by to_csv's return value, which is None when writing to a path.
sample_sheet_preview = sample_sheet_df.head()
(
    sample_sheet_preview,
    sample_sheet_df.to_csv('op1.plate_layout_to_spreadsheet.tsv', index=False, sep='\t'),
)

(    plate well_position            sample
 0  LCE123            A1  Test before sort
 1  LCE123            A2   Test after sort
 2  LCE123            A3           SampleA
 3  LCE123            A4           SampleA
 4  LCE123            A5           SampleA,
 None)

## Operation 2: Collate Multiple FCS files into a single TSV file

In [3]:
def collate_fcs_files(fcs_files, upload_dir):
    '''
    Collate FCS files into a single dataframe, adding columns for well position,
    plate and sample name.

    Parameters
    ----------
    fcs_files : iterable
        FCS file paths, each passed as-is to ``fcsparser.parse``.
    upload_dir : str
        Not used by this mock-up variant; kept so the signature is unchanged
        for existing callers.

    Returns
    -------
    pandas.DataFrame
        The rows of every parsed file stacked in input order. Within each file
        the rows are sorted by the 'Time' column; row index labels are not
        reset. An empty DataFrame is returned when ``fcs_files`` is empty.
    '''
    per_file_frames = []

    for fcs_file in fcs_files:
        # Mock-up test variant: the entry itself is the path. The upload-based
        # variant built it as os.path.join(upload_dir, fcs_file.filename).
        fcs_savepath = fcs_file
        meta, data = fcsparser.parse(fcs_savepath, meta_data_only=False, reformat_meta=True)
        data = data.sort_values('Time')
        # Well positions are attached after the sort by 'Time'; keep this order.
        data['well_position'] = get_well_positions(meta)

        plate, sample = get_plate_and_sample_from_filepath(fcs_savepath)
        data['plate'] = plate
        data['sample'] = sample

        per_file_frames.append(data)

    # pd.concat raises ValueError on an empty list, so preserve the
    # "no files -> empty DataFrame" result explicitly.
    if not per_file_frames:
        return pd.DataFrame()

    # Concatenate once instead of growing the frame inside the loop, which
    # re-copies all accumulated rows on every iteration.
    return pd.concat(per_file_frames)

In [4]:
# Collate the listed FCS files (upload_dir is passed as an empty string), then
# overwrite the plate column so the mock-up data belongs to plate LCE123.
collated_fcs_df = collate_fcs_files(
    ['./14Jun23_INX_NKC_084_LCE662.fcs'],  # provide a list of fcs files
    "",
).assign(plate='LCE123')

# The cell displays a 2-tuple: the preview (taken before the file is written)
# followed by to_csv's return value, which is None when writing to a path.
collated_fcs_preview = collated_fcs_df.head()
(
    collated_fcs_preview,
    collated_fcs_df.to_csv('./op2.collate_fcs_files.tsv', index=False, sep='\t'),
)

(           FSC-A    FSC-H          SSC-A    SSC-H    CD16 FITC       CD56 PE  \
 0  115089.296875  69562.0   75108.632812  44791.0   129.710007  37108.621094   
 1  127671.296875  83696.0   64815.761719  43674.0   112.270004  17709.240234   
 2  102016.796875  74176.0   50605.429688  35155.0   105.730003  25967.160156   
 3  123290.093750  71681.0  100057.640625  55735.0   553.720032     66.299995   
 4  110112.296875  68001.0   86295.304688  55742.0  1148.859985    149.940002   
 
          DAPI         Time well_position   plate   sample  
 0   61.410000  1698.400024            A3  LCE123  NKC_084  
 1   52.509998  1865.300049            A4  LCE123  NKC_084  
 2   41.829998  2052.899902            A5  LCE123  NKC_084  
 3  203.809998  2330.300049            A6  LCE123  NKC_084  
 4   97.900002  2758.300049            A7  LCE123  NKC_084  ,
 None)

## Operation 3: Merge Data with Sample Sheet and Template Sheet

In [5]:
# Pass the operation 1 and operation 2 TSV outputs, together with the template
# workbook, to merge_data_with_samplesheet.
merged_samplesheet_fcs_and_template_sheet_df = merge_data_with_samplesheet(
    './op1.plate_layout_to_spreadsheet.tsv',
    './op2.collate_fcs_files.tsv',
    './template_sheet.xlsx',
)

# The cell displays a 2-tuple: the preview (taken before the file is written)
# followed by to_csv's return value, which is None when writing to a path.
merged_sheet_preview = merged_samplesheet_fcs_and_template_sheet_df.head()
(
    merged_sheet_preview,
    merged_samplesheet_fcs_and_template_sheet_df.to_csv('op3.merged_sample_sheet.tsv', sep='\t', index=False),
)

(   Plate# Well position Sample type\n(SC or MB)  Tissue type\n(if required)  \
 0  LCE123            A1                   empty                         NaN   
 1  LCE123            A2                   empty                         NaN   
 2  LCE123            A3                     NaN                         NaN   
 3  LCE123            A4                     NaN                         NaN   
 4  LCE123            A5                     NaN                         NaN   
 
   Sample name  FACs gate\n(if required) Primer name  \
 0     removed                       NaN     removed   
 1     removed                       NaN     removed   
 2         NaN                       NaN          99   
 3         NaN                       NaN         100   
 4         NaN                       NaN         101   
 
   index sequence \n(as in C-RT1-primer) (separate index read)  \
 0                               removed               removed   
 1                               removed        

## Operation 4: Merge Primer Index File with Operation 3 Result (if a Primer Index File is provided)

In [6]:
# Load the primer index template and mock it up in one step: every row is put
# on plate LCE123, and sample names come from the operation 1 sample sheet
# (pandas aligns the assigned Series on the row index).
primer_index_df = pd.read_excel(
    'primer_index_template.xlsx',
    sheet_name='Sample primer & index',
    skiprows=3,
).assign(**{
    'Plate#': 'LCE123',
    'Sample name': sample_sheet_df['sample'],
})

# Write the mock-up primer index input file.
primer_index_df.to_csv('test_primer_index.tsv', index=False, sep='\t')

# Build the mock-up test result: match the operation 3 rows to primer index rows
# on plate, well position and sample name. Overlapping non-key column names
# from the primer index side get the '_primer' suffix.
merged_primer_index_df = merged_samplesheet_fcs_and_template_sheet_df.merge(
    primer_index_df,
    left_on=['Plate#', 'Well position', 'sample'],
    right_on=['Plate#', 'Well position', 'Sample name'],
    suffixes=('', '_primer'),
)

# The cell displays a 2-tuple: the preview (taken before the file is written)
# followed by to_csv's return value, which is None when writing to a path.
merged_primer_index_preview = merged_primer_index_df.head()
(
    merged_primer_index_preview,
    merged_primer_index_df.to_csv('op4.merged_primer_index.tsv', index=False, sep='\t'),
)

(   Plate# Well position Sample type\n(SC or MB)  Tissue type\n(if required)  \
 0  LCE123            A1                   empty                         NaN   
 1  LCE123            A2                   empty                         NaN   
 2  LCE123            A3                     NaN                         NaN   
 3  LCE123            A4                     NaN                         NaN   
 4  LCE123            A5                     NaN                         NaN   
 
   Sample name  FACs gate\n(if required) Primer name  \
 0     removed                       NaN     removed   
 1     removed                       NaN     removed   
 2         NaN                       NaN          99   
 3         NaN                       NaN         100   
 4         NaN                       NaN         101   
 
   index sequence \n(as in C-RT1-primer) (separate index read)  \
 0                               removed               removed   
 1                               removed        