In [1]:
import glob
import os
import fcsparser
import pandas as pd
import re

# Directory that is searched (recursively) for FCS files.
directory_path = "/Volumes/score/CytometryFiles/CelSeq_IndexFiles/G000287_Donoghue"

# Plate identifiers; any .fcs file whose name contains one of these is collected.
plates = [
    'LCE660', 'LCE661', 'LCE662', 'LCE663', 'LCE664', 'LCE666', 'LCE667',
    'LCE668', 'LCE669', 'LCE793', 'LCE794', 'LCE795', 'LCE796', 'LCE797',
    'LCE798', 'LCE799', 'LCE800', 'LCE801', 'LCE802', 'LCE803', 'LCE804',
]

fcs_files = []

# One recursive glob per plate; matches are appended in plate order.
for plate in plates:
    matches = glob.glob(
        os.path.join(directory_path, "**", f"*{plate}*.fcs"),
        recursive=True,
    )
    fcs_files += matches

print(fcs_files)

['/Volumes/score/CytometryFiles/CelSeq_IndexFiles/G000287_Donoghue/Donoghue24May23/24May23_INX_NKC 079_LCE660.fcs', '/Volumes/score/CytometryFiles/CelSeq_IndexFiles/G000287_Donoghue/Donoghue24May23/24May23_INX_NKC REF-1_LCE660.fcs', '/Volumes/score/CytometryFiles/CelSeq_IndexFiles/G000287_Donoghue/Donoghue24May23/24May23_INX_NKC REF-1_LCE661.fcs', '/Volumes/score/CytometryFiles/CelSeq_IndexFiles/G000287_Donoghue/Donoghue24May23/24May23_INX_NKC 079_LCE661.fcs', '/Volumes/score/CytometryFiles/CelSeq_IndexFiles/G000287_Donoghue/Donoghue14Jun23/14Jun23_INX_NKC 084_LCE662.fcs', '/Volumes/score/CytometryFiles/CelSeq_IndexFiles/G000287_Donoghue/Donoghue14Jun23/14Jun23_INX_Ref Ctrl_LCE662.fcs', '/Volumes/score/CytometryFiles/CelSeq_IndexFiles/G000287_Donoghue/Donoghue14Jun23/14Jun23_INX_Ref Ctrl_LCE663.fcs', '/Volumes/score/CytometryFiles/CelSeq_IndexFiles/G000287_Donoghue/Donoghue14Jun23/14Jun23_INX_NKC 084_LCE663.fcs', '/Volumes/score/CytometryFiles/CelSeq_IndexFiles/G000287_Donoghue/Donoghu

In [2]:
def get_well_positions(meta):
    '''
    Build the list of well labels (e.g., A1, A2 etc.) recorded in the
    FCS metadata.

    Every metadata key starting with 'INDEX SORTING LOCATIONS' is read,
    in ascending order of the integer that follows the first underscore
    in the key. Each value is a ';'-separated list of "row,column" pairs
    with zero-based numbers: the row becomes a letter (0 -> A) and the
    column becomes a one-based number (0 -> 1). Empty entries (such as
    the one produced by a trailing ';') are skipped.

    Returns a list of well label strings in the order encountered.
    '''
    prefix = 'INDEX SORTING LOCATIONS'
    location_keys = [key for key in meta.keys() if key.startswith(prefix)]
    # Numeric sort so that e.g. "..._10" comes after "..._2".
    location_keys.sort(key=lambda key: int(key.split('_')[1]))

    def to_well(coordinate):
        # "row,column" (both zero-based) -> letter + one-based number
        row, column = coordinate.split(",")
        return chr(ord('A') + int(row)) + str(int(column) + 1)

    return [
        to_well(coordinate)
        for key in location_keys
        for coordinate in meta[key].split(";")
        if coordinate
    ]

def rename_sample(sample_name):
    '''
    Normalise a sample name taken from an FCS file name.

    - Names starting with "NKC", whitespace and a digit have that
      whitespace removed (e.g. "NKC 079" -> "NKC079").
    - Names starting with "Ref Ctrl" or "NKC REF" (case-insensitive),
      optionally followed by digits, become "NKC ref <digits> control";
      when no digits are captured, "1" is used.
    - Any other name is returned unchanged.
    '''
    nkc_pattern = r'(NKC)\s+(\d)'
    if re.match(nkc_pattern, sample_name):
        return re.sub(nkc_pattern, r'\1\2', sample_name)

    control = re.match(r'(Ref Ctrl\s*(\d*)|NKC REF-*(\d*))', sample_name, re.IGNORECASE)
    if control is None:
        # Neither pattern applies: leave the name as it is.
        return sample_name

    # Group 2 holds the digits of the "Ref Ctrl" form, group 3 those of
    # the "NKC REF" form; fall back to 1 when neither captured anything.
    digit = control.group(2) or control.group(3) or '1'
    return f'NKC ref {digit} control'

def get_plate_and_sample_from_filepath(fcs_filepath):
    '''
    Extract plate and sample name from file names in format
    e.g., DDmonthYY_INX_samplename_platename.fcs where plate
    name starts with LCE.

    Parameters
    ----------
    fcs_filepath : str
        Path (or bare file name) of the FCS file.

    Returns
    -------
    tuple of (str, str)
        The plate name (everything from 'LCE' to the end of the file
        name, extension removed) and the sample name after it has been
        passed through rename_sample.

    Raises
    ------
    ValueError
        If the file name contains no 'LCE' or no 'INX_' marker.
    '''
    # Work from the argument, not from a notebook-level variable, so the
    # result depends only on what the caller passes in.
    filename = os.path.splitext(os.path.basename(fcs_filepath))[0]

    # Extract the plate name
    plate_start_index = filename.find('LCE')
    if plate_start_index == -1:
        raise ValueError(f"No plate name starting with 'LCE' in file name: {filename!r}")
    plate = filename[plate_start_index:]

    # Extract the sample name
    sample_start_index = filename.find('INX_')
    if sample_start_index == -1:
        raise ValueError(f"No 'INX_' marker in file name: {filename!r}")
    # The slice stops one character before the plate name, which drops the
    # single separator character (e.g. '_') between sample and plate.
    sample_name = filename[sample_start_index + len('INX_'):(plate_start_index - 1)]

    # clean up the sample name (e.g. remove the space between NKC and number)
    sample_name = rename_sample(sample_name)

    return plate, sample_name

In [3]:
# Collect one annotated frame per FCS file and concatenate once at the end.
# Calling pd.concat inside the loop re-copies every previously loaded row on
# each iteration, and concatenating onto an empty frame is deprecated in
# recent pandas versions.
per_file_frames = []

for fcs_file in fcs_files:
    meta, data = fcsparser.parse(fcs_file, meta_data_only=False, reformat_meta=True)
    # Rows are put in 'Time' order before the well labels are attached;
    # presumably event order then matches the index sorting location order
    # stored in the metadata -- TODO confirm.
    data = data.sort_values('Time')
    # Raises if the number of well labels differs from the number of events.
    data['well_position'] = get_well_positions(meta)

    plate, sample = get_plate_and_sample_from_filepath(fcs_file)
    data['plate'] = plate
    data['sample'] = sample

    per_file_frames.append(data)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# FCS files were found. The per-file index is kept as-is.
fcs_data = pd.concat(per_file_frames) if per_file_frames else pd.DataFrame()

fcs_data.head()

Unnamed: 0,FSC-A,FSC-H,SSC-A,SSC-H,CD16 FITC,CD56 PE,PI,Time,well_position,plate,sample,DAPI
0,66263.171875,57108.0,32199.511719,28850.0,82.390007,15556.860352,14887.620117,652.599976,A3,LCE660,NKC079,
1,78225.65625,63448.0,26731.810547,21565.0,48.150002,11841.389648,11141.459961,842.099976,A4,LCE660,NKC079,
2,69945.101562,58184.0,29535.210938,25270.0,28.890001,10533.600586,9749.520508,1024.099976,A5,LCE660,NKC079,
3,144794.984375,91762.0,85041.460938,52622.0,4847.100098,124.739998,448.470001,2347.100098,A6,LCE660,NKC079,
4,137626.03125,81047.0,84660.546875,47887.0,3654.050293,166.320007,344.519989,3364.399902,A7,LCE660,NKC079,


In [4]:
# Write every event row to a tab-separated file (index column omitted).
fcs_tsv_path = os.path.expanduser('~/Desktop/celseq_spreadsheets/fcs_data.tsv')
fcs_data.to_csv(fcs_tsv_path, sep='\t', index=False)

# Show each distinct plate/sample combination that was loaded.
fcs_data.loc[:, ['plate', 'sample']].drop_duplicates()

Unnamed: 0,plate,sample
0,LCE660,NKC079
0,LCE660,NKC ref 1 control
0,LCE661,NKC ref 1 control
0,LCE661,NKC079
0,LCE662,NKC084
0,LCE662,NKC ref 1 control
0,LCE663,NKC ref 1 control
0,LCE663,NKC084
0,LCE664,NKC084
0,LCE664,NKC ref 1 control


In [7]:
# Read the 'Samples' sheet of the workbook, skipping its first three rows.
filepath = os.path.expanduser('~/Desktop/celseq_spreadsheets/G287_JacquiDonaghue_SC.MB_S000454_SeqprimerNov23.xlsx')
sheet = pd.read_excel(filepath, skiprows=3, sheet_name='Samples')

# Keep only the text before the first '=' in each well position.
well_labels = sheet['Well position'].str.split('=').str[0]
sheet['Well position'] = well_labels
sheet.head()

Unnamed: 0,Plate#,Well position,Sample type,Sample,NK cell popn,Primer name,index sequence \n(as in C-RT1-primer),(separate index read),(separate index read).1,RT1 index primer sequences
0,LCE660,A1,removed,removed,removed,removed,removed,removed,removed,HPR control
1,LCE660,A2,removed,removed,removed,removed,removed,removed,removed,HPR control
2,LCE660,A3,Single cell,NKC079,"P4_CD56+, CD16-",99,GGTCTATG,RPI 1,ATCACG,CGATTGAGGCCGGTAATACGACTCACTATAGGGGTTCAGAGTTCTA...
3,LCE660,A4,Single cell,NKC079,"P4_CD56+, CD16-",100,GTCCGAAT,RPI 1,ATCACG,CGATTGAGGCCGGTAATACGACTCACTATAGGGGTTCAGAGTTCTA...
4,LCE660,A5,Single cell,NKC079,"P4_CD56+, CD16-",101,TAGTGCGT,RPI 1,ATCACG,CGATTGAGGCCGGTAATACGACTCACTATAGGGGTTCAGAGTTCTA...


In [8]:
# Left-join the cytometry events onto the sample sheet: every sheet row is
# kept, and rows without a matching plate/sample/well get NaN in the FCS columns.
merged = sheet.merge(
    fcs_data,
    how='left',
    left_on=['Plate#', 'Sample', 'Well position'],
    right_on=['plate', 'sample', 'well_position'],
)

# Save the joined table as a tab-separated file (index column omitted).
merged_tsv_path = os.path.expanduser('~/Desktop/celseq_spreadsheets/merged.tsv')
merged.to_csv(merged_tsv_path, sep='\t', index=False)
merged.head()

Unnamed: 0,Plate#,Well position,Sample type,Sample,NK cell popn,Primer name,index sequence \n(as in C-RT1-primer),(separate index read),(separate index read).1,RT1 index primer sequences,...,SSC-A,SSC-H,CD16 FITC,CD56 PE,PI,Time,well_position,plate,sample,DAPI
0,LCE660,A1,removed,removed,removed,removed,removed,removed,removed,HPR control,...,,,,,,,,,,
1,LCE660,A2,removed,removed,removed,removed,removed,removed,removed,HPR control,...,,,,,,,,,,
2,LCE660,A3,Single cell,NKC079,"P4_CD56+, CD16-",99,GGTCTATG,RPI 1,ATCACG,CGATTGAGGCCGGTAATACGACTCACTATAGGGGTTCAGAGTTCTA...,...,32199.511719,28850.0,82.390007,15556.860352,14887.620117,652.599976,A3,LCE660,NKC079,
3,LCE660,A4,Single cell,NKC079,"P4_CD56+, CD16-",100,GTCCGAAT,RPI 1,ATCACG,CGATTGAGGCCGGTAATACGACTCACTATAGGGGTTCAGAGTTCTA...,...,26731.810547,21565.0,48.150002,11841.389648,11141.459961,842.099976,A4,LCE660,NKC079,
4,LCE660,A5,Single cell,NKC079,"P4_CD56+, CD16-",101,TAGTGCGT,RPI 1,ATCACG,CGATTGAGGCCGGTAATACGACTCACTATAGGGGTTCAGAGTTCTA...,...,29535.210938,25270.0,28.890001,10533.600586,9749.520508,1024.099976,A5,LCE660,NKC079,
