# Week 4 - Patient Data Example

Use case: You have several files of data for each patient (one file per patient per cycle/time point) and you want to combine each patient's data into a single filtered file (one output file per patient).

In [62]:
from glob import glob
from csv import DictReader

In [63]:
# IDs of the patients to process; each ID is used as part of the filename search pattern
patientIDs = ["01001", "01010", "01020"]
# Directory that is searched for the input files.
# Keep the trailing "/" because the search pattern is built by joining strings directly.
dirpath = "./data/tmp/"

In [65]:
# Minimum VAF a row needs (strictly greater than) in order to be kept.
# Genes listed here have their own cut-off; every other gene uses DEFAULT_MIN_VAF.
GENE_MIN_VAF = {"PIK3CA": 5, "ESR1": 2}
DEFAULT_MIN_VAF = 10
# Columns written to the output file, in this order
OUT_COLUMNS = ["patientID", "cycle", "gene", "vaf"]

# Loop through the patient IDs
# (the loop variable is called patient_id, not id, so the built-in id() function is not overwritten)
for patient_id in patientIDs:
    # Generate the search path
    # Note that "*" stands for a wildcard (AKA anything)
    path = "".join([dirpath, "*", patient_id, "*"])
    # glob returns the matching files in no guaranteed order,
    # so sort them to get the same row order in the output on every run
    fns = sorted(glob(path))
    if not fns:
        # Nothing to combine for this patient, so move on to the next one
        print("No files found for patient:", patient_id)
        continue

    # Initialise an empty dictionary ONCE per patient.
    # The key is the column name and the value is a list of that column's values,
    # collected across ALL of the patient's files.
    data = dict()

    # Loop through the files relevant to each patient
    for fn in fns:
        print("Filename:", fn)
        # newline="" is what the csv module recommends when opening a file for reading
        with open(fn, "r", newline="") as infile:
            fh = DictReader(infile)
            # Iterate through the file and add each value to the list for its column.
            # setdefault creates the empty list the first time a column is seen and
            # otherwise returns the existing list, so values from earlier files are
            # kept rather than overwritten. A file with no data rows adds nothing.
            for line in fh:
                for k, v in line.items():
                    data.setdefault(k, []).append(v)

    # Initialise an empty list for the indexes of the rows we want to keep
    keep = list()
    # data.get("gene", []) gives an empty list (so nothing to loop over) if no rows were read
    for i, gene in enumerate(data.get("gene", [])):
        # Look up the cut-off for this gene, falling back to the default for any other gene
        min_vaf = GENE_MIN_VAF.get(gene, DEFAULT_MIN_VAF)
        if float(data["vaf"][i]) > min_vaf:
            keep.append(i)

    # Initialise an empty dictionary to store the results
    results = dict()
    # Note here the key is the column name and the value is a list
    for key, value in data.items():  # Iterate through each key, value
        results[key] = []  # Start every column off empty so it exists even when no rows are kept
        for i in keep:  # Iterate through the list of indexes that we want to keep
            results[key].append(value[i])

    # Write out the results dictionary to a file
    with open("".join(["patient_", patient_id, "_allFiltered.csv"]), "w") as outfile:
        outfile.write(",".join(OUT_COLUMNS) + "\n")  # Write out the header
        if keep:
            # zip pairs up the columns by position to give one tuple per row
            lines = list(zip(results["patientID"], results["cycle"], results["gene"], results["vaf"]))
        else:
            # No rows passed the filter: the file will contain the header only
            lines = []
        # Write out each row to the output file
        for line in lines:
            outfile.write(",".join(list(line)) + "\n")
                

./data/tmp/*01001*
['./data/tmp/patient_01001_C10D1.csv', './data/tmp/patient_01001_C1D1.csv', './data/tmp/patient_01001_C1D15.csv', './data/tmp/patient_01001_C2D1.csv', './data/tmp/patient_01001_C3D1.csv', './data/tmp/patient_01001_C4D1.csv', './data/tmp/patient_01001_C5D1.csv', './data/tmp/patient_01001_C6D1.csv', './data/tmp/patient_01001_C7D1.csv', './data/tmp/patient_01001_C8D1.csv', './data/tmp/patient_01001_C9D1.csv', './data/tmp/patient_01001_EOT.csv', './data/tmp/patient_01001_MNL.csv']
./data/tmp/*01010*
['./data/tmp/patient_01010_C1D1.csv', './data/tmp/patient_01010_C1D15.csv', './data/tmp/patient_01010_C2D1.csv', './data/tmp/patient_01010_C3D1.csv', './data/tmp/patient_01010_C4D1.csv', './data/tmp/patient_01010_C5D1.csv', './data/tmp/patient_01010_MNL.csv']
./data/tmp/*01020*
['./data/tmp/patient_01020_C1D1.csv', './data/tmp/patient_01020_C7D1.csv', './data/tmp/patient_01020_MNL.csv']


In [61]:
# zip pairs up items by position and stops at the end of the shortest input,
# so the 8 in the second list is dropped here.
# print is needed because a cell only displays the value of its last line.
print(list(zip([1,2,3], [5,6,7,8])))

# With inputs of the same length every item is paired up
list(zip(["a", "b", "c"], ["t", "s", "v"]))

[('a', 't'), ('b', 's'), ('c', 'v')]