In [94]:
import pandas as pd
import pathlib
import re
import numpy as np

In [95]:
def find_training_frames(traingset_path, training_plate, training_well_num) -> list:
    frames = []

    with open(traingset_path) as labels_file:
        for line in labels_file:
            if ".tif" in line:  # look at lines with plates/wells
                image_details = line.split("--")
                plate = image_details[0]
                well_num = int(image_details[1].replace("W0", ""))
                # time of frame in minutes (frames captured 30 min apart)
                time = int(image_details[3].replace("T", ""))
                frame = int(time / 30 + 1)

                if plate == training_plate and well_num == training_well_num:
                    frames.append(str(frame))

    return frames


# Read plates listed in features dataset to figure out which wells from which plates have labeled data
# Save these training locations into a file
def save_training_wells(traingset_path, annotations_path, save_path):
    annotations = pd.read_csv(annotations_path, low_memory=False)
    # remove annotations for plates that are missing
    annotations = annotations.loc[annotations["Plate Issues"] != "plate missing"]
    training_data = pd.DataFrame()

    with open(traingset_path) as labels_file:
        for line in labels_file:
            if ".tif" in line:  # look at lines with plates/wells
                image_details = line.split("--")
                plate = image_details[0]
                well_num = int(image_details[1].replace("W0", ""))
                # time of frame in minutes (frames captured 30 min apart)
                time = int(image_details[3].replace("T", ""))
                frame = int(time / 30 + 1)

                frames = find_training_frames(traingset_path, plate, well_num)
                frames = ",".join(frames)
                image_annotations = annotations.loc[
                    (plate == annotations["Plate"])
                    & (annotations["Well Number"] == well_num)
                ]
                try:
                    gene = image_annotations.iloc[0]["Original Gene Target"]
                except IndexError:
                    print(f"Image from {plate}, {well_num} not in IDR")

                frame_details = pd.DataFrame(
                    {
                        "Plate": [plate],
                        "Well Number": [well_num],
                        "Frames": [frames],
                        "Original Gene Target": [gene],
                    }
                )

                if training_data.empty:
                    training_data = frame_details
                else:
                    # see if this well has already been added to training data
                    if not (
                        plate in training_data["Plate"].unique()
                        and well_num in training_data["Well Number"].unique()
                    ):
                        training_data = pd.concat([training_data, frame_details])

    # negative controls correspond to nan genes
    training_data["Original Gene Target"] = training_data[
        "Original Gene Target"
    ].replace(np.NaN, "negative control")
    training_data.to_csv(save_path, sep="\t")
    return training_data


features_path = "trainingset_2007_06_21.dat"
save_path = "training_locations.tsv"
annotations_path = "idr0013-screenA-annotation.csv.gz"
training_data = save_training_wells(features_path, annotations_path, save_path)
training_data

Image from LT0002_17, 36 not in IDR
Image from LT0155_02, 24 not in IDR
Image from LT0155_02, 24 not in IDR
Image from LT0078_27, 244 not in IDR
Image from LT0002_17, 36 not in IDR
Image from LT0155_02, 24 not in IDR
Image from LT0002_17, 36 not in IDR
Image from LT0002_17, 36 not in IDR
Image from LT0155_02, 24 not in IDR
Image from LT0111_45, 53 not in IDR
Image from LT0002_17, 36 not in IDR
Image from LT0002_17, 36 not in IDR
Image from LT0111_45, 53 not in IDR
Image from LT0111_45, 53 not in IDR
Image from LT0078_27, 244 not in IDR
Image from LT0111_45, 53 not in IDR
Image from LT0078_27, 244 not in IDR
Image from LT0002_17, 36 not in IDR
Image from LT0111_45, 53 not in IDR
Image from LT0111_45, 53 not in IDR
Image from LT0155_02, 24 not in IDR
Image from LT0002_17, 36 not in IDR


Unnamed: 0,Plate,Well Number,Frames,Original Gene Target
0,LT0098_13,21,77212733,ENSG00000110675
0,LT0132_31,53,337537425163,ENSG00000123416
0,LT0093_17,114,45724688837987,ENSG00000174442
0,LT0094_04,319,76661897359330,ENSG00000177426
0,LT0002_17,36,5741332870275236,ENSG00000177426
0,LT0026_22,258,8537259567,LCK
0,LT0155_02,24,6951768793,LCK
0,LT0157_04,5,471778261,ENSG00000175216
0,LT0038_27,250,8,BUB1B
0,LT0084_46,3,46809156,ENSG00000138180
