In [1]:
import pandas as pd
from IPython.display import display, Markdown
# Load the derivatives export and report its size and column names.
# NOTE(review): the path below is an absolute path on a mounted volume, so the
# notebook only runs where that volume is available.
csv_path = "/Volumes/LabExMI/Users/Nolwenn/FreezerPro/DataToImport/Supernatants_Derivatives_20161220.csv"
df = pd.read_csv(csv_path)

# One CSV row per tube, so the row count is the tube count.
tube_count = len(df)
display(Markdown("**%s** tubes in our derivatives file." % tube_count))

# Render the column names as a markdown numbered list.
column_names = df.columns
display(Markdown("List of the *%d* columns:" % len(column_names)))
column_items = ["1. "+col for col in column_names]
display(Markdown(";\n".join(column_items)+"."))

**10000** tubes in our derivatives file.

List of the *33* columns:

1. ParentID;
1. Name;
1. BARCODE;
1. Position;
1. Position.1;
1. Volume;
1. Freezer;
1. Freezer_Descr;
1. Level1;
1. Level1_Descr;
1. Level2;
1. Level2_Descr;
1. Level3;
1. Level3_Descr;
1. BoxType;
1. Box;
1. Box_Descr;
1. ThermoBoxBarcode;
1. BOX_BARCODE;
1. CreationDate;
1. UpdateDate;
1. AliquotID;
1. DonorID;
1. StimulusID;
1. StimulusName;
1. VisitID;
1. ThawCycle;
1. Sample Source;
1. Description;
1. BatchID;
1. ShelfBarcode;
1. RackBarcode;
1. DrawerBarcode.

We have 10,000 tubes, as expected. We want to know how many StimulusID are associated with each donor:

In [2]:
# Number of distinct StimulusID values per donor.
# nunique() is used instead of count(): count() tallies non-null tube rows, so
# a donor with one stimulus duplicated and another one missing would still
# total 10 and be reported as having "10 unique StimulusID".
countstimperdonor = pd.DataFrame(df.groupby("DonorID")["StimulusID"].nunique())

# Distinct per-donor totals found across all donors (ideally a single value).
stimcounts = sorted(countstimperdonor["StimulusID"].unique())
display(Markdown("List of **%d** count StimulusID:" % len(stimcounts)))
display(Markdown(";\n".join(["* "+str(int(stim))+" unique StimulusID" for stim in stimcounts])+"."))

List of **1** count StimulusID:

* 10 unique StimulusID.

We want to know whether at least one donor has fewer than 10 StimulusID associated and, if possible, which StimulusID is missing.

In [3]:
# Donors whose per-donor StimulusID total is below the expected 10.
stimtotal = countstimperdonor["StimulusID"]
donorindexes = countstimperdonor[stimtotal < 10].index.values

if len(donorindexes) == 0:
    # Nobody is below 10; also report how many donors are above 10.
    display(Markdown("All donor are assigned to at least 10 StimulusID."))
    overassigned = countstimperdonor[stimtotal > 10].index.values
    display(Markdown("%d unique DonorID has more thant 10 StimulusID." % len(overassigned)))
else:
    # List the under-assigned donors in ascending DonorID order.
    display(Markdown("List of **%d** unique DonorID:" % len(donorindexes)))
    donoritems = [" - "+str(int(donor)) for donor in sorted(donorindexes)]
    display(Markdown(";\n".join(donoritems)+"."))

All donor are assigned to at least 10 StimulusID.

0 unique DonorID has more thant 10 StimulusID.

In [4]:
# Every StimulusID present in the file versus those recorded for donor 75.
stimlist = df["StimulusID"].unique()
isdonor75 = df["DonorID"] == 75.0
stimlistdonor75 = df.loc[isdonor75, "StimulusID"].unique()

# StimulusID values that exist in the file but never for donor 75.
stimnotfound = list(set(stimlist) - set(stimlistdonor75))

header = "List of **%d** stimulus not found for donor number 75:" % len(stimnotfound)
display(Markdown(header))
stimitems = ["* "+str(int(stim)) for stim in sorted(stimnotfound)]
display(Markdown(";\n".join(stimitems)+"."))

List of **1** stimulus not found for donor number 75:

* 19.

Donor number 75 has no tube for StimulusID 19. We expected to have the following 10 StimulusID:
1. 11;
1. 17;
1. 18;
1. 23;
1. 24;
1. 27;
1. 32;
1. 35;
1. 37;
1. 39.

We want to know how many donors are assigned to each stimulus, to understand why the full list of StimulusID contains the extra ID **19**.

In [5]:
# Number of tube rows per StimulusID (count() skips rows whose DonorID is missing).
countdonorperstim = pd.DataFrame(df.groupby("StimulusID")["DonorID"].count())

# Copy the group keys into a regular integer column.
# Index.get_values() was deprecated in pandas 0.25 and removed in 1.0;
# .values returns the same array and works on old and new versions alike.
countdonorperstim["StimulusID"] = countdonorperstim.index.values.astype(int)

# Replace the StimulusID index with a plain 0..n-1 index (no in-place mutation).
countdonorperstim = countdonorperstim.reset_index(drop=True)
display(countdonorperstim[["StimulusID", "DonorID"]])

Unnamed: 0,StimulusID,DonorID
0,11,1000
1,17,1000
2,18,1000
3,19,1
4,23,999
5,24,1000
6,27,1000
7,32,1000
8,35,1000
9,37,1000


For StimulusID 19, one donor is assigned. First, we want to know the donor assigned to stimulus 19:

In [6]:
# DonorID of every tube recorded with StimulusID 19.
getdonor = df.loc[df["StimulusID"] == 19.0, "DonorID"]

# Format the distinct donor IDs explicitly rather than passing the Series to
# "%d": that implicit Series-to-int conversion only works when the selection
# holds exactly one row, and raises when there are none or several.
donorids = [str(int(donor)) for donor in sorted(getdonor.dropna().unique())]
if donorids:
    display(Markdown("DonorID **%s** is assigned to Stimulus ID 19." % ", ".join(donorids)))
else:
    display(Markdown("No DonorID is assigned to Stimulus ID 19."))

DonorID **819** is assigned to Stimulus ID 19.

For donor *819*, the problem is known. In fact, this donor is supposed to be assigned to stimulus 23, but it seems that the stimuli were mixed up for this donor:
* in box 23, the tube found for donor 819 should have been assigned to box 24
* in box 24, the tube found for donor 819 should have been assigned to box 17
* in box 17, the tube found for donor 819 should have been assigned to box 18
* in box 18, the tube found for donor 819 should have been assigned to box 19

(**TODO**: ask Céline to confirm that this is correct.)

For stimulus 23, we want to know which are the missing donors:

In [7]:
# Every DonorID present in the file versus those having a StimulusID 23 tube.
donorlist = df["DonorID"].unique()
isstim23 = df["StimulusID"] == 23.0
donorliststim23 = df.loc[isstim23, "DonorID"].unique()

# Donors that appear in the file but never with StimulusID 23.
donornotfound = list(set(donorlist) - set(donorliststim23))

header = "List of **%d** donors not found for StimulusID 23:" % len(donornotfound)
display(Markdown(header))
donoritems = ["* "+str(int(donor)) for donor in sorted(donornotfound)]
display(Markdown(";\n".join(donoritems)))

List of **1** donors not found for StimulusID 23:

* 819

This confirms that the tube of DonorID 819 in StimulusID 19 is probably assigned to the wrong stimulus and should have been assigned to stimulus 23.

Now we want to make sure that we have one stimulus per box:

In [8]:
# Check that each box holds tubes of a single StimulusID.
display(Markdown("There is **%d unique boxes** in the file." % len(df["Box"].unique())))

# Number of distinct StimulusID values found in each box.
countboxstim = pd.DataFrame(df.groupby("Box")["StimulusID"].nunique())

# Copy the box names out of the index into a regular column.
# Index.get_values() was deprecated in pandas 0.25 and removed in 1.0;
# .values returns the same array and works on old and new versions alike.
countboxstim["Box"] = countboxstim.index.values
countboxstim = countboxstim.reset_index(drop=True)

# Only boxes mixing several StimulusID are of interest.
display(countboxstim.loc[countboxstim["StimulusID"] > 1, ["Box", "StimulusID"]])

There is **110 unique boxes** in the file.

Unnamed: 0,Box,StimulusID
19,MIC_Plasma_S17_V1_A1_F1_D801-896,2
30,MIC_Plasma_S18_V1_A1_F1_D801-896,2
41,MIC_Plasma_S23_V1_A1_F1_D801-896,2
52,MIC_Plasma_S24_V1_A1_F1_D801-896,2


We have 4 boxes to which 2 different StimulusID are assigned; apparently those boxes are in the donor range that contains DonorID 819. What are the StimulusID for each box?

In [9]:
# Names of the boxes that contain more than one StimulusID.
hasmixedstims = countboxstim["StimulusID"] > 1
boxes = countboxstim.loc[hasmixedstims, "Box"]
display(Markdown("List of StimulusID for the **%d** boxes to check:" % len(boxes)))

for box in boxes:
    # Distinct StimulusID values in this box, in order of first appearance.
    boxstims = df.loc[df["Box"] == box, "StimulusID"].unique()
    display(Markdown("* "+box+(" contains %d StimulusID" % len(boxstims))+":"))
    for stim in boxstims:
        display(Markdown("       * "+str(int(stim))))

List of StimulusID for the **4** boxes to check:

* MIC_Plasma_S17_V1_A1_F1_D801-896 contains 2 StimulusID:

       * 17

       * 18

* MIC_Plasma_S18_V1_A1_F1_D801-896 contains 2 StimulusID:

       * 18

       * 19

* MIC_Plasma_S23_V1_A1_F1_D801-896 contains 2 StimulusID:

       * 23

       * 24

* MIC_Plasma_S24_V1_A1_F1_D801-896 contains 2 StimulusID:

       * 24

       * 17

The impacted boxes seem to be the ones that Céline already found. Can we find DonorID 819 under those StimulusID?

In [10]:
# StimulusID values (as integers) seen in any of the boxes under investigation.
getstims = df.loc[df["Box"].isin(boxes), "StimulusID"].astype(int).unique()

# Row mask for donor 819; it does not depend on the box or the stimulus.
isdonor819 = df["DonorID"] == 819

for box in boxes:
    inbox = df["Box"] == box
    for stim in getstims:
        # Tubes of donor 819 stored in this box under this StimulusID.
        matches = df.loc[inbox & (df["StimulusID"] == stim) & isdonor819, "DonorID"]
        if not matches.empty:
            display(Markdown('* DonorID 819 found in box *%s*, StimulusID **%d**.' % (box,stim)))

* DonorID 819 found in box *MIC_Plasma_S17_V1_A1_F1_D801-896*, StimulusID **18**.

* DonorID 819 found in box *MIC_Plasma_S18_V1_A1_F1_D801-896*, StimulusID **19**.

* DonorID 819 found in box *MIC_Plasma_S23_V1_A1_F1_D801-896*, StimulusID **24**.

* DonorID 819 found in box *MIC_Plasma_S24_V1_A1_F1_D801-896*, StimulusID **17**.

Apparently, the problem described by Céline exists for the 4 boxes. Do we have to change them? Normally they are supposed to have been changed already. Is there a problem in the script that generated the data?

We also want to make sure that the same error shows up when we look at the Thermo Fisher box barcode instead of the box name:

In [11]:
# Same one-stimulus-per-box check, keyed on ThermoBoxBarcode instead of Box.
display(Markdown("There is **%d unique boxes** in the file." % len(df["ThermoBoxBarcode"].unique())))

# Number of distinct StimulusID values found under each Thermo box barcode.
countthermoboxstim = pd.DataFrame(df.groupby("ThermoBoxBarcode")["StimulusID"].nunique())

# Copy the barcodes out of the index into a regular column.
# Index.get_values() was deprecated in pandas 0.25 and removed in 1.0;
# .values returns the same array and works on old and new versions alike.
countthermoboxstim["ThermoBoxBarcode"] = countthermoboxstim.index.values
countthermoboxstim = countthermoboxstim.reset_index(drop=True)

# Only barcodes mixing several StimulusID are of interest.
display(countthermoboxstim.loc[countthermoboxstim["StimulusID"] > 1, ["ThermoBoxBarcode", "StimulusID"]])

There is **110 unique boxes** in the file.

Unnamed: 0,ThermoBoxBarcode,StimulusID
35,TF00080640,2
75,TS00010678,2
95,TS00047751,2
103,TS00048039,2


When we looked at the Box column, we had 110 unique boxes. When we look at ThermoBoxBarcode, we also have 110 boxes (see the outputs above). We will be interested to know how many Box each ThermoBoxBarcode covers.
We wonder whether the same problem with StimulusID that we saw for the Box column also occurs with the ThermoBoxBarcode column:

In [12]:
# Thermo box barcodes under which more than one StimulusID is recorded.
thermoboxes = countthermoboxstim.loc[countthermoboxstim["StimulusID"] > 1, "ThermoBoxBarcode"]
display(Markdown("List of StimulusID for the **%d** boxes to check:" % len(thermoboxes)))

for thermobox in thermoboxes:
    # Distinct StimulusID values under this barcode, in order of first appearance.
    thermoboxstims = df.loc[df["ThermoBoxBarcode"] == thermobox, "StimulusID"].unique()
    display(Markdown("* "+thermobox+(" contains %d StimulusID" % len(thermoboxstims))+":"))
    for stim in thermoboxstims:
        # The bullet marker needs a space after "*", as in the equivalent
        # listing for the Box column; without it the text reads "*18".
        display(Markdown("       * "+str(int(stim))))

List of StimulusID for the **4** boxes to check:

* TF00080640 contains 2 StimulusID:

       *18

       *19

* TS00010678 contains 2 StimulusID:

       *17

       *18

* TS00047751 contains 2 StimulusID:

       *23

       *24

* TS00048039 contains 2 StimulusID:

       *24

       *17

The same lists of StimulusID appear for the ThermoBoxBarcode column. Do we get the same StimulusID when we look specifically at DonorID 819?

In [13]:
# StimulusID values (as integers) seen under any of the flagged Thermo barcodes.
getstims = df.loc[df["ThermoBoxBarcode"].isin(thermoboxes), "StimulusID"].astype(int).unique()

# Row mask for donor 819; it does not depend on the barcode or the stimulus.
isdonor819 = df["DonorID"] == 819

for thermobox in thermoboxes:
    inthermobox = df["ThermoBoxBarcode"] == thermobox
    for stim in getstims:
        # Tubes of donor 819 stored under this barcode with this StimulusID.
        matches = df.loc[inthermobox & (df["StimulusID"] == stim) & isdonor819, "DonorID"]
        if not matches.empty:
            display(Markdown('* DonorID 819 found in box *%s*, StimulusID **%d**.' % (thermobox,stim)))

* DonorID 819 found in box *TF00080640*, StimulusID **19**.

* DonorID 819 found in box *TS00010678*, StimulusID **18**.

* DonorID 819 found in box *TS00047751*, StimulusID **24**.

* DonorID 819 found in box *TS00048039*, StimulusID **17**.

The results are in accordance with analysis using Box column.

For the 4 Thermo Fisher boxes that are assigned to more than one StimulusID, we want to know the list of related Box values:

In [14]:
# For each flagged Thermo barcode, show the Box name(s) recorded with it.
for thermobox in thermoboxes:
    relatedboxes = df.loc[df["ThermoBoxBarcode"] == thermobox, "Box"].unique()
    display(Markdown(thermobox+" -> "+", ".join(list(relatedboxes))))

TF00080640 -> MIC_Plasma_S18_V1_A1_F1_D801-896

TS00010678 -> MIC_Plasma_S17_V1_A1_F1_D801-896

TS00047751 -> MIC_Plasma_S23_V1_A1_F1_D801-896

TS00048039 -> MIC_Plasma_S24_V1_A1_F1_D801-896

For the boxes assigned to more than one StimulusID, it looks ok.

For each Box, how many ThermoBoxBarcode values are associated? Which boxes have none, or more than one?

In [15]:
# Number of distinct ThermoBoxBarcode values recorded for each Box
# (nunique() ignores missing barcodes, so a box without any barcode gets 0).
countboxperthermobox = pd.DataFrame(df.groupby("Box")["ThermoBoxBarcode"].nunique())

# Copy the box names out of the index into a regular column.
# Index.get_values() was deprecated in pandas 0.25 and removed in 1.0;
# .values returns the same array and works on old and new versions alike.
countboxperthermobox["Box"] = countboxperthermobox.index.values
countboxperthermobox = countboxperthermobox.reset_index(drop=True)

# Per-box barcode count, reused by every summary line below.
barcodecount = countboxperthermobox["ThermoBoxBarcode"]
display(Markdown("**%d** boxes are not assigned to a ThermoBoxBarcode" % int((barcodecount < 1).sum())))
display(Markdown("**%d** boxes are assigned to more than one ThermoBoxBarcode" % int((barcodecount > 1).sum())))
display(Markdown("**%d** boxes are assigned to a ThermoBoxBarcode" % int((barcodecount == 1).sum())))

# Show the offending boxes only when there are any.
unexpectedboxes = countboxperthermobox.loc[barcodecount != 1, ["Box", "ThermoBoxBarcode"]]
if len(unexpectedboxes) > 0:
    display(unexpectedboxes)

**0** boxes are not assigned to a ThermoBoxBarcode

**0** boxes are assigned to more than one ThermoBoxBarcode

**110** boxes are assigned to a ThermoBoxBarcode

Do these boxes contain info on DonorID?

In [16]:
# Boxes whose number of distinct ThermoBoxBarcode values is not exactly one.
notsinglebarcode = countboxperthermobox["ThermoBoxBarcode"] != 1
boxes = countboxperthermobox.loc[notsinglebarcode, "Box"].values

# Tubes stored in those boxes, with the columns needed to identify them.
tubesinboxes = df.loc[df["Box"].isin(boxes), ["Box", "ThermoBoxBarcode", "DonorID"]]
display(Markdown("**%d** tubes in boxes not assigned to ThermoBoxbarcode." % len(tubesinboxes)))
if len(tubesinboxes) > 0:
    display(tubesinboxes)

**0** tubes in boxes not assigned to ThermoBoxbarcode.

The list of DonorID found corresponded to tubes that generated errors and that were corrected by Céline. The ThermoBoxBarcode column was not taken into account in the previous version of the script; now it is fixed.

We also want to check that none of the excluded donors are in our data:

In [17]:
# DonorID values that must not appear in the final data.
excludeddonors = [96, 104, 122, 167, 178, 219, 268, 279, 303, 308, 534, 701]

# NOTE: this casts DonorID to integer on the shared frame, so every later cell
# sees integer donor IDs.
df["DonorID"] = df["DonorID"].astype(int)

# Distinct excluded donors that are nevertheless present in the file.
foundexcluded = df.loc[df["DonorID"].isin(excludeddonors), "DonorID"].unique()
display(Markdown("**%d** donor found." % len(foundexcluded)))
if len(foundexcluded) > 0:
    display(Markdown("The excluded donor found are:"))
    display(Markdown(";\n".join(["* "+str(donor) for donor in foundexcluded])))

**0** donor found.

We have none of the excluded donors in the final output of our data.

According to Céline, the missing donor for StimulusID should be in the run file, and it should be assigned to Thermo Fisher box barcode TF00080651, at position G3. In the run file, there is a tube. We have to check the donors from this ThermoBoxBarcode, at well row G. From Atlas, in the run Excel file, the tube is set as 'No read', but on the computer next to the TECAN, the barcode exists. After adding the correct information, we have these data:

In [18]:
# Tubes of Thermo box TF00080651 whose Position text contains "G", ordered by donor.
isrowg = (df["ThermoBoxBarcode"] == "TF00080651") & (df["Position"].str.contains("G"))
rowgtubes = df.loc[isrowg, ["Box", "ThermoBoxBarcode", "Position", "DonorID"]].sort_values(["DonorID"])

display(Markdown("**%d** tubes found for ThermoBoxBarcode TF00080651:" % len(rowgtubes)))
display(rowgtubes)

**12** tubes found for ThermoBoxBarcode TF00080651:

Unnamed: 0,Box,ThermoBoxBarcode,Position,DonorID
3017,MIC_Plasma_S23_V1_A1_F1_D1-96,TF00080651,G / 1,73
3012,MIC_Plasma_S23_V1_A1_F1_D1-96,TF00080651,G / 2,74
3019,MIC_Plasma_S23_V1_A1_F1_D1-96,TF00080651,G / 3,75
2988,MIC_Plasma_S23_V1_A1_F1_D1-96,TF00080651,G / 4,76
3074,MIC_Plasma_S23_V1_A1_F1_D1-96,TF00080651,G / 5,77
3029,MIC_Plasma_S23_V1_A1_F1_D1-96,TF00080651,G / 6,78
3030,MIC_Plasma_S23_V1_A1_F1_D1-96,TF00080651,G / 7,79
3060,MIC_Plasma_S23_V1_A1_F1_D1-96,TF00080651,G / 8,80
3004,MIC_Plasma_S23_V1_A1_F1_D1-96,TF00080651,G / 9,81
3051,MIC_Plasma_S23_V1_A1_F1_D1-96,TF00080651,G / 10,82


We are supposed to have 11 boxes per stimulus. How many boxes do we actually have per StimulusID?

In [19]:
# Number of distinct boxes that hold tubes of each StimulusID.
countboxperstim = pd.DataFrame(df.groupby("StimulusID")["Box"].nunique())

# Copy the group keys out of the index into a regular column.
# Index.get_values() was deprecated in pandas 0.25 and removed in 1.0;
# .values returns the same array and works on old and new versions alike.
countboxperstim["StimulusID"] = countboxperstim.index.values
countboxperstim = countboxperstim.reset_index(drop=True)
display(countboxperstim[["Box", "StimulusID"]])

Unnamed: 0,Box,StimulusID
0,11,11.0
1,12,17.0
2,12,18.0
3,1,19.0
4,11,23.0
5,12,24.0
6,11,27.0
7,11,32.0
8,11,35.0
9,11,37.0


The extra box for StimulusID 19 is expected, as we still have not corrected DonorID 819 for this stimulus. The extra boxes for StimulusID 17, 18, and 24 are also expected. What are these boxes?

In [20]:
# StimulusID values spread over more than the expected 11 boxes.
stims = countboxperstim.loc[countboxperstim["Box"] > 11, "StimulusID"].values.tolist()

# Distinct boxes touched by any of those stimuli; the listing is only shown
# when that combined set is itself larger than 11.
flaggedboxes = df.loc[df["StimulusID"].isin(stims), "Box"].unique()
if len(flaggedboxes) > 11:
    for stim in stims:
        display(Markdown("List of boxes for **StimulusID %d**:" % stim))
        stimboxes = df.loc[df["StimulusID"] == stim, "Box"].unique()
        boxitems = ["* "+str(box)+", StimulusID "+str(int(stim)) for box in stimboxes]
        display(Markdown(";\n".join(boxitems)+"."))

List of boxes for **StimulusID 17**:

* MIC_Plasma_S17_V1_A1_F1_D1-96, StimulusID 17;
* MIC_Plasma_S17_V1_A1_F1_D101-196, StimulusID 17;
* MIC_Plasma_S17_V1_A1_F1_D201-296, StimulusID 17;
* MIC_Plasma_S17_V1_A1_F1_D301-396, StimulusID 17;
* MIC_Plasma_S17_V1_A1_F1_D401-496, StimulusID 17;
* MIC_Plasma_S17_V1_A1_F1_D501-596, StimulusID 17;
* MIC_Plasma_S17_V1_A1_F1_D601-696, StimulusID 17;
* MIC_Plasma_S17_V1_A1_F1_D701-796, StimulusID 17;
* MIC_Plasma_S17_V1_A1_F1_D801-896, StimulusID 17;
* MIC_Plasma_S17_V1_A1_F1_D901-996, StimulusID 17;
* MIC_Plasma_S17_V1_A1_F1_DX97-X00, StimulusID 17;
* MIC_Plasma_S24_V1_A1_F1_D801-896, StimulusID 17.

List of boxes for **StimulusID 18**:

* MIC_Plasma_S17_V1_A1_F1_D801-896, StimulusID 18;
* MIC_Plasma_S18_V1_A1_F1_D1-96, StimulusID 18;
* MIC_Plasma_S18_V1_A1_F1_D101-196, StimulusID 18;
* MIC_Plasma_S18_V1_A1_F1_D201-296, StimulusID 18;
* MIC_Plasma_S18_V1_A1_F1_D301-396, StimulusID 18;
* MIC_Plasma_S18_V1_A1_F1_D401-496, StimulusID 18;
* MIC_Plasma_S18_V1_A1_F1_D501-596, StimulusID 18;
* MIC_Plasma_S18_V1_A1_F1_D601-696, StimulusID 18;
* MIC_Plasma_S18_V1_A1_F1_D701-796, StimulusID 18;
* MIC_Plasma_S18_V1_A1_F1_D801-896, StimulusID 18;
* MIC_Plasma_S18_V1_A1_F1_D901-996, StimulusID 18;
* MIC_Plasma_S18_V1_A1_F1_DX97-X00, StimulusID 18.

List of boxes for **StimulusID 24**:

* MIC_Plasma_S23_V1_A1_F1_D801-896, StimulusID 24;
* MIC_Plasma_S24_V1_A1_F1_D1-96, StimulusID 24;
* MIC_Plasma_S24_V1_A1_F1_D101-196, StimulusID 24;
* MIC_Plasma_S24_V1_A1_F1_D201-296, StimulusID 24;
* MIC_Plasma_S24_V1_A1_F1_D301-396, StimulusID 24;
* MIC_Plasma_S24_V1_A1_F1_D401-496, StimulusID 24;
* MIC_Plasma_S24_V1_A1_F1_D501-596, StimulusID 24;
* MIC_Plasma_S24_V1_A1_F1_D601-696, StimulusID 24;
* MIC_Plasma_S24_V1_A1_F1_D701-796, StimulusID 24;
* MIC_Plasma_S24_V1_A1_F1_D801-896, StimulusID 24;
* MIC_Plasma_S24_V1_A1_F1_D901-996, StimulusID 24;
* MIC_Plasma_S24_V1_A1_F1_DX97-X00, StimulusID 24.

List of donors in unexpected boxes:

In [25]:
# NOTE(review): `boxes` is assigned here but not read by the rest of this cell,
# and it does not include the S18 box that is checked below.
boxes = ['MIC_Plasma_S24_V1_A1_F1_D801-896', 'MIC_Plasma_S17_V1_A1_F1_D801-896', 'MIC_Plasma_S23_V1_A1_F1_D801-896']

# Each box paired with the StimulusID compared against below (it matches the
# S<nn> part of the box name).
expectedstimperbox = [
    ('MIC_Plasma_S17_V1_A1_F1_D801-896', 17.0),
    ('MIC_Plasma_S18_V1_A1_F1_D801-896', 18.0),
    ('MIC_Plasma_S24_V1_A1_F1_D801-896', 24.0),
    ('MIC_Plasma_S23_V1_A1_F1_D801-896', 23.0),
]

# For every box, show the tubes whose StimulusID differs from the expected one.
for boxname, expectedstim in expectedstimperbox:
    ismisplaced = (df["Box"] == boxname) & (df["StimulusID"] != expectedstim)
    display(df.loc[ismisplaced, ["Box", "StimulusID", "DonorID"]])

Unnamed: 0,Box,StimulusID,DonorID
1837,MIC_Plasma_S17_V1_A1_F1_D801-896,18.0,819


Unnamed: 0,Box,StimulusID,DonorID
2775,MIC_Plasma_S18_V1_A1_F1_D801-896,19.0,819


Unnamed: 0,Box,StimulusID,DonorID
4775,MIC_Plasma_S24_V1_A1_F1_D801-896,17.0,819


Unnamed: 0,Box,StimulusID,DonorID
3821,MIC_Plasma_S23_V1_A1_F1_D801-896,24.0,819


These results agree with the observations made by Céline. These tubes will have to be changed manually, directly in the output file.