<a href="https://colab.research.google.com/github/Small-Molecule-Discovery-Center/smdc_preprocess_data/blob/main/preprocess_pampa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocess PAMPA data
### How to use this notebook:
1. File --> Save a copy in Drive; open that copy to interact with the notebook
2. Click the folder icon on the left side
3. Drag the PAMPA file into the folder space
4. Right-click the PAMPA file and choose "Copy path"
5. Paste the file path into the `infile` variable in the second code cell
6. Edit the `source_plate` variable if necessary
7. Run all the cells (Shift+Enter or click the arrows)
8. DOWNLOAD the results_processed file - it is not saved to your system and will be deleted when you close the tab

In [None]:
import pandas as pd
import numpy as np
import os

### New variables to fill out for each plate run

In [None]:
# Path to the PAMPA results file to process (paste the path copied from the Colab file browser).
# The file name (text after the last '/') is also used to derive the destination plate barcode.
infile="/content/20230427_ZA_Plate1_03_Results_processed.csv"

# source plate barcode: if multiple, separate with semicolons like "OTPARP1-MA01;OTPARP2-MA01"
# This string is written unchanged into the 'Source plate' column of every row.
source_plate="OTPARP1-MA01"

# Donor well concentration in uM; written into the '[compound] uM' column for every row
# (the Theophylline control rows are overridden later in the notebook).
donor_well_conc=50

### Old code collapsed into one cell

In [None]:
# Destination plate barcode = the file name with its trailing results suffix removed.
# Only a suffix at the very end of the name is stripped; the first match wins.
file_name = infile.split('/')[-1]
destination_plate = file_name
for results_suffix in (
    '_Results_processed.csv',
    '_results.xlsx',
    '_Results.xlsx',
    '_results.xls',
    '_Results.xls',
):
    if file_name.endswith(results_suffix):
        destination_plate = file_name[:-len(results_suffix)]
        break
print('Destination plate:', destination_plate)

# path to what it will be saved as
# NOTE: a path containing neither '.xlsx' nor '.xls' comes back unchanged from these
# replaces, so for a CSV input the processed table is written back over the input file.
outfile = infile.replace('.xlsx', '_processed.csv').replace('.xls', '_processed.csv')
print(outfile)

# use pandas to read file
if infile.lower().endswith('.csv'):
    # CSV input: the first column holds the saved row index.
    pampa = pd.read_csv(infile, index_col=0)
else:
    try:
        # Excel export: the column names are on the second row of the sheet.
        pampa = pd.read_excel(infile, header=1)
    except Exception:
        # Not readable as Excel - fall back to CSV. `Exception` (rather than a bare
        # `except`) lets KeyboardInterrupt / SystemExit propagate.
        pampa = pd.read_csv(infile, index_col=0)

# get rid of empty rows (rows with no pI value), then reset the index for ease of next steps
pampa = pampa[pampa['pI'].notna()].reset_index(drop=True)

# fill the rows from Sample column that are empty with 'nolab'
sample_labels = pampa['Sample'].fillna('nolab')
# gut check - how many samples were measured - including 'nolab'?
print('Unique samples pre:', sample_labels.nunique())

# Replace every 'nolab' with the most recent previous true sample label:
# blank out the placeholder, then forward-fill down the column.
pampa['Sample'] = sample_labels.mask(sample_labels == 'nolab').ffill()

# Rows above the first real label have nothing to inherit and stay empty.
n_unlabelled = int(pampa['Sample'].isna().sum())
if n_unlabelled:
    print('WARNING: rows before the first sample label were left unlabelled:', n_unlabelled)

# gutcheck - is the number of samples 1 less than before, since all the 'nolabs' are gone?
print('Unique samples post:', pampa['Sample'].nunique())

# look at new data
pampa.head(12)

Destination plate: 20230427_ZA_Plate1_03
/content/20230427_ZA_Plate1_03_Results_processed.csv
Unique samples pre: 32
Unique samples post: 32


Unnamed: 0,Sample,Pe Well,P(10-6cm/s),-logPe,pI,BCS code,Comment,pH
0,DMSO,A1,undetected,,0.0,UND,,7.4
1,DMSO,A2,undetected,,0.0,UND,,7.4
2,DMSO,A3,undetected,,0.0,UND,,7.4
3,813048,A4,undetected,,0.0,UND,,7.4
4,813048,A5,undetected,,0.0,UND,,7.4
5,813048,A6,undetected,,0.0,UND,,7.4
6,1084034,A7,51.67,4.287,3.0,HIGH,,7.4
7,1084034,A8,39.311,4.405,3.0,HIGH,,7.4
8,1084034,A9,equilibrated,,1.0,HIGH,,7.4
9,1084039,A10,undetected,,0.0,UND,,7.4


### New code explained

In [None]:
# Switch the instrument's column headers to the plate-map names used from here on.
column_renames = {'Sample': 'SMDC_ID', 'Pe Well': 'Destination well'}
pampa = pampa.rename(columns=column_renames)

In [None]:
# add additional plate map columns

# Sample labels may be written as "<SMDC_ID>-<lot>". Split on the FIRST hyphen only and
# force exactly two result columns, so labels with no hyphen (Lot left empty) and labels
# with several hyphens are both handled without relying on an exception.
id_parts = (
    pampa['SMDC_ID']
    .astype(str)
    .str.split('-', n=1, expand=True)
    .reindex(columns=[0, 1])
)
pampa['SMDC_ID'] = id_parts[0]
# A non-numeric lot raises here instead of being silently dropped.
pampa['Lot'] = id_parts[1].astype(float)

# Constant plate-map metadata, identical for every row of this run.
pampa['Source plate'] = source_plate
pampa['Source well'] = np.nan
pampa['Destination plate'] = destination_plate
pampa['[compound] uM'] = donor_well_conc

In [None]:
# Distinct sample identifiers present at this stage (control names are still text here).
pampa['SMDC_ID'].unique()

array(['DMSO', '813048', '1084034', '1084039', 'Theophylline', '1084028',
       '116927', '1084040', 'Verapamil', '1084029', '1084035', '83777',
       'Corticosterone', '1084030', '53984', '58764', '51059', '1084031',
       '1084036', '1084041', '1084026', '1084032', '1084037', '1084042',
       '51680', '1084033', '1084038', '1084278', '1084027', '116935',
       '181763', '1084279'], dtype=object)

In [None]:
# Controls are labelled by name in the results file: map each name to its SMDC_ID
# and record its lot. DMSO has no SMDC_ID, so it maps to NaN.
ctrl_dict = {
    'Theophylline': 254802,
    "Verapamil": 131810,
    "Corticosterone": 1076478,
    'DMSO': np.nan,
}
ctrl_lot_dict = {
    'Theophylline': 2,
    "Verapamil": 13,
    "Corticosterone": 2,
}

# Theophylline rows get 250 in the concentration column instead of the plate-wide value.
# This (and the lot assignment below) must happen while SMDC_ID still holds the names.
theophylline_rows = pampa['SMDC_ID'] == 'Theophylline'
pampa.loc[theophylline_rows, '[compound] uM'] = 250

for control_name, control_lot in ctrl_lot_dict.items():
    pampa.loc[pampa['SMDC_ID'] == control_name, 'Lot'] = control_lot

# Swap the control names for their numeric IDs, then make the whole column numeric.
pampa = pampa.replace({'SMDC_ID': ctrl_dict})
pampa['SMDC_ID'] = pampa['SMDC_ID'].astype(float)

pampa['SMDC_ID'].unique()

array([     nan,  813048., 1084034., 1084039.,  254802., 1084028.,
        116927., 1084040.,  131810., 1084029., 1084035.,   83777.,
       1076478., 1084030.,   53984.,   58764.,   51059., 1084031.,
       1084036., 1084041., 1084026., 1084032., 1084037., 1084042.,
         51680., 1084033., 1084038., 1084278., 1084027.,  116935.,
        181763., 1084279.])

In [None]:
# Distinct lot values after the control lots have been filled in.
pampa['Lot'].unique()

array([nan,  2., 13.])

In [None]:
# fix Pe and notes column
pe_column = 'P(10-6cm/s)'

# Flag equilibrated wells in the BCS code first, while the text marker is still present.
equilibrated_rows = pampa[pe_column] == 'equilibrated'
pampa.loc[equilibrated_rows, 'BCS code'] = 'HIGH_EQ'

# Then blank out both text markers so only measured values remain in the Pe column.
pampa[pe_column] = pampa[pe_column].replace(['undetected', 'equilibrated'], '')

In [None]:
# Preview the first 12 rows of the finished table.
pampa.iloc[:12]

Unnamed: 0,SMDC_ID,Destination well,P(10-6cm/s),-logPe,pI,BCS code,Comment,pH,Lot,Source plate,Source well,Destination plate,[compound] uM
0,,A1,,,0.0,UND,,7.4,,OTPARP1-MA01,,20230427_ZA_Plate1_03,50
1,,A2,,,0.0,UND,,7.4,,OTPARP1-MA01,,20230427_ZA_Plate1_03,50
2,,A3,,,0.0,UND,,7.4,,OTPARP1-MA01,,20230427_ZA_Plate1_03,50
3,813048.0,A4,,,0.0,UND,,7.4,,OTPARP1-MA01,,20230427_ZA_Plate1_03,50
4,813048.0,A5,,,0.0,UND,,7.4,,OTPARP1-MA01,,20230427_ZA_Plate1_03,50
5,813048.0,A6,,,0.0,UND,,7.4,,OTPARP1-MA01,,20230427_ZA_Plate1_03,50
6,1084034.0,A7,51.67,4.287,3.0,HIGH,,7.4,,OTPARP1-MA01,,20230427_ZA_Plate1_03,50
7,1084034.0,A8,39.311,4.405,3.0,HIGH,,7.4,,OTPARP1-MA01,,20230427_ZA_Plate1_03,50
8,1084034.0,A9,,,1.0,HIGH_EQ,,7.4,,OTPARP1-MA01,,20230427_ZA_Plate1_03,50
9,1084039.0,A10,,,0.0,UND,,7.4,,OTPARP1-MA01,,20230427_ZA_Plate1_03,50


In [None]:
# Write the processed table to `outfile` as CSV; the row index is written as the first column.
pampa.to_csv(outfile, index=True)