In [1]:
import pathlib

import pandas as pd

In [2]:
# set path to data
valid_data_file_path = pathlib.Path(
    "./raw/nELISA - Luminex comp in LPS stimulated PBMCs.xlsx"
).resolve()

# open the workbook a single time; the context manager closes the underlying
# file handle once both sheets have been parsed
with pd.ExcelFile(valid_data_file_path) as xls:
    # get sheet names
    sheet_names = xls.sheet_names
    print(sheet_names)
    # parse each assay's sheet from the already-open workbook instead of
    # re-opening the file by path for every read
    nELISA = pd.read_excel(xls, sheet_name="nELISA_pgmL")
    Luminex = pd.read_excel(xls, sheet_name="xMAP_pgmL")

['xMAP_pgmL', 'nELISA_pgmL']


In [3]:
# show the column labels of both tables so naming differences are visible
for assay_table in (nELISA, Luminex):
    print(assay_table.columns)

# give the Luminex analytes the same spaced spelling that the nELISA table uses
luminex_column_fixes = {"IFNgamma": "IFN gamma", "TNFalpha": "TNF alpha"}
Luminex = Luminex.rename(columns=luminex_column_fixes)

Index(['LPS concentration', 'CCL2', 'GM-CSF', 'IFN gamma', 'IL-1 alpha',
       'IL-1 beta', 'IL-12 p40', 'TNF alpha'],
      dtype='object')
Index(['LPS concentration', 'CCL2', 'GM-CSF', 'IFNgamma', 'IL-1 alpha',
       'IL-1 beta', 'IL-12 p40', 'TNFalpha'],
      dtype='object')


In [4]:
# add a column to each dataframe to indicate the type of data
nELISA["data_type"] = "nELISA"
Luminex["data_type"] = "Luminex"

# combine the two dataframes; ignore_index=True gives the stacked rows a fresh
# 0..n-1 index instead of repeating each table's own row labels, so every row
# label in the combined table is unique
validation_data = pd.concat([nELISA, Luminex], ignore_index=True)
print(validation_data.shape)
# rename the columns to remove spaces
validation_data.columns = [col.replace(" ", "_") for col in validation_data.columns]
# preview the first rows of the combined table
validation_data.head()

(46, 9)


Unnamed: 0,LPS_concentration,CCL2,GM-CSF,IFN_gamma,IL-1_alpha,IL-1_beta,IL-12_p40,TNF_alpha,data_type
0,0ng/mL,11060.82,71.65599,0.0,103.8391,814.2234,410.1418,1026.397,nELISA
1,0ng/mL,903.1002,68.41195,0.0,72.09225,512.2186,354.4981,745.6205,nELISA
2,0ng/mL,3832.716,76.71039,127.9428,77.38797,477.013,350.3133,825.1526,nELISA
3,0ng/mL,825.5349,0.0,0.0,106.735,735.8158,312.0023,1541.853,nELISA
4,0ng/mL,963.6282,164.8755,194.8401,143.5211,1105.563,748.3993,1830.145,nELISA


In [5]:
# min-max scaling: rescale each measurement column as (x - min) / (max - min),
# working on a copy so the unscaled table stays available
validation_data_min_max = validation_data.copy()

# columns that describe the sample rather than hold a measurement
metadata_columns = ["LPS_concentration", "data_type"]
measurement_columns = [
    name for name in validation_data_min_max.columns if name not in metadata_columns
]

for name in measurement_columns:
    values = validation_data_min_max[name]
    lowest, highest = values.min(), values.max()
    validation_data_min_max[name] = (values - lowest) / (highest - lowest)

In [6]:
# reshape the raw and the min-max scaled tables from wide to long format:
# the identifier columns are kept, every other column name goes into
# "cytokine" and its value into "concentration"
melt_options = dict(
    id_vars=["LPS_concentration", "data_type"],
    var_name="cytokine",
    value_name="concentration",
)
validation_data_long = validation_data.melt(**melt_options)
validation_data_min_max_long = validation_data_min_max.melt(**melt_options)

In [7]:
# write the data to new files
# create the shared output directory once (no error if it already exists)
output_dir = pathlib.Path("./clean/validation/").resolve()
output_dir.mkdir(parents=True, exist_ok=True)

# file name -> long-format table to store under that name
tables_to_write = {
    "nELISA_luminex_validation_data.parquet": validation_data_long,
    "nELISA_luminex_validation_data_min_max.parquet": validation_data_min_max_long,
}

for file_name, long_table in tables_to_write.items():
    output_file_path = output_dir / file_name
    # index=False: the row index is not stored in the parquet file
    long_table.to_parquet(output_file_path, index=False)