In [1]:
import sys
import pathlib
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

sys.path.insert(0, "../utils/")
from data_loader import load_data
from sklearn.model_selection import train_test_split
import random

In [2]:
def scale_dataframe(df: pd.DataFrame):
    """
    Scales the gene effect data columns of a DataFrame to a 0-1 range.
    The first column (ID) and the last column (the combined age/sex label)
    are carried through unscaled; every column in between is min-max scaled
    independently of the others.

    Parameters:
    df (pd.DataFrame): The input DataFrame, laid out as
        [ID column, feature columns..., label column].

    Returns:
    pd.DataFrame: A new DataFrame with the same columns, column order and
        index as ``df``.

    Raises:
    ValueError: If ``df`` has fewer than three columns, i.e. there is no
        feature column between the ID column and the label column.
    """
    col_num = df.shape[1]
    if col_num < 3:
        raise ValueError(
            "scale_dataframe expects an ID column, at least one feature column "
            f"and a trailing label column; got {col_num} column(s)."
        )

    df_to_scale = df.iloc[:, 1:col_num - 1]

    scaler = MinMaxScaler(feature_range=(0, 1))
    # fit_transform returns a bare array; rebuild the frame on the caller's index
    # and feature names so the pass-through columns line up row-for-row even
    # when `df` does not carry a default 0..n-1 index.
    scaled_features = pd.DataFrame(
        scaler.fit_transform(df_to_scale),
        index=df.index,
        columns=df_to_scale.columns,
    )

    # Positional slices (rather than label lookups) keep the ID and label columns
    # as single-column frames, so the original column order is reproduced as-is.
    return pd.concat(
        [df.iloc[:, :1], scaled_features, df.iloc[:, col_num - 1:]],
        axis=1,
    )

In [3]:
def save_dataframe(df, file_path: pathlib.Path):
    """
    Writes a DataFrame to a CSV file and prints a short confirmation.

    The index is dropped before writing, so only the columns end up in the
    file. After saving, the destination, the shape and the first three rows
    are printed.

    Parameters:
    df (pd.DataFrame): The DataFrame to save.
    file_path (pathlib.Path): Destination of the CSV file.
    """
    flattened = df.reset_index(drop=True)
    flattened.to_csv(file_path, index=False)

    preview = flattened.head(3)
    # Resetting the index does not change the shape, so reporting df.shape is exact.
    print(f"DataFrame saved to {file_path}. Shape: {df.shape}")
    print(preview)

In [4]:
# Seed every random number generator this notebook relies on.
# `random.seed` only controls Python's built-in generator; scikit-learn's
# train_test_split draws from NumPy's global generator when it is called
# without `random_state`, so NumPy has to be seeded as well for the splits
# to come out the same on every Restart & Run All.
random.seed(18)
np.random.seed(18)
print(random.random())

0.18126486333322134


In [5]:
# Load the two input tables with the project helper `load_data`
# (imported from ../utils/data_loader):
#   model_df  - per-sample information; later cells read its ModelID,
#               AgeCategory and Sex columns
#   effect_df - per-sample gene effect values, keyed by ModelID
# NOTE(review): adult_or_pediatric="all" presumably disables age filtering in
# load_data - confirm against the helper.
data_directory = "../0.data-download/data/"
model_df, effect_df = load_data(data_directory, adult_or_pediatric="all")

In [6]:
# Verify that model_df and effect_df list the same ModelID on every row, so the
# two frames can be combined row-by-row in the cells that follow.
# The result is stored on model_df as the strings "True"/"False".
model_df["ID_allignment_verify"] = np.where(
    effect_df["ModelID"] == model_df["ModelID"], "True", "False"
)
verrify = len(model_df["ID_allignment_verify"].unique())
print(model_df["ID_allignment_verify"])
print(
    f"There is {verrify} output object contained in the ID_allignment_verify column \n"
)

# One unique value is not proof of alignment by itself: a column that is
# "False" on every row also has exactly one unique value. Stop here unless
# every single row actually matched.
misaligned_rows = int((model_df["ID_allignment_verify"] != "True").sum())
assert misaligned_rows == 0, (
    f"{misaligned_rows} row(s) have different ModelIDs in model_df and effect_df; "
    "the two frames must be aligned before they are combined."
)

0       True
1       True
2       True
3       True
4       True
        ... 
1145    True
1146    True
1147    True
1148    True
1149    True
Name: ID_allignment_verify, Length: 1150, dtype: object
There is 1 output object contained in the ID_allignment_verify column 



In [7]:
# Attach one combined "<AgeCategory>_<Sex>" label per sample to the gene effect
# frame; the label is used further down to stratify the train/test split.
# The values are matched by row index between model_df and effect_df.
presplit_effect_df = effect_df.assign(
    age_and_sex=(
        model_df["AgeCategory"].astype(str) + "_" + model_df["Sex"].astype(str)
    )
)
presplit_effect_df

Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-002660,-0.024873,-0.046235,0.200516,-0.103352,-0.743550,-0.145637,0.288817,-0.080504,-0.219343,...,-0.641693,0.222955,0.450797,-0.100757,0.047159,-0.223267,-0.107483,-0.301398,0.208242,Adult_Female
1,ACH-000770,-0.105015,-0.118278,0.013666,0.024034,-0.063262,-0.097246,-0.036911,-0.066317,-0.081993,...,-0.611432,-0.040385,0.022289,-0.087003,0.009104,-0.079456,0.031087,-0.017750,-0.323080,Pediatric_Male
2,ACH-000012,-0.139877,-0.002344,0.011211,0.048813,0.049770,-0.179609,0.113692,-0.039238,-0.069528,...,-0.623566,0.089670,-0.002416,-0.081465,0.061536,-0.060163,-0.091458,-0.157398,-0.233239,Adult_Female
3,ACH-000820,-0.140939,-0.000732,-0.010750,0.145126,-0.077734,0.082705,0.255306,-0.308653,-0.242762,...,-0.577795,0.181444,-0.023513,-0.456738,-0.058880,-0.003008,-0.098059,-0.179341,-0.384057,Adult_Female
4,ACH-002084,-0.058266,0.059411,-0.009038,0.164217,-0.130604,-0.209063,0.055217,-0.399732,-0.310741,...,-0.357145,0.038031,0.304715,-0.181249,-0.049531,-0.194468,-0.122039,-0.259563,-0.066642,Adult_Female
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1145,ACH-000607,-0.013597,-0.048008,0.175440,-0.055725,-0.263963,-0.009386,-0.004588,-0.272526,-0.002379,...,-0.677284,-0.056939,-0.142240,-0.310165,-0.075264,-0.226676,-0.163162,-0.193660,-0.499425,Pediatric_Male
1146,ACH-002460,-0.241597,-0.169607,-0.083953,0.046550,-0.073913,0.029102,-0.261187,0.003923,-0.163708,...,-0.497182,0.164495,0.271731,-0.102921,0.049382,-0.241658,-0.153427,-0.297802,-0.254467,Adult_Male
1147,ACH-001054,-0.148523,0.066810,0.078134,0.168272,-0.183760,0.045006,0.076497,-0.019024,-0.123016,...,-0.461615,0.050362,0.048965,0.149117,-0.006584,-0.307776,-0.069039,-0.049377,-0.375725,Pediatric_Male
1148,ACH-000571,-0.142152,-0.154697,-0.026882,0.097141,-0.104495,-0.029438,-0.037886,-0.098708,-0.052308,...,-0.419723,0.124319,0.141697,0.038131,-0.240731,-0.229773,-0.026739,-0.098424,-0.236172,Adult_Male


In [8]:
# Keep only the samples whose AgeCategory is Adult or Pediatric.
groups = model_df.groupby("AgeCategory")
df_list = [
    age_group_df
    for age_category, age_group_df in groups
    if age_category in ("Adult", "Pediatric")
]

# Stack the kept groups and order the rows alphabetically by ModelID
# (ModelID ends up as the first column after the index round-trip).
new_df = (
    pd.concat(df_list, axis=0)
    .set_index("ModelID")
    .sort_index(ascending=True)
    .reset_index()
)

In [9]:
# ModelIDs of the Adult/Pediatric samples kept in new_df
PA_effect_IDs = new_df["ModelID"].tolist()

# ...restricted to the IDs that also have gene effect data
PA_IDs = set(PA_effect_IDs).intersection(presplit_effect_df["ModelID"].tolist())

# Gene effect rows for exactly those samples, renumbered from 0
is_pediatric_or_adult = presplit_effect_df["ModelID"].isin(PA_IDs)
PA_effect_df = presplit_effect_df.loc[is_pediatric_or_adult].reset_index(drop=True)

In [10]:
# Split the samples 70 / 15 / 15 into train / test / validation.
# Both splits get an explicit random_state: train_test_split ignores
# `random.seed`, so without it the files written below would contain a
# different set of samples on every run.
SPLIT_SEED = 18

# First split: 70% train vs. 30% held out, stratified on the combined
# age-category/sex label so each group keeps its share in both parts.
train_df, testandvalidation_df = train_test_split(
    PA_effect_df,
    test_size=0.3,
    stratify=PA_effect_df.age_and_sex,
    random_state=SPLIT_SEED,
)
train_df = train_df.reset_index(drop=True)
testandvalidation_df = testandvalidation_df.reset_index(drop=True)

# Second split: the held-out 30% is halved into test and validation.
# NOTE(review): this split is not stratified. Adding `stratify=` here would
# fail for any age/sex group with fewer than two held-out samples, so it is
# left as-is - confirm whether stratification is wanted.
test_df, val_df = train_test_split(
    testandvalidation_df, test_size=0.5, random_state=SPLIT_SEED
)
test_df = test_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

In [11]:
# Write each split to its own CSV (train, then test, then validation).
split_outputs = [
    ("../0.data-download/data/VAE_train_df.csv", train_df),
    ("../0.data-download/data/VAE_test_df.csv", test_df),
    ("../0.data-download/data/VAE_val_df.csv", val_df),
]
for csv_location, split_frame in split_outputs:
    save_dataframe(split_frame, pathlib.Path(csv_location).resolve())

DataFrame saved to /home/juliacurd/gene_dependency_representations/0.data-download/data/VAE_train_df.csv. Shape: (670, 17109)
      ModelID  A1BG (1)  A1CF (29974)   A2M (2)  A2ML1 (144568)  \
0  ACH-001526  0.159899     -0.210365  0.100320        0.091808   
1  ACH-000045  0.000613      0.042093  0.035680        0.101065   
2  ACH-001431  0.126546     -0.126795  0.073458       -0.029680   

   A3GALT2 (127550)  A4GALT (53947)  A4GNT (51146)  AAAS (8086)  AACS (65985)  \
0          0.371547        0.079559      -0.036675     0.080594      0.015678   
1         -0.014226       -0.081459      -0.095948    -0.255931     -0.159878   
2         -0.188073       -0.341536       0.060515    -0.058629      0.006010   

   ...  ZWINT (11130)  ZXDA (7789)  ZXDB (158586)  ZXDC (79364)  \
0  ...      -0.621108    -0.047940      -0.005144      0.015620   
1  ...      -0.518228     0.097455       0.199169     -0.070343   
2  ...      -0.565064    -0.031034      -0.126097     -0.125018   

   ZYG11A (

In [13]:
# Build two tables from all three splits (train, test and validation) for use
# in later t-tests, and write both to CSV:
#   metadata_df.csv           - ModelID, AgeCategory and Sex of every sample
#   train_and_test_subbed.csv - gene effect columns restricted to QC-passing genes

# Genes that passed an initial QC (see Pan et al. 2022) and a saturated signal QC,
# identified by the label of their gene effect column.
gene_dict_df = pd.read_csv("../0.data-download/data/CRISPR_gene_dictionary.tsv", delimiter='\t')
gene_list_passed_qc = gene_dict_df.loc[gene_dict_df["qc_pass"], 'dependency_column'].tolist()

# Stack the splits back into a single frame, rows renumbered from 0.
concat_frames = [train_df, test_df, val_df]
train_and_test = pd.concat(concat_frames).reset_index(drop=True)

# Undo the "<AgeCategory>_<Sex>" label. n=1 splits at the first underscore only,
# so a sex label that itself contains an underscore cannot produce a third
# column and break the two-column assignment.
train_and_test[["AgeCategory", "Sex"]] = train_and_test["age_and_sex"].str.split(
    pat="_", n=1, expand=True
)

# Keep only the gene columns that passed QC; labels that are not present as
# columns are skipped by `filter`.
train_and_test_subbed = train_and_test.filter(gene_list_passed_qc, axis=1)

# Sample-level metadata, every value stored as a string.
metadata = train_and_test[["ModelID", "AgeCategory", "Sex"]].astype(str)

metadata_df_dir = pathlib.Path("../0.data-download/data/metadata_df.csv").resolve()
metadata.to_csv(metadata_df_dir, index=False)

train_and_test_subbed_dir = pathlib.Path("../0.data-download/data/train_and_test_subbed.csv").resolve()
train_and_test_subbed.to_csv(train_and_test_subbed_dir, index=False)