In [2]:
import sys
import pathlib
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

sys.path.insert(0, "../utils/")
from data_loader import load_data
from sklearn.model_selection import train_test_split
import random

In [3]:
def scale_dataframe(df: pd.DataFrame):
    """
    Scales the gene effect data columns of a DataFrame to a 0-1 range.
    The first column (ID) and the last column are passed through unscaled.

    Every column between the first and the last is min-max scaled
    independently with ``MinMaxScaler``, fitted on the frame that is passed
    in. The input frame is not modified.

    Parameters:
    df (pd.DataFrame): The input DataFrame. It needs at least three columns:
        a leading ID column, one or more numeric columns to scale, and a
        trailing column that is left as is.

    Returns:
    pd.DataFrame: A new DataFrame with the same columns and the same index
        as `df`.

    Raises:
    ValueError: If `df` has fewer than three columns, i.e. there is nothing
        between the first and the last column to scale.
    """
    col_num = df.shape[1]
    if col_num < 3:
        raise ValueError(
            "scale_dataframe expects an ID column, at least one column to "
            f"scale, and a trailing column; got {col_num} column(s)"
        )
    df_to_scale = df.iloc[:, 1:col_num - 1]

    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_values = scaler.fit_transform(df_to_scale)

    # Rebuild the frame on the caller's index. With a fresh 0..n-1 index the
    # inserts below would align the pass-through columns by label and
    # misplace them (or fill them with NaN) for any frame that does not
    # already carry a default index.
    scaled_df = pd.DataFrame(scaled_values, index=df.index)

    # Positional selection keeps this working even if column names repeat;
    # the temporary integer column names are replaced right after.
    scaled_df.insert(0, df.columns[0], df.iloc[:, 0], allow_duplicates=True)
    scaled_df.insert(
        col_num - 1,
        df.columns[col_num - 1],
        df.iloc[:, col_num - 1],
        allow_duplicates=True,
    )
    scaled_df.columns = df.columns

    return scaled_df

In [4]:
def save_dataframe(df, file_path: pathlib.Path):
    """
    Writes a DataFrame to a CSV file and prints a short summary of it.

    The index is replaced by a fresh 0..n-1 index on a copy and is not
    written to the file; the caller's DataFrame is left unchanged.

    Parameters:
    df (pd.DataFrame): The DataFrame to save.
    file_path (pathlib.Path): Destination of the CSV file.
    """
    # Rebinding the local name leaves the caller's frame untouched.
    df = df.reset_index(drop=True)
    df.to_csv(path_or_buf=file_path, index=False)

    # Report where the data went, its shape, and the first three rows.
    print(f"DataFrame saved to {file_path}. Shape: {df.shape}")
    preview = df.iloc[:3]
    print(preview)

In [5]:
# Seed both Python's and NumPy's global random number generators.
# random.seed() alone only affects the standard-library `random` module;
# scikit-learn helpers called without an explicit random_state draw from
# NumPy's global state instead, so that one has to be seeded as well.
random.seed(18)
np.random.seed(18)
print(random.random())

0.18126486333322134


In [6]:
# Load all of the data from the download directory.
# load_data (imported from ../utils/data_loader) returns two frames that are
# unpacked into model_df (used below for ModelID, AgeCategory, and Sex) and
# effect_df (ModelID plus one gene effect column per gene).
# NOTE(review): adult_or_pediatric="all" presumably disables any age filter
# inside load_data - confirm against the loader.
data_directory = "../0.data-download/data/"
model_df, effect_df = load_data(data_directory, adult_or_pediatric="all")

In [7]:
# Verify that the ModelIDs in model_df and effect_df are aligned row by row.
# Each row is labelled "True" or "False" in a helper column on model_df.
model_df["ID_allignment_verify"] = np.where(
    effect_df["ModelID"] == model_df["ModelID"], "True", "False"
)
verrify = len(model_df["ID_allignment_verify"].unique())
print(model_df["ID_allignment_verify"])
print(
    f"There is {verrify} output object contained in the ID_allignment_verify column \n"
)

# A single unique value is not proof of alignment on its own (every row could
# be "False"), so stop here unless every row actually matched. Later cells
# combine the two frames row by row and would silently mislabel samples.
if not (model_df["ID_allignment_verify"] == "True").all():
    raise ValueError(
        "ModelIDs in model_df and effect_df are not aligned row by row"
    )

0       True
1       True
2       True
3       True
4       True
        ... 
1145    True
1146    True
1147    True
1148    True
1149    True
Name: ID_allignment_verify, Length: 1150, dtype: object
There is 1 output object contained in the ID_allignment_verify column 



In [8]:
# Combine each sample's age category and sex into one "<AgeCategory>_<Sex>"
# label and attach it to the gene effect data as the `age_and_sex` column.
# The labels are matched to the rows of effect_df by index.
age_labels = model_df["AgeCategory"].astype(str)
sex_labels = model_df["Sex"].astype(str)
presplit_effect_df = effect_df.assign(age_and_sex=age_labels + "_" + sex_labels)
presplit_effect_df

Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-000865,-0.173026,-0.254294,0.004853,0.026911,-0.229387,-0.016943,0.064717,-0.008996,0.037373,...,-0.714714,0.088009,0.111083,-0.025838,-0.086866,-0.223841,-0.042248,-0.158515,-0.183974,Adult_Male
1,ACH-001329,-0.024604,-0.058696,-0.013679,0.089342,-0.150608,-0.086011,0.056567,-0.461169,-0.046079,...,-0.909165,-0.010495,-0.154873,0.003711,-0.167647,-0.166147,-0.042581,-0.101992,-0.450968,Adult_Male
2,ACH-000375,-0.342829,-0.206546,-0.173196,-0.084770,0.051858,-0.022549,0.018175,-0.175771,-0.047496,...,-0.475806,0.147000,0.101993,-0.059408,0.051686,-0.159170,-0.488926,0.069367,-0.170899,Pediatric_Female
3,ACH-001164,-0.072382,-0.029595,-0.000907,0.119970,-0.234203,-0.069069,0.116190,-0.138632,-0.153715,...,-0.486658,-0.113446,0.154238,-0.014333,0.041338,0.016250,-0.066129,-0.256965,-0.322924,Pediatric_Male
4,ACH-002150,-0.091693,0.092776,-0.118559,0.103001,0.023718,-0.011905,0.029442,0.069144,-0.038186,...,-0.609698,-0.100207,0.120595,0.038691,0.108734,-0.155888,-0.045160,-0.068913,-0.132496,Unknown_Female
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1145,ACH-001197,-0.048508,0.027538,0.175623,0.082623,0.046186,-0.026900,0.200879,-0.192553,-0.057497,...,-0.564985,0.027631,0.283310,0.023426,-0.013358,-0.170790,0.133510,-0.150341,-0.074399,Adult_Male
1146,ACH-001413,0.019664,-0.039458,0.055541,0.137968,0.069240,0.004567,0.007904,-0.046859,-0.501705,...,-0.040322,0.009482,0.112591,0.100675,-0.002592,-0.223510,0.018157,-0.325062,-0.485400,Unknown_Male
1147,ACH-001651,0.048000,0.039982,-0.085284,0.233460,-0.245464,-0.204878,0.136785,0.090870,0.002244,...,-0.092888,-0.093866,0.015143,0.018816,0.044929,-0.020291,0.029482,0.052510,-0.261161,Adult_Female
1148,ACH-002119,-0.172016,-0.066700,-0.086895,0.090434,-0.245656,-0.293576,0.249400,-0.224420,-0.004074,...,-0.319515,0.021723,-0.057332,-0.139326,-0.080462,-0.093371,0.094401,-0.218046,-0.393180,Adult_Female


In [9]:
# Keep only the samples whose age category is Adult or Pediatric.
groups = model_df.groupby("AgeCategory")
df_list = [
    age_group_df
    for age_category, age_group_df in groups
    if age_category in ("Adult", "Pediatric")
]

# Stack the kept groups into one frame and order its rows by ModelID;
# ModelID ends up as the first column with a fresh 0..n-1 index.
new_df = (
    pd.concat(df_list, axis=0)
    .set_index("ModelID")
    .sort_index(ascending=True)
    .reset_index()
)

In [10]:
# ModelIDs of the samples labelled Adult or Pediatric
PA_effect_IDs = new_df["ModelID"].tolist()

# Of those, keep the IDs that also occur in the gene effect data
PA_IDs = set(PA_effect_IDs).intersection(presplit_effect_df["ModelID"].tolist())

# Gene effect rows restricted to the Adult/Pediatric samples, re-indexed from 0
is_PA_sample = presplit_effect_df["ModelID"].isin(PA_IDs)
PA_effect_df = presplit_effect_df[is_PA_sample].reset_index(drop=True)

In [11]:
# Split the samples into train (70%), test (15%), and validation (15%) sets.
# A fixed random_state makes both splits reproducible across kernel restarts:
# train_test_split uses NumPy-based randomness, which Python's random.seed()
# does not affect.
SPLIT_RANDOM_STATE = 18

# First split: stratified on the combined age category and sex label.
train_df, testandvalidation_df = train_test_split(
    PA_effect_df,
    test_size=0.3,
    stratify=PA_effect_df.age_and_sex,
    random_state=SPLIT_RANDOM_STATE,
)
train_df = train_df.reset_index(drop=True)
testandvalidation_df = testandvalidation_df.reset_index(drop=True)

# Second split: the held-out 30% is halved into test and validation sets.
# This split is not stratified (unchanged from the original behaviour).
test_df, val_df = train_test_split(
    testandvalidation_df, test_size=0.5, random_state=SPLIT_RANDOM_STATE
)
test_df = test_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

In [12]:
# Save each split to its own CSV file (train first, then test, then validation)
split_outputs = [
    (train_df, "../0.data-download/data/VAE_train_df.csv"),
    (test_df, "../0.data-download/data/VAE_test_df.csv"),
    (val_df, "../0.data-download/data/VAE_val_df.csv"),
]
for split_frame, csv_path in split_outputs:
    save_dataframe(split_frame, pathlib.Path(csv_path).resolve())

DataFrame saved to /home/juliacurd/gene_dependency_representations/0.data-download/data/VAE_train_df.csv. Shape: (670, 17109)
      ModelID  A1BG (1)  A1CF (29974)   A2M (2)  A2ML1 (144568)  \
0  ACH-000515 -0.066802     -0.088775 -0.124325       -0.056742   
1  ACH-000159 -0.098768     -0.155050  0.051113       -0.040915   
2  ACH-000672  0.076061      0.059420 -0.251454        0.251579   

   A3GALT2 (127550)  A4GALT (53947)  A4GNT (51146)  AAAS (8086)  AACS (65985)  \
0         -0.319577       -0.131226      -0.149453    -0.420217      0.068372   
1          0.018961       -0.018802       0.125073    -0.063100     -0.049539   
2         -0.247216       -0.168501       0.009697     0.143189      0.065436   

   ...  ZWINT (11130)  ZXDA (7789)  ZXDB (158586)  ZXDC (79364)  \
0  ...      -0.557861     0.151685       0.433092      0.035110   
1  ...      -0.742240     0.020520      -0.038917     -0.187451   
2  ...      -0.800085     0.049857      -0.011108      0.331004   

   ZYG11A (

In [13]:
# Build two tables covering every sample in the train, test, and validation
# splits, for use in later t-tests:
#   metadata              - ModelID, AgeCategory, and Sex of each sample
#   train_and_test_subbed - gene effect columns restricted to QC-passing genes

# Genes that passed an initial QC (see Pan et al. 2022) and a saturated signal
# QC are flagged `qc_pass` in the gene dictionary; `dependency_column` holds
# their corresponding gene label.
gene_dict_df = pd.read_csv("../0.data-download/data/CRISPR_gene_dictionary.tsv", delimiter='\t')
gene_list_passed_qc = gene_dict_df.query("qc_pass").dependency_column.tolist()

# Stack the three splits back into one frame with a fresh 0..n-1 index.
concat_frames = [train_df, test_df, val_df]
train_and_test = pd.concat(concat_frames).reset_index(drop=True)

# Recover the separate AgeCategory and Sex columns from the combined label.
# n=1 splits at the first underscore only, so exactly two columns come back
# even if the second part ever contains an underscore itself.
train_and_test[["AgeCategory", "Sex"]] = train_and_test.age_and_sex.str.split(
    pat="_", n=1, expand=True
)

# Keep only the gene columns that passed QC; labels missing from the frame
# are skipped by filter().
train_and_test_subbed = train_and_test.filter(gene_list_passed_qc, axis=1)

# Per-sample metadata, stored as strings.
metadata = train_and_test[["ModelID", "AgeCategory", "Sex"]].astype(str)

metadata_df_dir = pathlib.Path("../0.data-download/data/metadata_df.csv")
metadata.to_csv(metadata_df_dir, index=False)

train_and_test_subbed_dir = pathlib.Path("../0.data-download/data/train_and_test_subbed.csv")
train_and_test_subbed.to_csv(train_and_test_subbed_dir, index=False)
train_and_test_subbed

Unnamed: 0,AAAS (8086),AAMP (14),AARS1 (16),AARS2 (57505),AASDHPPT (60496),AATF (26574),ABCE1 (6059),ABCF1 (23),ABI1 (10006),ABL1 (25),...,ZNHIT3 (9326),ZNHIT6 (54680),ZNRD2 (10534),ZPR1 (8882),ZRANB1 (54764),ZSWIM6 (57688),ZW10 (9183),ZWILCH (55055),ZWINT (11130),ZZZ3 (26009)
0,-0.420217,-1.107865,-1.328506,-0.284800,-0.525963,-0.666578,-2.650665,-1.012824,-0.067677,0.308153,...,-0.430782,-1.226848,-0.131104,-1.283069,0.198508,-0.026694,-0.009742,-0.147676,-0.557861,-0.140720
1,-0.063100,-1.500541,-2.064879,-0.058868,-0.067331,-0.844047,-2.080516,-1.386412,-0.009080,0.007949,...,-0.242964,-0.724528,-0.222876,-1.089151,-0.175052,-0.091769,-0.331101,-0.240877,-0.742240,-0.286115
2,0.143189,-1.178453,-2.034627,-0.790103,-0.178241,-0.980939,-2.085015,-1.099721,-0.171120,0.236248,...,-0.529004,-1.156898,-0.670518,-1.460009,-0.053979,0.167360,-0.436056,-0.122846,-0.800085,-0.392985
3,0.360417,-1.259483,-2.083582,-0.459909,-0.213036,-0.665047,-1.670425,-0.814860,-0.098529,0.287933,...,-0.782124,-0.591378,-0.097318,-1.491589,-0.281662,-0.124752,-0.380941,-0.448987,-0.748257,-0.252997
4,-0.126643,-1.497363,-2.140543,-0.909022,-0.515954,-0.738531,-1.955625,-0.965299,0.016911,-0.136527,...,-1.185696,-0.861337,-0.213164,-1.389103,-0.066738,0.000667,-0.349421,-0.544139,-0.782085,-0.089104
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
953,-0.207100,-0.885544,-1.261387,-0.634652,-0.349590,-0.948309,-1.837167,-0.760303,-0.139052,-2.056599,...,-0.415304,-0.752014,-0.411975,-1.016120,-0.159813,-0.079206,-0.037932,-0.252008,-0.231732,-0.180437
954,-0.009876,-0.861676,-2.238538,-0.048439,0.017032,-0.753718,-2.294193,-1.115864,-0.002805,0.088862,...,-0.421306,-0.957070,-0.478247,-1.043137,-0.375796,-0.086770,0.095568,0.110121,-0.137089,-0.327574
955,0.370785,-1.480761,-1.570118,-0.464977,-0.267730,-0.906571,-2.641930,-0.675582,0.096804,-0.058625,...,-0.339309,-0.872743,-0.176139,-1.215292,-0.209620,0.091771,-0.182590,-0.250381,-0.430270,-0.219349
956,-0.043020,-1.149343,-1.711245,-0.097014,-0.106222,-0.680678,-2.149480,-0.812047,0.072176,0.232930,...,-0.076095,-0.665779,-0.335089,-1.334864,-0.140204,-0.280853,-0.118061,-0.210868,-0.571572,-0.385754
