In [1]:
import sys
import pathlib
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

sys.path.insert(0, "../utils/")
from data_loader import load_data
from sklearn.model_selection import train_test_split
import random

In [2]:
def scale_dataframe(df: pd.DataFrame):
    """
    Min-max scale the interior columns of a DataFrame to the 0-1 range.

    The first column and the last column are carried through unchanged; every
    column in between is scaled independently with ``MinMaxScaler``. In this
    notebook the first column is the ModelID and the last column is the
    combined ``age_and_sex`` label. The input frame is not modified.

    Parameters:
    df (pd.DataFrame): The input DataFrame. All columns except the first and
        the last are passed to the scaler, so they must be numeric.

    Returns:
    pd.DataFrame: A new DataFrame with the same column labels, column order
        and row index as ``df``. If there is nothing to scale (fewer than
        three columns, or no rows) an unscaled copy of ``df`` is returned.
    """
    col_num = df.shape[1]
    df_to_scale = df.iloc[:, 1:col_num - 1]

    # Nothing between the first and last column (or no rows at all):
    # the scaler would raise, so hand back an untouched copy instead.
    if df_to_scale.empty:
        return df.copy()

    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_values = scaler.fit_transform(df_to_scale)

    # Rebuild the scaled block on the ORIGINAL row index. Wrapping the array in
    # a frame with a fresh 0..n-1 index and then inserting Series from `df`
    # aligns on index and fills the ID / label columns with NaN whenever `df`
    # does not itself carry a default index.
    scaled_middle = pd.DataFrame(
        scaled_values, index=df.index, columns=df_to_scale.columns
    )
    scaled_df = pd.concat(
        [df.iloc[:, :1], scaled_middle, df.iloc[:, col_num - 1:]], axis=1
    )
    scaled_df.columns = df.columns

    return scaled_df

In [3]:
def save_dataframe(df, file_path: pathlib.Path):
    """
    Write a DataFrame to a parquet file and print a short summary of it.

    The row index is replaced by a fresh default index before writing and is
    not stored in the file. After writing, the destination path, the frame's
    shape and its first three rows are printed.

    Parameters:
    df (pd.DataFrame): The DataFrame to save.
    file_path (pathlib.Path): Destination of the parquet file.
    """
    frame_to_write = df.reset_index(drop=True)
    frame_to_write.to_parquet(file_path, index=False)

    # Echo what was written so the notebook output documents each saved split.
    print(f"DataFrame saved to {file_path}. Shape: {frame_to_write.shape}")
    print(frame_to_write.head(3))

In [4]:
# Seed the random number generators so a fresh "Restart & Run All" reproduces
# the same results.
random.seed(18)
# scikit-learn utilities such as train_test_split fall back to NumPy's global
# generator when no random_state is passed; seeding only the standard-library
# `random` module does not affect them, so NumPy is seeded as well.
np.random.seed(18)
print(random.random())

0.18126486333322134


In [5]:
# Load all of the data; adult_or_pediatric="all" asks the loader for every
# sample rather than a single age group.
# `load_data` is imported from the data_loader module on the ../utils/ path.
# Later cells read ModelID, AgeCategory and Sex from model_df, and ModelID plus
# the per-gene effect columns from effect_df.
# NOTE(review): the exact filtering applied by load_data is not visible here --
# confirm against data_loader.
data_directory = "../0.data-download/data/"
model_df, effect_df = load_data(data_directory, adult_or_pediatric="all")

In [6]:
# Verify that the ModelIDs in model_df and effect_df are aligned row by row.
# Later cells attach model_df columns to effect_df by index, so a mismatch
# would silently pair gene effect rows with the wrong sample metadata.
model_df["ID_allignment_verify"] = np.where(
    effect_df["ModelID"] == model_df["ModelID"], "True", "False"
)
verrify = len(model_df["ID_allignment_verify"].unique())
print(model_df["ID_allignment_verify"])
print(
    f"There is {verrify} output object contained in the ID_allignment_verify column \n"
)

# A single unique value is not proof of alignment on its own (every row could
# be "False"), so stop the notebook unless every row actually matched.
if not (model_df["ID_allignment_verify"] == "True").all():
    raise ValueError(
        "ModelIDs in model_df and effect_df are not aligned row by row"
    )

0       True
1       True
2       True
3       True
4       True
        ... 
1145    True
1146    True
1147    True
1148    True
1149    True
Name: ID_allignment_verify, Length: 1150, dtype: object
There is 1 output object contained in the ID_allignment_verify column 



In [7]:
# Build one "<AgeCategory>_<Sex>" label per sample from model_df and attach it
# to a copy of the effect dataframe as the single column `age_and_sex`
# (rows are matched by index).
presplit_effect_df = effect_df.assign(
    age_and_sex=model_df["AgeCategory"]
    .astype(str)
    .add("_")
    .add(model_df["Sex"].astype(str))
)
presplit_effect_df

Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-000834,-0.045891,-0.260131,0.009378,-0.010856,-0.036821,0.072006,0.046686,0.048228,-0.099161,...,-0.490826,0.158543,0.154925,-0.151508,-0.091955,-0.169867,0.156774,-0.116749,-0.432911,Adult_Male
1,ACH-000364,-0.083886,0.106509,0.010381,-0.092879,-0.075700,-0.057652,0.114636,-0.159514,-0.070576,...,-0.579386,0.114572,0.069922,-0.049669,-0.199004,-0.081043,0.092593,-0.284636,-0.527344,Pediatric_Female
2,ACH-000039,0.091020,0.055004,0.002414,0.386573,0.128021,-0.419384,0.057576,0.179198,0.143718,...,-0.383486,-0.039956,0.123962,-0.274982,-0.160583,-0.231255,-0.002348,-0.058138,-0.335914,Pediatric_Female
3,ACH-000344,0.054475,-0.018356,0.004339,-0.037338,0.009673,0.021140,-0.067017,-0.470077,-0.211834,...,-0.091974,0.171965,0.157967,-0.315210,-0.147117,-0.129271,-0.143784,-0.195919,-0.453655,Adult_Male
4,ACH-002471,-0.052578,0.056931,0.099929,0.131865,-0.075399,-0.129117,-0.037312,-0.075644,0.008808,...,-0.606958,-0.007913,-0.050324,-0.220839,-0.047250,-0.307309,-0.330117,-0.269755,-0.116892,Pediatric_Female
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1145,ACH-001458,-0.033175,-0.140354,0.066432,0.053360,0.008219,-0.149164,0.134524,0.013129,0.061871,...,-0.616696,0.020156,-0.036229,-0.246281,-0.124635,0.203406,-0.068499,-0.195063,-0.133284,Adult_Male
1146,ACH-000317,-0.065253,-0.225588,0.042695,-0.073407,-0.299106,-0.028249,-0.061190,0.001742,0.021838,...,-0.249519,-0.030013,0.149905,-0.071041,-0.078398,-0.022720,-0.116713,0.113660,-0.204034,Unknown_Male
1147,ACH-000858,-0.018818,-0.077132,0.046336,0.028847,-0.244095,-0.099547,0.032263,-0.084803,0.011631,...,-0.491082,0.036642,0.054586,-0.092162,-0.030740,-0.172903,-0.028086,-0.199751,-0.536555,Adult_Male
1148,ACH-001799,-0.035877,-0.085510,0.006087,-0.008545,-0.052645,-0.029418,-0.033314,-0.355298,-0.223414,...,-0.312418,0.053972,0.060468,-0.223187,0.156892,-0.012084,-0.022939,-0.090674,-0.737976,Unknown_Male


In [8]:
# Keep only the samples whose AgeCategory is "Adult" or "Pediatric";
# every other age category is dropped.
groups = model_df.groupby("AgeCategory")
df_list = [
    group_df
    for category, group_df in groups
    if category in ("Adult", "Pediatric")
]

# Stack the kept groups and order the rows by ModelID; ModelID ends up as the
# first column after the index round-trip.
new_df = (
    pd.concat(df_list, axis=0)
    .set_index("ModelID")
    .sort_index(ascending=True)
    .reset_index()
)

In [9]:
# ModelIDs of the samples kept by the Adult / Pediatric filter
PA_effect_IDs = new_df["ModelID"].to_list()

# IDs present both in the filtered sample info and in the gene effect data
PA_IDs = set(PA_effect_IDs).intersection(presplit_effect_df["ModelID"].to_list())

# Gene effect rows restricted to those shared ModelIDs, renumbered from 0
PA_effect_df = presplit_effect_df[
    presplit_effect_df["ModelID"].isin(PA_IDs)
].reset_index(drop=True)

In [10]:
# Split the data 70% train / 15% test / 15% validation.
# A fixed random_state makes the split reproducible: train_test_split draws
# from NumPy's random generator, which the earlier `random.seed(18)` call
# (standard-library generator) does not control. The same value, 18, is reused.
train_df, testandvalidation_df = train_test_split(
    PA_effect_df,
    test_size=0.3,
    stratify=PA_effect_df.age_and_sex,
    random_state=18,
)
train_df = train_df.reset_index(drop=True)
testandvalidation_df = testandvalidation_df.reset_index(drop=True)

# NOTE: only the first split is stratified by age category and sex; the
# holdout set is divided in half purely at random, as before.
test_df, val_df = train_test_split(
    testandvalidation_df, test_size=0.5, random_state=18
)
test_df = test_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

In [11]:
# Save each split to its own parquet file (train, then test, then validation)
for split_df, parquet_path in (
    (train_df, "../0.data-download/data/VAE_train_df.parquet"),
    (test_df, "../0.data-download/data/VAE_test_df.parquet"),
    (val_df, "../0.data-download/data/VAE_val_df.parquet"),
):
    save_dataframe(split_df, pathlib.Path(parquet_path).resolve())

DataFrame saved to /home/juliacurd/gene_dependency_representations/0.data-download/data/VAE_train_df.parquet. Shape: (670, 17109)
      ModelID  A1BG (1)  A1CF (29974)   A2M (2)  A2ML1 (144568)  \
0  ACH-001578 -0.251242      0.045095  0.255223       -0.007852   
1  ACH-000569 -0.236197      0.037577 -0.056611        0.180552   
2  ACH-000963 -0.211905     -0.103702  0.031513        0.133329   

   A3GALT2 (127550)  A4GALT (53947)  A4GNT (51146)  AAAS (8086)  AACS (65985)  \
0         -0.112987        0.167671      -0.048606    -0.164626      0.170724   
1         -0.073225       -0.338064      -0.002834     0.075166     -0.169582   
2          0.032130        0.139724      -0.013841    -0.111958     -0.109355   

   ...  ZWINT (11130)  ZXDA (7789)  ZXDB (158586)  ZXDC (79364)  \
0  ...      -0.447906    -0.069794       0.019615     -0.327797   
1  ...      -0.611104    -0.187078       0.189025     -0.156478   
2  ...      -0.474096     0.225580       0.255745      0.039617   

   ZYG1

In [16]:
# Build two row-aligned tables for use in later t-tests from the combined
# train, test and validation splits:
#   * metadata              -> ModelID, AgeCategory and Sex of every sample
#   * train_and_test_subbed -> gene effect columns restricted to QC-passing genes

# Genes that passed an initial QC (see Pan et al. 2022) and a saturated signal
# QC are flagged in `qc_pass`; collect their corresponding gene labels.
gene_dict_df = pd.read_parquet("../0.data-download/data/CRISPR_gene_dictionary.parquet")
gene_list_passed_qc = gene_dict_df.loc[gene_dict_df["qc_pass"]]["dependency_column"].tolist()

# Stack the three splits and renumber the rows from 0
concat_frames = [train_df, test_df, val_df]
train_and_test = pd.concat(concat_frames, ignore_index=True)

# Split the combined "<AgeCategory>_<Sex>" label back into its two parts
train_and_test[["AgeCategory", "Sex"]] = train_and_test["age_and_sex"].str.split(
    "_", expand=True
)

# Keep only the gene columns listed as QC-passing
train_and_test_subbed = train_and_test.filter(items=gene_list_passed_qc, axis=1)

# Sample-level metadata, stored as strings, in the same row order as the gene table
metadata_holder = pd.DataFrame()
metadata = metadata_holder.assign(
    ModelID=train_and_test["ModelID"].astype(str),
    AgeCategory=train_and_test["AgeCategory"].astype(str),
    Sex=train_and_test["Sex"].astype(str),
)

# Write both tables next to the downloaded data
metadata_df_dir = pathlib.Path("../0.data-download/data/metadata_df.parquet").resolve()
metadata.to_parquet(metadata_df_dir, index=False)

train_and_test_subbed_dir = pathlib.Path("../0.data-download/data/train_and_test_subbed.parquet").resolve()
train_and_test_subbed.to_parquet(train_and_test_subbed_dir, index=False)