In [13]:
import sys
import pathlib
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

sys.path.insert(0, "../utils/")
from data_loader import load_data
from sklearn.model_selection import train_test_split
import random

In [14]:
def scale_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Scales the gene effect data columns of a DataFrame to a 0-1 range.

    The first column (ID) and the last column (the label column, e.g. the
    combined age/sex label) are passed through unchanged; every column in
    between is min-max scaled independently.

    Parameters:
    df (pd.DataFrame): The input DataFrame, laid out as
        [ID column, data column(s)..., label column].

    Returns:
    pd.DataFrame: A new DataFrame with the same column names, column order and
        row index as `df`, in which the data columns are scaled to [0, 1].

    Raises:
    ValueError: If `df` has fewer than three columns, i.e. there is no data
        column between the ID column and the label column.
    """
    col_num = df.shape[1]
    if col_num < 3:
        raise ValueError(
            "scale_dataframe expects an ID column, at least one data column and "
            f"a trailing label column; got {col_num} column(s)"
        )

    df_to_scale = df.iloc[:, 1:col_num - 1]

    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_values = scaler.fit_transform(df_to_scale)

    # Rebuild the scaled block on the ORIGINAL row index and column names.
    # Wrapping the array in a bare DataFrame would give it a fresh 0..n-1 index,
    # and re-attaching the ID/label columns would then align on index labels and
    # fill them with NaN for any frame that does not already use a 0..n-1 index
    # (e.g. a split that was not reset).
    scaled_features = pd.DataFrame(
        scaled_values, index=df.index, columns=df_to_scale.columns
    )

    # Positional slices keep the ID and label columns as one-column frames, so the
    # concatenation reproduces the input column order exactly.
    scaled_df = pd.concat(
        [df.iloc[:, [0]], scaled_features, df.iloc[:, [col_num - 1]]],
        axis=1,
    )

    return scaled_df

In [15]:
def save_dataframe(df, file_path: pathlib.Path):
    """
    Writes a DataFrame to disk as a CSV file and prints a short summary.

    The row index is discarded before writing, so the file contains only the
    DataFrame's columns. After writing, the destination, the shape and the
    first three rows are printed.

    Parameters:
    df (pd.DataFrame): The DataFrame to save.
    file_path (pathlib.Path): Destination of the CSV file.
    """
    # Work on a re-indexed copy under a separate name; the caller's frame is untouched.
    frame_to_write = df.reset_index(drop=True)
    frame_to_write.to_csv(file_path, index=False)

    preview = frame_to_write.head(3)
    print(f"DataFrame saved to {file_path}. Shape: {frame_to_write.shape}")
    print(preview)

In [16]:
# Seed the random number generators so that "Restart Kernel -> Run All" reproduces
# the same results.
# scikit-learn falls back to NumPy's global RNG when no random_state is passed, so
# seeding only the stdlib `random` module would leave the train/test split unseeded.
random.seed(18)
np.random.seed(18)
print(random.random())

0.18126486333322134


In [17]:
# Load all of the data from the shared download directory.
# `load_data` is imported from data_loader (found via the "../utils/" path entry added
# in the imports cell).
# NOTE(review): presumably this returns the sample/model metadata and the gene effect
# matrix, in that order, with adult_or_pediatric="all" meaning no age filtering is
# applied by the loader -- confirm against data_loader.
data_directory = "../0.data-download/data/"
model_df, effect_df = load_data(data_directory, adult_or_pediatric="all")

In [18]:
# Verify that the ModelIDs in model_df and effect_df are aligned row by row.
# Later cells attach columns from model_df to effect_df by row index, so a mismatch
# here would silently attach the wrong age/sex label to a sample.
model_df["ID_allignment_verify"] = np.where(
    effect_df["ModelID"] == model_df["ModelID"], "True", "False"
)
verrify = len(model_df["ID_allignment_verify"].unique())
print(model_df["ID_allignment_verify"])
print(
    f"There is {verrify} output object contained in the ID_allignment_verify column \n"
)

# A single unique value does not prove alignment (it could be all "False"), so fail
# loudly unless every row actually matched.
assert (model_df["ID_allignment_verify"] == "True").all(), (
    "ModelIDs in model_df and effect_df are not aligned row by row"
)

0       True
1       True
2       True
3       True
4       True
        ... 
1145    True
1146    True
1147    True
1148    True
1149    True
Name: ID_allignment_verify, Length: 1150, dtype: object
There is 1 output object contained in the ID_allignment_verify column 



In [19]:
# Combine 'AgeCategory' and 'Sex' into one "<AgeCategory>_<Sex>" label per row and
# attach it to the gene effect dataframe as a single `age_and_sex` column.
# The label Series is matched to effect_df by row index.
age_sex_labels = (
    model_df["AgeCategory"].astype(str) + "_" + model_df["Sex"].astype(str)
)
presplit_effect_df = effect_df.assign(age_and_sex=age_sex_labels)
presplit_effect_df

Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-001385,-0.033696,0.017075,0.146826,0.096040,-0.289019,-0.116759,-0.036558,-0.287216,-0.206783,...,-0.246695,-0.050301,0.192910,-0.119050,0.016471,-0.167245,-0.031136,-0.264343,-0.244525,Adult_Male
1,ACH-000124,-0.419191,0.025918,-0.067638,0.092051,-0.123343,-0.289495,0.108037,-0.129337,0.020937,...,-0.416031,0.124124,0.082636,0.024370,0.097702,-0.069470,0.084096,-0.050329,-0.113807,Adult_Female
2,ACH-001842,0.018640,-0.046962,-0.071591,0.282715,0.100422,-0.058401,-0.112444,0.187943,0.004529,...,-0.144318,-0.205781,0.025178,0.067121,0.101029,-0.056995,0.080882,-0.126190,0.009628,Adult_Female
3,ACH-000313,-0.110087,-0.171701,0.055075,0.065709,-0.179664,-0.035115,0.063535,0.033544,-0.001349,...,-0.169953,0.124755,0.255071,-0.036485,-0.057930,-0.179545,-0.021921,-0.241119,-0.132658,Adult_Male
4,ACH-001461,0.120377,-0.017472,-0.016559,0.201023,0.025631,-0.193421,0.216241,0.142920,0.140057,...,-0.626472,-0.030726,0.034190,-0.156965,-0.151589,-0.225340,0.206879,-0.184467,-0.227124,Adult_Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1145,ACH-000551,-0.114328,-0.182774,-0.057828,0.193458,0.113254,0.001815,-0.030578,0.010067,0.054070,...,-0.121850,0.063811,-0.293473,0.011840,0.182085,-0.132752,0.088567,0.083653,-0.207638,Adult_Female
1146,ACH-002263,-0.032011,0.191508,-0.018130,-0.062402,-0.122800,-0.010477,-0.031259,0.043129,-0.007343,...,-0.532985,0.150843,0.062816,0.045514,-0.071133,-0.142275,-0.211227,-0.057489,-0.242971,Adult_Male
1147,ACH-000938,-0.146363,-0.097466,-0.062443,-0.041653,-0.231802,0.051133,-0.014263,0.370785,-0.022151,...,-0.430270,-0.007801,0.122278,0.023827,-0.042040,-0.143697,0.147171,0.008195,-0.219349,Adult_Male
1148,ACH-001857,0.021411,-0.077184,-0.102119,0.182852,-0.281109,-0.241205,-0.021904,-0.429176,0.107432,...,-0.450515,0.107849,0.162657,0.238613,0.083189,-0.070850,0.019490,-0.288993,-0.306299,Adult_Male


In [20]:
groups = model_df.groupby("AgeCategory")

# keep only the per-category frames for samples labelled Adult or Pediatric
df_list = [
    category_df
    for category, category_df in groups
    if category in ("Adult", "Pediatric")
]

# stack the kept frames, then order the rows alphabetically by ModelID;
# ModelID goes through the index for sorting and comes back as a regular column
new_df = (
    pd.concat(df_list, axis=0)
    .set_index("ModelID")
    .sort_index(ascending=True)
    .reset_index()
)

In [21]:
# ModelIDs of the samples kept by the Adult/Pediatric filter
PA_effect_IDs = new_df["ModelID"].tolist()

# IDs that are present in both the filtered sample info and the gene effect data
PA_IDs = set(PA_effect_IDs).intersection(presplit_effect_df["ModelID"].tolist())

# keep only the gene effect rows whose ModelID is in that shared set,
# renumbering the rows from zero
is_kept_row = presplit_effect_df["ModelID"].isin(PA_IDs)
PA_effect_df = presplit_effect_df.loc[is_kept_row].reset_index(drop=True)

In [22]:
# Split the data 70/15/15 into train, test and validation sets.
# A fixed random_state makes both splits reproducible across kernel restarts;
# seeding the stdlib `random` module does not affect scikit-learn's shuffling.
SPLIT_SEED = 18

# first split: stratified on the combined age category / sex label
train_df, testandvalidation_df = train_test_split(
    PA_effect_df,
    test_size=0.3,
    stratify=PA_effect_df.age_and_sex,
    random_state=SPLIT_SEED,
)
train_df = train_df.reset_index(drop=True)
testandvalidation_df = testandvalidation_df.reset_index(drop=True)

# second split: halve the held-out 30% into test and validation sets
# NOTE(review): this split is NOT stratified, so the age/sex proportions of the test
# and validation sets can drift apart. Adding
# stratify=testandvalidation_df.age_and_sex would fix that, but raises if any label
# has fewer than two samples in the held-out data -- confirm label counts first.
test_df, val_df = train_test_split(
    testandvalidation_df, test_size=0.5, random_state=SPLIT_SEED
)
test_df = test_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

In [27]:
# save each split to its own CSV file, in train / test / validation order
split_outputs = [
    (train_df, "../0.data-download/data/VAE_train_df.csv"),
    (test_df, "../0.data-download/data/VAE_test_df.csv"),
    (val_df, "../0.data-download/data/VAE_val_df.csv"),
]
for split_df, csv_path in split_outputs:
    save_dataframe(split_df, pathlib.Path(csv_path).resolve())

DataFrame saved to /home/juliacurd/gene_dependency_representations/0.data-download/data/VAE_train_df.csv. Shape: (670, 17109)
      ModelID  A1BG (1)  A1CF (29974)   A2M (2)  A2ML1 (144568)  \
0  ACH-001054 -0.148523      0.066810  0.078134        0.168272   
1  ACH-000015 -0.153272     -0.039229 -0.064208        0.079199   
2  ACH-001961 -0.069971     -0.068830  0.032941        0.064824   

   A3GALT2 (127550)  A4GALT (53947)  A4GNT (51146)  AAAS (8086)  AACS (65985)  \
0         -0.183760        0.045006       0.076497    -0.019024     -0.123016   
1         -0.068063       -0.029462       0.057602    -0.066563      0.021617   
2         -0.091528       -0.066410      -0.008933    -0.048042     -0.171945   

   ...  ZWINT (11130)  ZXDA (7789)  ZXDB (158586)  ZXDC (79364)  \
0  ...      -0.461615     0.050362       0.048965      0.149117   
1  ...      -0.183974     0.026359       0.059910      0.025818   
2  ...      -0.547815     0.006207       0.163901     -0.126165   

   ZYG11A (