In [30]:
import sys
import pathlib
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

sys.path.insert(0, "../0.data-download/scripts/")
from data_loader import load_data
from sklearn.model_selection import train_test_split
import random

In [31]:
def scale_dataframe(df):
    """
    Scales the gene effect data columns of a DataFrame to a 0-1 range.
    The first column (ID) and the last column (the label column, e.g. the
    combined age/sex category) are carried over unchanged; every column in
    between is min-max scaled independently.

    The scaler is fit on the DataFrame that is passed in, so the 0-1 range is
    relative to the minimum and maximum of each column within that DataFrame.

    Parameters:
    df (pd.DataFrame): The input DataFrame. Its index may hold any labels;
        rows are matched by position, not by index label.

    Returns:
    pd.DataFrame: The scaled DataFrame, with the same column names and column
        order as the input and a fresh 0..n-1 index.
    """
    col_num = df.shape[1]
    df_to_scale = df.iloc[:, 1:col_num - 1]

    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_values = scaler.fit_transform(df_to_scale)

    # fit_transform returns a bare array, so this frame gets a new RangeIndex.
    scaled_df = pd.DataFrame(scaled_values)

    # Insert the untouched columns as plain arrays. Inserting them as Series
    # would align on index labels and silently produce NaN (or shifted) IDs and
    # labels whenever the input index is not already 0..n-1.
    scaled_df.insert(0, df.columns[0], df.iloc[:, 0].to_numpy())
    scaled_df.insert(col_num - 1, df.columns[col_num - 1], df.iloc[:, col_num - 1].to_numpy())

    # Restore the original gene column names (the array had lost them).
    scaled_df.columns = df.columns

    return scaled_df

In [32]:
def save_dataframe(df, file_path):
    """
    Writes a DataFrame to disk as CSV and prints a short confirmation.

    The index is discarded before writing, so the file contains only the
    DataFrame's columns. After saving, the destination, the shape and the
    first three rows are printed.

    Parameters:
    df (pd.DataFrame): The DataFrame to save.
    file_path (str): The file path to save the DataFrame.
    """
    # Work on a re-indexed copy; the caller's DataFrame is left untouched.
    output_frame = df.reset_index(drop=True)
    output_frame.to_csv(file_path, index=False)

    # Resetting the index does not change the shape, so df.shape is still accurate.
    print(f"DataFrame saved to {file_path}. Shape: {df.shape}")
    preview = output_frame.head(3)
    print(preview)

In [33]:
# Seed the random number generators used in this notebook.
# random.seed() only covers Python's built-in `random` module. scikit-learn's
# train_test_split draws from NumPy's global generator when it is called without
# a random_state, so NumPy has to be seeded as well for the split to be
# repeatable across kernel restarts.
random.seed(18)
np.random.seed(18)

# Print one draw as a quick sanity check that the seed took effect.
print(random.random())

0.18126486333322134


In [34]:
# load all of the data
# load_data (imported from ../0.data-download/scripts/) returns two tables here:
#   model_df  - per-sample information; later cells read its ModelID, AgeCategory and Sex columns
#   effect_df - gene effect scores; later cells read its ModelID column plus one column per gene
# NOTE(review): adult_or_pediatric="all" presumably requests no age-based filtering -- confirm in data_loader.
data_directory = "../0.data-download/data/"
model_df, effect_df = load_data(data_directory, adult_or_pediatric="all")

In [35]:
# verifying that the ModelIDs in model_df and effect_df are aligned row by row
ids_match = effect_df["ModelID"] == model_df["ModelID"]
model_df["ID_allignment_verify"] = np.where(ids_match, "True", "False")
verrify = len(model_df["ID_allignment_verify"].unique())
print(model_df["ID_allignment_verify"])
print(
    f"There is {verrify} output object contained in the ID_allignment_verify column \n"
)

# A unique-value count of 1 is also what a column of all "False" would produce,
# so the printed summary cannot confirm alignment on its own. Later cells combine
# model_df and effect_df by row position, so stop here if any row disagrees.
mismatch_count = int((~ids_match).sum())
if mismatch_count:
    raise ValueError(
        f"{mismatch_count} ModelID rows differ between effect_df and model_df"
    )

0       True
1       True
2       True
3       True
4       True
        ... 
1145    True
1146    True
1147    True
1148    True
1149    True
Name: ID_allignment_verify, Length: 1150, dtype: object
There is 1 output object contained in the ID_allignment_verify column 



In [36]:
# Attach a combined "<AgeCategory>_<Sex>" label (e.g. "Adult_Male") to every row of
# the effect dataframe; the label is built from model_df and stored in one column.
presplit_effect_df = effect_df.assign(
    age_and_sex=(
        model_df["AgeCategory"]
        .astype(str)
        .add("_")
        .add(model_df["Sex"].astype(str))
    )
)
presplit_effect_df

Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-000493,-0.058546,0.017772,0.031485,0.064924,-0.184546,0.087126,0.135644,-0.154001,-0.350043,...,-0.044138,-0.078265,-0.124109,-0.158990,0.014519,-0.113870,-0.112283,-0.177767,0.064080,Adult_Male
1,ACH-001402,-0.219824,-0.056013,-0.033793,0.170722,-0.048435,-0.198198,0.067420,0.072234,-0.096077,...,-0.067355,0.134976,0.190439,-0.088172,0.081297,-0.013322,-0.040583,-0.171905,-0.184679,Adult_Female
2,ACH-000182,-0.046497,-0.063304,0.045283,0.050625,-0.169463,-0.132277,0.038710,-0.012527,-0.012499,...,-0.053581,0.036305,0.083472,0.058510,-0.220670,-0.130092,0.002176,-0.113953,-0.465732,Unknown_Unknown
3,ACH-002687,-0.115078,-0.060383,0.128903,0.225909,-0.072605,0.062308,-0.064730,-0.187485,-0.104159,...,-0.697842,0.034597,0.122978,0.053704,-0.043281,-0.237439,-0.180194,-0.068624,-0.681598,Unknown_Female
4,ACH-001737,-0.145301,0.104647,0.082256,0.009729,0.107109,-0.015192,-0.072029,0.045129,0.138690,...,-0.302597,0.008596,0.113471,-0.184810,0.095783,-0.160119,0.001214,-0.427298,-0.047863,Pediatric_Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1145,ACH-000302,-0.104711,-0.076339,-0.058086,-0.013366,-0.099378,-0.030453,0.024902,0.095911,-0.100080,...,-0.087587,-0.040326,0.042394,-0.084250,-0.090734,0.280032,-0.018050,-0.246463,-0.465200,Adult_Female
1146,ACH-001484,-0.086036,-0.054901,0.214454,0.051221,-0.155650,0.131712,0.189503,0.307411,0.082183,...,-0.267497,0.011082,0.128438,0.137536,-0.097451,-0.213779,0.029422,-0.213858,-0.151006,Adult_Female
1147,ACH-000172,-0.323562,-0.075648,0.087943,0.154574,0.149092,-0.178322,-0.067021,-0.492251,-0.031671,...,-0.699320,0.168781,0.065835,-0.130011,0.053945,-0.176800,-0.183588,-0.436515,-0.257045,Pediatric_Male
1148,ACH-002016,-0.032602,-0.035283,0.218523,0.098431,-0.071293,-0.114752,0.081615,0.034869,-0.228487,...,-0.344105,-0.135367,0.061430,-0.028888,0.036716,-0.457942,-0.034798,-0.232351,-0.592210,Adult_Female


In [37]:
groups = model_df.groupby("AgeCategory")

# keep only the groups whose age category is Adult or Pediatric
df_list = [
    category_df
    for category, category_df in groups
    if category in ("Adult", "Pediatric")
]

# stack the kept groups into one dataframe and order the rows by ModelID;
# going through the index also moves ModelID to the first column
new_df = (
    pd.concat(df_list, axis=0)
    .set_index("ModelID")
    .sort_index(ascending=True)
    .reset_index()
)

In [38]:
# ModelIDs of the samples that are labelled Adult or Pediatric
PA_effect_IDs = new_df["ModelID"].tolist()

# keep only those IDs that also appear in the gene effect dataframe
PA_IDs = set(PA_effect_IDs).intersection(presplit_effect_df["ModelID"].tolist())

# gene effect rows restricted to the Adult/Pediatric samples, re-indexed from 0
is_pediatric_or_adult = presplit_effect_df["ModelID"].isin(PA_IDs)
PA_effect_df = presplit_effect_df.loc[is_pediatric_or_adult].reset_index(drop=True)

In [39]:
# Split the data 70% / 15% / 15% into train / test / validation sets.
# Both calls receive an explicit random_state: without it train_test_split draws
# from NumPy's global generator, which random.seed() does not control, so the
# saved splits would change on every run.
SPLIT_SEED = 18

# first split: 70% train, stratified on the combined age category and sex label
train_df, testandvalidation_df = train_test_split(
    PA_effect_df,
    test_size=0.3,
    stratify=PA_effect_df.age_and_sex,
    random_state=SPLIT_SEED,
)
train_df = train_df.reset_index(drop=True)
testandvalidation_df = testandvalidation_df.reset_index(drop=True)

# second split: halve the held-out 30% into test and validation.
# NOTE(review): this split is not stratified, so label proportions may differ
# between test and validation. Adding stratify= here would raise if any label has
# fewer than two samples in the held-out set -- confirm before changing.
test_df, val_df = train_test_split(
    testandvalidation_df, test_size=0.5, random_state=SPLIT_SEED
)
test_df = test_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

In [40]:
# scale the train, test and validation dataframes (in that order)
# NOTE(review): scale_dataframe fits a new MinMaxScaler on whatever frame it is
# given, so each split is scaled with its own per-column min/max rather than with
# the training set's -- confirm this is intended.
train_scaled_df, test_scaled_df, val_scaled_df = (
    scale_dataframe(split_df) for split_df in (train_df, test_df, val_df)
)

In [41]:
# write each scaled split to its own CSV file (train, then test, then validation)
for scaled_split, csv_path in (
    (train_scaled_df, "../0.data-download/data/VAE_train_df.csv"),
    (test_scaled_df, "../0.data-download/data/VAE_test_df.csv"),
    (val_scaled_df, "../0.data-download/data/VAE_val_df.csv"),
):
    save_dataframe(scaled_split, csv_path)

DataFrame saved to ../0.data-download/data/VAE_train_df.csv. Shape: (670, 17109)
      ModelID  A1BG (1)  A1CF (29974)   A2M (2)  A2ML1 (144568)  \
0  ACH-000867  0.393729      0.588468  0.394263        0.497400   
1  ACH-001673  0.370708      0.620821  0.510124        0.529627   
2  ACH-000748  0.514124      0.654909  0.491309        0.579600   

   A3GALT2 (127550)  A4GALT (53947)  A4GNT (51146)  AAAS (8086)  AACS (65985)  \
0          0.485522        0.499825       0.573520     0.686206      0.667560   
1          0.496922        0.454240       0.571563     0.593982      0.644285   
2          0.681411        0.557961       0.518540     0.594385      0.592273   

   ...  ZWINT (11130)  ZXDA (7789)  ZXDB (158586)  ZXDC (79364)  \
0  ...       0.349098     0.631795       0.559795      0.277612   
1  ...       0.432205     0.586232       0.712660      0.381303   
2  ...       0.733186     0.640741       0.594472      0.523323   

   ZYG11A (440590)  ZYG11B (79699)  ZYX (7791)  ZZEF1 (2