In [4]:
import sys
import pathlib
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

sys.path.insert(0, "../utils/")
from data_loader import load_data
from sklearn.model_selection import train_test_split
import random

In [5]:
def scale_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Min-max scales the interior columns of a DataFrame to a 0-1 range.

    The first column (the ID) and the last column (the combined age-and-sex
    label in this notebook) are carried through unchanged. Every column in
    between is scaled independently by a MinMaxScaler that is fitted on ``df``
    itself.

    Parameters:
    df (pd.DataFrame): The input DataFrame, laid out as
        [ID, feature_1, ..., feature_n, label].

    Returns:
    pd.DataFrame: A new DataFrame with the same columns, column order and row
        index as ``df``. The input is not modified.

    Raises:
    ValueError: If ``df`` has fewer than three columns, i.e. there is no
        column between the ID and the label to scale.
    """
    col_num = df.shape[1]
    if col_num < 3:
        raise ValueError(
            "scale_dataframe expects an ID column, at least one column to "
            f"scale, and a trailing label column; got {col_num} column(s)."
        )

    features = df.iloc[:, 1:col_num - 1]

    scaler = MinMaxScaler(feature_range=(0, 1))
    # Rebuild the scaled block on the caller's index. Wrapping the raw array in
    # a default RangeIndex frame would make the ID/label columns misalign (and
    # turn into NaN) whenever ``df`` is not indexed 0..n-1, e.g. straight out
    # of train_test_split.
    scaled_features = pd.DataFrame(
        scaler.fit_transform(features),
        index=df.index,
        columns=features.columns,
    )

    # All three pieces share df.index, so this is a purely positional join.
    scaled_df = pd.concat(
        [df.iloc[:, :1], scaled_features, df.iloc[:, col_num - 1:]],
        axis=1,
    )
    scaled_df.columns = df.columns

    return scaled_df

In [6]:
def save_dataframe(df, file_path: pathlib.Path):
    """
    Writes a DataFrame to CSV and prints a short confirmation.

    The row index is discarded (rows are renumbered from 0 and the index is
    not written to the file). After writing, the destination, the frame's
    shape and its first three rows are printed.

    Parameters:
    df (pd.DataFrame): The DataFrame to save.
    file_path (pathlib.Path): Destination of the CSV file.
    """
    frame = df.reset_index(drop=True)
    frame.to_csv(file_path, index=False)

    preview = frame.head(3)
    print(f"DataFrame saved to {file_path}. Shape: {frame.shape}")
    print(preview)

In [7]:
# Seed every random number generator this notebook relies on.
# scikit-learn's train_test_split draws from NumPy's global generator when it
# is not given a random_state, so seeding Python's `random` module alone does
# not make the train/test/validation split reproducible.
SEED = 18
random.seed(SEED)
np.random.seed(SEED)
# Sanity check: the first draw after seeding should be identical on every run.
print(random.random())

0.18126486333322134


In [8]:
# Load all of the data: load_data (from ../utils/data_loader) returns a pair that
# is unpacked into model_df and effect_df. Passing adult_or_pediatric="all"
# presumably disables any age-based filtering inside the loader - TODO confirm
# against data_loader.load_data.
data_directory = "../0.data-download/data/"
model_df, effect_df = load_data(data_directory, adult_or_pediatric="all")

In [9]:
# Verify that the ModelIDs in model_df and effect_df are aligned row by row.
# The result is stored on model_df as the strings "True"/"False". The misspelled
# column and variable names are kept as-is because later cells may refer to them.
model_df["ID_allignment_verify"] = np.where(
    effect_df["ModelID"] == model_df["ModelID"], "True", "False"
)
verrify = len(model_df["ID_allignment_verify"].unique())
print(model_df["ID_allignment_verify"])
print(
    f"There is {verrify} output object contained in the ID_allignment_verify column \n"
)
# A single unique value is not proof of alignment on its own: the column would
# also hold one unique value if *every* row mismatched. Fail loudly instead of
# relying on someone reading the printed output.
assert (model_df["ID_allignment_verify"] == "True").all(), (
    "ModelIDs in model_df and effect_df are not aligned row by row"
)

0       True
1       True
2       True
3       True
4       True
        ... 
1145    True
1146    True
1147    True
1148    True
1149    True
Name: ID_allignment_verify, Length: 1150, dtype: object
There is 1 output object contained in the ID_allignment_verify column 



In [10]:
# Attach a combined "<AgeCategory>_<Sex>" label to the gene effect dataframe as
# a single new column, taking both parts from model_df
age_labels = model_df["AgeCategory"].astype(str)
sex_labels = model_df["Sex"].astype(str)
presplit_effect_df = effect_df.assign(age_and_sex=age_labels + "_" + sex_labels)
presplit_effect_df

Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-000217,0.026209,-0.155361,0.038405,0.067294,-0.328356,-0.062923,0.028699,-0.068602,-0.039810,...,-0.114723,0.070043,-0.023054,0.006781,-0.063557,-0.183658,0.066746,-0.113198,-0.297841,Adult_Female
1,ACH-001270,-0.125700,-0.190342,0.034115,0.114257,0.041126,-0.207532,-0.137155,0.258718,0.117235,...,-0.266138,0.022924,0.021368,-0.096022,0.126506,-0.068497,0.075919,-0.352637,-0.662216,Pediatric_Male
2,ACH-000665,-0.128845,-0.123627,0.217786,0.076074,-0.108984,0.046124,0.038975,-0.277239,0.091650,...,-0.464727,0.059420,0.008851,-0.140457,-0.074054,-0.278414,0.079461,-0.186773,-0.475611,Adult_Male
3,ACH-000040,-0.114306,-0.038925,-0.124451,-0.059078,-0.150314,0.029112,-0.032038,0.063007,0.212694,...,-0.341150,-0.002181,0.197902,-0.144776,-0.133535,-0.125042,0.035534,-0.237621,-0.355940,Adult_Male
4,ACH-001538,-0.085269,-0.057841,-0.090274,0.123304,-0.207395,-0.080852,-0.037714,-0.039654,0.073886,...,-0.406865,0.046642,-0.023465,0.119561,-0.094717,-0.247352,0.021400,-0.214985,-0.436243,Adult_Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1145,ACH-001075,-0.206734,0.051884,-0.031207,-0.044838,-0.186872,-0.005355,0.062855,-0.113544,0.051723,...,-0.236450,0.252012,0.423887,-0.009196,0.006980,-0.239187,0.067075,-0.009195,-0.422094,Adult_Female
1146,ACH-001525,-0.102617,-0.180084,0.053004,0.117906,-0.031695,-0.261150,-0.110685,-0.054434,-0.162697,...,-0.706123,0.111336,0.085569,0.011036,-0.043573,-0.169674,0.122167,-0.250975,-0.494760,Adult_Female
1147,ACH-000455,-0.025397,-0.106317,0.038565,0.050108,-0.208824,-0.159086,0.091599,-0.090665,0.028743,...,-0.511418,-0.310188,-0.149804,-0.058541,-0.000196,-0.164163,-0.091798,-0.207863,-0.217517,Adult_Male
1148,ACH-000619,-0.176008,-0.165673,0.018749,0.142338,-0.078119,0.049097,0.093431,-0.016408,-0.148808,...,-0.469174,0.127521,0.114103,-0.092413,0.010859,-0.109446,0.189834,-0.145411,-0.306469,Adult_Male


In [11]:
# Keep only the per-category sample frames whose AgeCategory is Adult or Pediatric
groups = model_df.groupby("AgeCategory")
df_list = [
    category_df
    for category, category_df in groups
    if category in ("Adult", "Pediatric")
]

# Stack the retained frames and reorder the rows so that ModelIDs are in
# alphabetical order, with ModelID restored as a regular column
new_df = (
    pd.concat(df_list, axis=0)
    .set_index("ModelID")
    .sort_index(ascending=True)
    .reset_index()
)

In [12]:
# ModelIDs of the pediatric and adult samples kept in new_df
PA_effect_IDs = new_df["ModelID"].tolist()

# Of those, the IDs that also appear in the gene effect data
PA_IDs = set(PA_effect_IDs).intersection(presplit_effect_df["ModelID"].tolist())

# Gene effect rows restricted to those shared ModelIDs, renumbered from 0
is_pa_sample = presplit_effect_df["ModelID"].isin(PA_IDs)
PA_effect_df = presplit_effect_df.loc[is_pa_sample].reset_index(drop=True)

In [13]:
# Split the data 70/15/15 into train / test / validation sets.
# Both splits get an explicit random_state: without one, train_test_split draws
# from NumPy's global generator, which random.seed() does not control, so the
# saved splits would differ on every run.
SPLIT_SEED = 18

# First split: 70% train vs. 30% held out, stratified on age category and sex.
train_df, testandvalidation_df = train_test_split(
    PA_effect_df,
    test_size=0.3,
    stratify=PA_effect_df.age_and_sex,
    random_state=SPLIT_SEED,
)
train_df = train_df.reset_index(drop=True)
testandvalidation_df = testandvalidation_df.reset_index(drop=True)

# Second split: halve the held-out rows into test and validation.
# NOTE(review): this split is not stratified. Adding stratify= here would raise
# if any age_and_sex class has fewer than two held-out members, so it is left
# unchanged - confirm whether stratification is wanted.
test_df, val_df = train_test_split(
    testandvalidation_df, test_size=0.5, random_state=SPLIT_SEED
)
test_df = test_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

In [14]:
# Scale each split, in train -> test -> validation order.
# NOTE(review): scale_dataframe fits a new MinMaxScaler on whatever frame it is
# given, so each split is scaled against its own min/max rather than the
# training set's - confirm this is intended.
train_scaled_df, test_scaled_df, val_scaled_df = (
    scale_dataframe(split_df) for split_df in (train_df, test_df, val_df)
)

In [15]:
# Save each scaled split to its CSV file (train, then test, then validation)
split_outputs = {
    "../0.data-download/data/VAE_train_df.csv": train_scaled_df,
    "../0.data-download/data/VAE_test_df.csv": test_scaled_df,
    "../0.data-download/data/VAE_val_df.csv": val_scaled_df,
}
for csv_path, scaled_split in split_outputs.items():
    save_dataframe(scaled_split, pathlib.Path(csv_path).resolve())

DataFrame saved to ../0.data-download/data/VAE_train_df.csv. Shape: (670, 17109)
      ModelID  A1BG (1)  A1CF (29974)   A2M (2)  A2ML1 (144568)  \
0  ACH-000128  0.548047      0.480271  0.502003        0.519879   
1  ACH-000422  0.638218      0.548044  0.712177        0.411060   
2  ACH-002048  0.425904      0.474357  0.600495        0.628317   

   A3GALT2 (127550)  A4GALT (53947)  A4GNT (51146)  AAAS (8086)  AACS (65985)  \
0          0.697124        0.565042       0.363295     0.582600      0.728632   
1          0.592794        0.465297       0.307112     0.568652      0.633672   
2          0.661065        0.450390       0.348097     0.608687      0.664541   

   ...  ZWINT (11130)  ZXDA (7789)  ZXDB (158586)  ZXDC (79364)  \
0  ...       0.520232     0.519188       0.580067      0.546760   
1  ...       0.639102     0.746709       0.651777      0.322917   
2  ...       0.426195     0.576058       0.625954      0.489699   

   ZYG11A (440590)  ZYG11B (79699)  ZYX (7791)  ZZEF1 (2