In [35]:
import pandas as pd
import numpy as np

In [36]:
# Input tables: the SG90 Excel workbook, then the processed epigenetic table
# and the two genus tables (Genus_CLR.csv and Genus_Raw.csv), read in that order.
SG90 = pd.read_excel("SG90_TimWehnes_26082024.xlsx")
epigenetic_processed, genus_clr, genus_raw = (
    pd.read_csv(csv_name)
    for csv_name in ("Epigenetic_Processed.csv", "Genus_CLR.csv", "Genus_Raw.csv")
)

In [37]:
# Make Sample_ID a string on both tables. On the genus side a leading
# "<letter>_" prefix is stripped as well (presumably so the IDs line up with
# the epigenetic table -- verify against the raw files).
genus_clr["Sample_ID"] = (
    genus_clr["Sample_ID"]
    .astype(str)
    .str.replace(r'^[A-Za-z]_', '', regex=True)
)
epigenetic_processed["Sample_ID"] = epigenetic_processed["Sample_ID"].astype(str)

# Inner join keeps only samples present in both tables. Sample 80004 has had
# blood taken twice. Clashing column names from the epigenetic table get the
# "_epigenetic" suffix.
df = genus_clr.merge(
    epigenetic_processed,
    how="inner",
    on="Sample_ID",
    suffixes=("", "_epigenetic"),
)

In [38]:
# Look up every merged sample in the SG90 table, where it may be listed under
# either `subno` or `alt_ID`, and attach the selected covariates to df.

# Compare all identifiers as strings.
ids = df["Sample_ID"].astype(str)
SG90["subno"] = SG90["subno"].astype(str)
# Missing alt_IDs are dropped before the int -> str cast; index alignment on
# assignment leaves those rows as NaN, so they can never match a Sample_ID.
SG90["alt_ID"] = SG90["alt_ID"].dropna().astype(int).astype(str)

# Rows in SG90 where either subno or alt_ID matches a Sample_ID in df
matching_rows = SG90[(SG90["subno"].isin(ids)) | (SG90["alt_ID"].isin(ids))].copy()

# Sample_ID is the subno when the subno is a known sample, otherwise the alt_ID.
# Series.where does this column-wise: no per-row Python lambda, no repeated scan
# of ids for every row, and it also works when no SG90 row matched at all
# (DataFrame.apply(axis=1) on an empty frame does not return a Series).
matching_rows["Sample_ID"] = matching_rows["subno"].where(
    matching_rows["subno"].isin(ids),
    matching_rows["alt_ID"],
)

# Covariates carried over from SG90
covariate_columns = [
    'demogr_sex',
    'demogr_race',
    "ansur_frax_bmi",
    "subs_use_smoke_consolidated",
    "others_Blood_collection_date",
    "others_Saliva_collection_date",
]
specific_columns = matching_rows[covariate_columns]

# Sample_ID followed by the selected covariates
result_df = pd.concat([matching_rows[["Sample_ID"]], specific_columns], axis=1)

# Left merge: every row of df is kept; samples without an SG90 match get NaN
# covariates, and a Sample_ID that occurs several times in result_df yields
# several rows in merged_df.
merged_df = pd.merge(df, result_df, on="Sample_ID", how="left")

In [39]:
# All rows whose Sample_ID occurs more than once in merged_df
# (not used again in this cell).
is_duplicated_id = merged_df.duplicated('Sample_ID', keep=False)
duplicate_sample_ids = merged_df[is_duplicated_id]

# Rows 0, 1, 3 and 7 are removed because of a likely wrong collection date: the
# measurements are identical, so the sample cannot have been collected twice.
# NOTE(review): these are hard-coded index labels of the current merge output;
# re-check them against duplicate_sample_ids whenever the input files change.
merged_df = merged_df.drop([0, 1, 3, 7]).reset_index(drop=True)

# Flag rows whose blood and saliva collection dates differ.
# NOTE(review): a missing value never compares equal to anything, so rows with
# a missing date are flagged as unequal too -- confirm that is intended.
date_mismatch = merged_df['others_Blood_collection_date'].ne(
    merged_df['others_Saliva_collection_date']
)
unequal_dates_rows = merged_df[date_mismatch]

# Remove the flagged rows (7 rows, per the original note) and renumber.
merged_df = merged_df.loc[~date_mismatch].reset_index(drop=True)

# The date columns are no longer needed once the check is done.
df = merged_df.drop(
    columns=[
        "Date_of_blood_collection",
        "others_Saliva_collection_date",
        "others_Blood_collection_date",
    ]
)

In [40]:
# Median imputation for BMI and smoking. For BMI, extreme outliers are
# replaced by the median as well.

# Q1 (25th percentile), Q3 (75th percentile) and their difference, the IQR
Q1 = df['ansur_frax_bmi'].quantile(0.25)
Q3 = df['ansur_frax_bmi'].quantile(0.75)
IQR = Q3 - Q1

# Outlier fences sit 100 x IQR outside the quartiles, far wider than the
# usual 1.5 x IQR rule, so only extreme values are flagged.
BMI_IQR_FENCE_MULTIPLIER = 100
lower_bound = Q1 - (BMI_IQR_FENCE_MULTIPLIER * IQR)
upper_bound = Q3 + (BMI_IQR_FENCE_MULTIPLIER * IQR)

# Rows where 'ansur_frax_bmi' lies below the lower or above the upper fence
outliers = (df['ansur_frax_bmi'] < lower_bound) | (df['ansur_frax_bmi'] > upper_bound)

# The median is taken before any value is replaced
median_value = df['ansur_frax_bmi'].median()

# Replace outliers with the median
df.loc[outliers, 'ansur_frax_bmi'] = median_value

# Replace NaN values with the median as well. The result is assigned back
# instead of calling fillna(inplace=True) on the column selection: that chained
# in-place form warns in pandas 2.x and leaves df unchanged under Copy-on-Write.
df['ansur_frax_bmi'] = df['ansur_frax_bmi'].fillna(median_value)

# Fill missing smoking values with the column median (same assignment form)
median_value = df['subs_use_smoke_consolidated'].median()
df['subs_use_smoke_consolidated'] = df['subs_use_smoke_consolidated'].fillna(median_value)

In [41]:
# Write the current df to Table1_Input.csv without the index column.
df.to_csv("Table1_Input.csv", index = False)

In [42]:
# Same pipeline as for the CLR table above, applied to the raw genus data:
# normalise IDs, merge with the epigenetic table, attach SG90 covariates, drop
# duplicate / date-mismatch rows, impute BMI and smoking. Afterwards the
# covariates are renamed, a Better/Worse group is derived, the categorical
# covariates are one-hot encoded and the result is written to CSV.

# --- Normalise Sample_ID and merge ------------------------------------------
# Cast to string; on the genus side also strip a leading "<letter>_" prefix.
genus_raw["Sample_ID"] = genus_raw["Sample_ID"].astype(str).replace(r'^[A-Za-z]_', '', regex=True)
epigenetic_processed['Sample_ID'] = epigenetic_processed['Sample_ID'].astype(str)

# Merge on overlapping samples; sample 80004 has had blood taken twice
df = pd.merge(genus_raw, epigenetic_processed, on='Sample_ID', suffixes=('', '_epigenetic'))

# --- Attach SG90 covariates -------------------------------------------------
# Compare Sample_ID, subno and alt_ID as strings
ids = df["Sample_ID"].astype(str)
SG90["subno"] = SG90["subno"].astype(str)
# Missing alt_IDs are dropped before the int -> str cast; index alignment on
# assignment leaves those rows as NaN, so they can never match a Sample_ID.
SG90["alt_ID"] = SG90["alt_ID"].dropna().astype(int).astype(str)

# Rows in SG90 where either subno or alt_ID matches a Sample_ID in df
matching_rows = SG90[(SG90["subno"].isin(ids)) | (SG90["alt_ID"].isin(ids))].copy()

# Sample_ID is the subno when the subno is a known sample, otherwise the alt_ID.
# Done column-wise with Series.where rather than a row-wise apply: no per-row
# scan of ids, and it also works when no SG90 row matched.
matching_rows["Sample_ID"] = matching_rows["subno"].where(
    matching_rows["subno"].isin(ids),
    matching_rows["alt_ID"],
)

# Covariates carried over from SG90
covariate_columns = [
    'demogr_sex',
    'demogr_race',
    "ansur_frax_bmi",
    "subs_use_smoke_consolidated",
    "others_Blood_collection_date",
    "others_Saliva_collection_date",
]
specific_columns = matching_rows[covariate_columns]

# Sample_ID followed by the selected covariates
result_df = pd.concat([matching_rows[["Sample_ID"]], specific_columns], axis=1)
# Left merge: every row of df is kept; unmatched samples get NaN covariates
merged_df = pd.merge(df, result_df, on="Sample_ID", how="left")

# --- Remove duplicate and inconsistent rows ---------------------------------
# All rows whose Sample_ID occurs more than once (not used again in this cell)
duplicate_sample_ids = merged_df[merged_df.duplicated('Sample_ID', keep=False)]

# Rows 0, 1, 3 and 7 are removed because of a likely wrong collection date: the
# measurements are identical, so the sample cannot have been collected twice.
# NOTE(review): the same hard-coded labels are used for the CLR table; this
# assumes the raw merge yields the rows in the same order -- confirm against
# duplicate_sample_ids.
merged_df = merged_df.drop([0, 1, 3, 7])
merged_df = merged_df.reset_index(drop=True)

# Rows where the blood and saliva collection dates are not equal.
# NOTE(review): a missing value never compares equal, so rows with a missing
# date are flagged too -- confirm that is intended.
unequal_dates_rows = merged_df[merged_df['others_Blood_collection_date'] != merged_df['others_Saliva_collection_date']]

# Remove the rows where the dates are not equal (7 rows, per the original note)
merged_df = merged_df.drop(unequal_dates_rows.index)
merged_df = merged_df.reset_index(drop=True)

df = merged_df.drop(columns=["Date_of_blood_collection", "others_Saliva_collection_date", "others_Blood_collection_date"])

# --- Median imputation for BMI and smoking ----------------------------------
# Q1 (25th percentile), Q3 (75th percentile) and their difference, the IQR
Q1 = df['ansur_frax_bmi'].quantile(0.25)
Q3 = df['ansur_frax_bmi'].quantile(0.75)
IQR = Q3 - Q1

# Outlier fences sit 100 x IQR outside the quartiles, far wider than the
# usual 1.5 x IQR rule, so only extreme values are flagged.
BMI_IQR_FENCE_MULTIPLIER = 100
lower_bound = Q1 - (BMI_IQR_FENCE_MULTIPLIER * IQR)
upper_bound = Q3 + (BMI_IQR_FENCE_MULTIPLIER * IQR)

# Rows where 'ansur_frax_bmi' lies below the lower or above the upper fence
outliers = (df['ansur_frax_bmi'] < lower_bound) | (df['ansur_frax_bmi'] > upper_bound)

# The median is taken before any value is replaced
median_value = df['ansur_frax_bmi'].median()

# Replace outliers with the median
df.loc[outliers, 'ansur_frax_bmi'] = median_value

# Replace NaN values with the median as well. Assigned back instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form
# warns in pandas 2.x and leaves df unchanged under Copy-on-Write.
df['ansur_frax_bmi'] = df['ansur_frax_bmi'].fillna(median_value)

# Fill missing smoking values with the column median.
# NOTE(review): Smoking is one-hot encoded below; a non-integer median would
# show up as its own category -- confirm the median is a valid level.
median_value = df['subs_use_smoke_consolidated'].median()
df['subs_use_smoke_consolidated'] = df['subs_use_smoke_consolidated'].fillna(median_value)

# --- Rename covariates, derive the group, encode ----------------------------
df = df.rename(columns={
    'Age_at_blood_collection': 'Age',
    'demogr_sex': 'Sex',
    'demogr_race': "Race",
    "ansur_frax_bmi": "BMI",
    "subs_use_smoke_consolidated": "Smoking",
})

# 'Worse' when Epigenetic_deviation is at or above the cohort median, else
# 'Better'. The median is computed once rather than once per row.
deviation_median = np.median(df["Epigenetic_deviation"])
df['Group'] = np.where(df['Epigenetic_deviation'] >= deviation_median, 'Worse', 'Better')

# Columns to remove before export
to_remove = [
    'Epigenetic_average'
]
df = df.drop(columns=to_remove)

# One-hot encode the categorical covariates, keeping every level
df_encoded = pd.get_dummies(df, columns=['Race', 'Sex', 'Smoking'], drop_first=False)
# Store boolean columns as 1/0. An explicit cast replaces the frame-wide
# replace({True: 1, False: 0}), which relies on deprecated downcasting.
bool_columns = df_encoded.select_dtypes(include="bool").columns
df = df_encoded.astype({column: int for column in bool_columns})

df.to_csv("FINAL_GENUS_TAXA_RAW.csv", index=False)