This notebook checks for correlations between the principal components (PCs) and the covariates, to find out whether a PC merely reflects the information already contained in a covariate.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Input locations: the covariate table (comma-separated, pandas' default) and
# the PC table (space-separated text file).
covariates_path = "/dhc/groups/mpws2022cl1/input/filtered_biobank_check_cov_corr.csv"
pcs_path = (
    "/dhc/groups/mpws2022cl1/output/50000_GRAY_50_minMaxScaling_ef_mc_2023_02_16_15_53_44_2023_03_06_11_29_52/"
    "resnet50_50_minMaxScaling_ef_mc_2023_02_16_15_53_44_L4.txt"
)

# Load both tables into DataFrames for the cells below.
df_covariates = pd.read_csv(covariates_path)
df_pcs = pd.read_csv(pcs_path, sep=' ')

In [None]:
# Quick exploratory overview of the covariate table: dtypes, missing values,
# summary statistics and one plot per column.

# Check data types
print(df_covariates.dtypes)

# Check for missing values
print(df_covariates.isnull().sum())

# Summary statistics
print(df_covariates.describe())

# Histograms and box plots for float columns, bar charts for object columns
for col in df_covariates.columns:
    column = df_covariates[col]
    if pd.api.types.is_float_dtype(column):
        # Matplotlib's boxplot does not filter NaN: a single missing value
        # makes the quartiles NaN and the box is drawn empty. Drop missing
        # values first so both plots show the observed data only.
        observed = column.dropna()
        fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(10, 5))
        ax1.hist(observed, bins=20)
        ax1.set_title(col + ' histogram')
        ax2.boxplot(observed)
        ax2.set_title(col + ' box plot')
        plt.show()
    elif column.dtype == 'object':
        # Frequency of each distinct value (value_counts ignores NaN).
        plt.figure()
        column.value_counts().plot(kind='bar')
        plt.title(col)
        plt.show()

In [None]:
# Cast the discrete covariates to pandas' categorical dtype, one column at a
# time, writing each converted column back into the same DataFrame.
for categorical_col in ('Sex', 'Pacemaker', 'Past_tobacco', 'Assessment_center'):
    df_covariates[categorical_col] = df_covariates[categorical_col].astype('category')

# Remove the 'LAX' column from the covariate table
df_covariates = df_covariates.drop(columns=['LAX'])

# Show the resulting dtype of every remaining column
print(df_covariates.dtypes)

In [None]:
# View the categories of the 'Sex' column
print(df_covariates['Sex'].cat.categories)
# Check that the 'Sex' column has a categorical dtype. The isinstance check
# against pd.CategoricalDtype replaces pd.api.types.is_categorical_dtype,
# which is deprecated since pandas 2.1 and emits a warning.
print(isinstance(df_covariates['Sex'].dtype, pd.CategoricalDtype))

### Check for correlation between PCs and covariates

In [None]:
import pandas as pd

# Rename the 'IID' column of the PC table to 'eid' so that both tables share
# the same key column name.
df_pcs = df_pcs.rename(columns={'IID': 'eid'})

# Merge the two DataFrames on 'eid'. pd.merge defaults to an inner join, so
# only rows whose 'eid' occurs in both tables are kept.
merged_df = pd.merge(df_covariates, df_pcs, on='eid')

# PCs and covariates to correlate; customize both lists as needed.
pcs = ['PC_0', 'PC_1', 'PC_2', 'PC_3', 'PC_4', 'PC_5', 'PC_6', 'PC_7', 'PC_8', 'PC_9']
covariates = ['Sex', 'Assessment_center', 'Systolic_manual', 'Past_tobacco', 'Pacemaker',
              'Systolic_automatic', 'EF', 'end_diastolic_volume', 'cardiac_output',
              'cardiac_index', 'heart_rate', 'BMI', 'Age']

# Covariates that are one-hot encoded before correlating.
categorical_covariates = ['Sex', 'Assessment_center', 'Past_tobacco', 'Pacemaker']

# Remember which columns exist before encoding so that the dummy columns can
# be identified exactly afterwards.
columns_before_encoding = set(merged_df.columns)

# Get dummy variables for the categorical covariates.
# NOTE(review): with the default dummy_na=False a row with a missing category
# gets all-zero dummies instead of NaN, so such rows still enter the
# correlation -- confirm this is intended.
merged_df = pd.get_dummies(merged_df, columns=categorical_covariates)

dummy_columns = {col for col in merged_df.columns if col not in columns_before_encoding}
continuous_covariates = {cov for cov in covariates if cov not in categorical_covariates}

# Make a dropped covariate visible instead of silently ignoring it.
missing_covariates = sorted(continuous_covariates.difference(merged_df.columns))
if missing_covariates:
    print(f"Warning: covariates not found in the merged data: {missing_covariates}")

# Update the covariates list to the columns that actually exist after
# encoding. Continuous covariates are matched by exact name and dummy columns
# are exactly those added by get_dummies; substring matching would also pick
# up unrelated columns (e.g. any column whose name merely contains 'EF' or
# 'Age'). The order follows the column order of merged_df.
covariates = [
    col for col in merged_df.columns
    if col in continuous_covariates or col in dummy_columns
]

# Compute the correlation matrix (DataFrame.corr excludes NaN pairwise)
corr_matrix = merged_df[pcs + covariates].corr()

# Keep only the PC-vs-covariate part: PCs as rows, covariates as columns
pc_covariate_correlations = corr_matrix.loc[pcs, covariates]

print(pc_covariate_correlations)

In [None]:
# Compute the correlation of every (PC, covariate) pair using pairwise
# deletion: Series.corr aligns the two columns and ignores rows where either
# value is missing, so no explicit non-missing mask is needed.
# The result is built as a float DataFrame (PCs as rows, covariates as
# columns); filling an initially empty DataFrame cell by cell would leave it
# with dtype 'object', which breaks later numeric operations on the table.
pc_covariate_correlations = pd.DataFrame(
    {
        covariate: [merged_df[pc].corr(merged_df[covariate]) for pc in pcs]
        for covariate in covariates
    },
    index=pcs,
    columns=covariates,
    dtype=float,
)

print(pc_covariate_correlations)