In [4]:
1.We extracted data from the “UK Biobank”, identifying 3,290 cases with essential hypertension and co-occurring dementia, after excluding individuals with Parkinson’s disease, vascular dementia, frontotemporal dementia, Huntington’s disease, or Creutzfeldt-Jakob disease. A total of 87,524 individuals with essential hypertension but without any neurodegenerative conditions were included as controls and ran genome-wide association analysis using PLINK.2.0. 
2.APOE status was calculated using https://github.com/neurogenetics/APOE_genotypes?tab=readme-ov-file#1.
3.APOE status or APOE genotype was included in the covariate file.
4.We ran GWAS using PLINK.2.0 and 10 principle components, sex_At_birth, current age (2024-year of birth), age filter > 60 and APOE status. 

In [None]:
import pandas as pd
import subprocess
import sys
import numpy as np
import os
import shutil

In [5]:
def shell_do(command, log=False, return_log=False):
    print(f'Executing: {(" ").join(command.split())}', file=sys.stderr)

    res=subprocess.run(command.split(), stdout=subprocess.PIPE)

    if log:
        print(res.stdout.decode('utf-8'))
    if return_log:
        return(res.stdout.decode('utf-8'), res.stderr.decode('utf-8'))

# Interacting with Swiss Army Knife through Jupyter Notebooks
## What is Swiss Army Knife?
### Swiss Army Knife is a generic app which can be used to perform common file operations for the genotype data housed on DNANexus.
### Contains software such as plink, plink2, bcftools, etc.
#### Below we'll see an example of looping through each chromosome and extracting a set of variants.
##### -iin: input (need to specify each file).
##### -icmd: command to run
##### --instance-type: VM type to run the command on
##### --destination: output folder
#### Note: this will start 23 separate Swiss Army Knife jobs so make sure you are using a reasonable instance type.

In [6]:
import pandas as pd

In [7]:
!dx download ADRD_noFTD_noPARKINSONISM_noALS_noVD_noHT_noCJD_noPDD.CTRL_inclProxy_60older.plink_pheno.txt

Error: path "/opt/notebooks/ADRD_noFTD_noPARKINSONISM_noALS_noVD_noHT_noCJD_no
PDD.CTRL_inclProxy_60older.plink_pheno.txt" already exists but -f/--overwrite
was not set


In [8]:
df = pd.read_csv("ADRD_noFTD_noPARKINSONISM_noALS_noVD_noHT_noCJD_noPDD.CTRL_inclProxy_60older.plink_pheno.txt", sep = '\t')

In [9]:
count_1 = df['ADRD'].value_counts().get(1, 0)
count_2 = df['ADRD'].value_counts().get(2, 0)

print(f"Count of 1 in 'Pheno': {count_1}")
print(f"Count of 2 in 'Pheno': {count_2}")

Count of 1 in 'Pheno': 87524
Count of 2 in 'Pheno': 3290


In [10]:
!dx download UKB_EUR_UMAP_COVARIATES.txt

Error: path "/opt/notebooks/UKB_EUR_UMAP_COVARIATES.txt" already exists but
-f/--overwrite was not set


In [11]:
df1 = pd.read_csv("UKB_EUR_UMAP_COVARIATES.txt", sep = '\t')

In [14]:
# Merging the DataFrames on the 'ID' column
final = pd.merge(df, df1, on=['FID','IID'], how='inner')

In [15]:
final.to_csv("Pheno_New_final_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New.txt", sep='\t', index=False)

In [16]:
!dx upload Pheno_New_final_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New.txt

ID                                file-J13XGq0JbP2qBpJB8yQPvpQp
Class                             file
Project                           project-GkYf2zQJbP2Q3vFgf14863Gf
Folder                            /
Name                              Pheno_New_final_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New.txt
State                             [33mclosing[0m
Visibility                        visible
Types                             -
Properties                        -
Tags                              -
Outgoing links                    -
Created                           Mon Jun  9 18:22:36 2025
Created by                        vidhu
 via the job                      job-J13QB2jJbP2x0Pz725v7kPj7
Last modified                     Mon Jun  9 18:22:37 2025
Media type                        
archivalState                     "live"
cloudAccount                      "cloudaccount-dnanexus"


In [16]:
%%bash

seq 1 1 23 > chr_list.txt

FILE_DIR_A="/Imputed_Genotypes_2023/TOPMed_Plink2"
FILE_DIR_B="/ref_panel/chip_overlaps/"
# FILE_DIR_C="/Imputed_Genotypes_2023/TOPMed_Plink2/extract"

CHR_IDS=$(cat chr_list.txt)

for CHR in $CHR_IDS; do
    dx run swiss-army-knife -iin="chr${CHR}_pgen.pgen" \
    -iin="chr${CHR}_pgen.psam" \
    -iin="chr${CHR}_pgen.pvar" \
    -icmd='plink2 --pfile "$in_prefix" \
    --snps 19:44908684:T:C,19:44908822:C:T --make-bed \
    --out "$in_prefix".apoe_snps' -y  --brief --priority normal \
    --instance-type mem3_ssd3_x12 --destination "${projectid}:${FILE_DIR}/" 
done 

job-J0q4Kp8JbP2ZKZf9V5F37VY1
job-J0q4KpQJbP2k5k7B5Z8kX34q
job-J0q4Kq0JbP2pQPyYX2XPyZjb
job-J0q4Kq8JbP2XJQ0Zvf2bpj18
job-J0q4KqQJbP2jG3GGgxBzK7gB
job-J0q4Kv0JbP2pQPyYX2XPyZjg
job-J0q4Kv8JbP2pQPyYX2XPyZjk
job-J0q4KvQJbP2pQPyYX2XPyZjq
job-J0q4KvjJbP2Z4qz5fxzjkG6f
job-J0q4Kx8JbP2XJQ0Zvf2bpj1F
job-J0q4KxQJbP2V0VbPkP12vyPp
job-J0q4KxjJbP2fBk9FQYQ20zvG
job-J0q4Ky8JbP2V0VbPkP12vyPy
job-J0q4KyQJbP2v55f99FF2K5Bk
job-J0q4KyjJbP2vY2BvyyXPVz5Z
job-J0q4Kz8JbP2V0VbPkP12vyQ4
job-J0q4KzQJbP2yzY4kykqVKZ4b
job-J0q4KzjJbP2fBk9FQYQ20zvf
job-J0q4P00JbP2vY2BvyyXPVz60
job-J0q4P0QJbP2V0VbPkP12vyQX
job-J0q4P0jJbP2Z4qz5fxzjkG7Q
job-J0q4P10JbP2V0VbPkP12vyQZ
job-J0q4P1QJbP2vY2BvyyXPVz6K


In [None]:
%%bash

CHR=19

FILE_DIR_A="/Imputed_Genotypes_2023/TOPMed_Plink2"
FILE_DIR_B="/ref_panel/chip_overlaps/"
# FILE_DIR_C="/Imputed_Genotypes_2023/TOPMed_Plink2/extract"
dx run swiss-army-knife -iin="chr${CHR}_pgen.apoe_snps.bed" \
    -iin="chr${CHR}_pgen.apoe_snps.bim" \
    -iin="chr${CHR}_pgen.apoe_snps.fam" \
    -icmd='plink --bfile "$in_prefix" \
    --allow-no-sex \
    --recodeA \
    --out "$in_prefix".apoe_snps_2' -y  --brief --priority normal \
    --instance-type mem3_ssd3_x12 --destination "${projectid}:${FILE_DIR}/"

In [17]:
!dx download chr19_pgen.apoe_snps.apoe_snps.ped



In [18]:
 # Import necessary packages
import numpy as np
import pandas as pd
from functools import reduce

# Define input and output file names directly in the notebook
input_file = 'chr19_pgen.apoe_snps.apoe_snps.ped'  # Replace with the actual path to your .ped file
output_name = 'chr19_pgen.apoe_snps.apoe_snps_out.ped'     # Replace with the desired output file name (without suffix)

# Read in the .ped file and force column names
header_text = ["FID", "IID", "PAT", "MAT", "SEX", "PHENO", "rs429358", "rs7412"]
input_ped_df = pd.read_csv(input_file, sep='\t', header=None, names=header_text)

# Make a combined column by concatenating rs429358 and rs7412 genotypes
input_ped_df['rs429358_rs7412'] = input_ped_df['rs429358'].astype(str) + '_' + input_ped_df['rs7412']

# Initialize a dictionary to map the combined genotypes to APOE genotypes
apoe_genotypes_dict = {
    'CC_TT': 'e1/e1',
    'CT_TT': 'e1/e2',
    'TC_TT': 'e1/e2',
    'CC_CT': 'e1/e4',
    'CC_TC': 'e1/e4',
    'TT_TT': 'e2/e2',
    'TT_TC': 'e2/e3',
    'TT_CT': 'e2/e3',
    'TC_TC': 'e2/e4 or e1/e3',
    'CT_CT': 'e2/e4 or e1/e3',
    'TC_CT': 'e2/e4 or e1/e3',
    'CT_TC': 'e2/e4 or e1/e3',
    'TT_CC': 'e3/e3',
    'TC_CC': 'e3/e4',
    'CT_CC': 'e3/e4',
    'CC_CC': 'e4/e4'
}

# Map the combined genotype column to APOE genotypes
input_ped_df['APOE_GENOTYPE'] = input_ped_df['rs429358_rs7412'].map(apoe_genotypes_dict)

# Handle NaN values by marking unknown genotypes
input_ped_df['APOE_GENOTYPE'].fillna('unknown', inplace=True)

# Create a new DataFrame with the relevant columns
subset_geno_df = input_ped_df[['FID', 'IID', 'SEX', 'PHENO', 'APOE_GENOTYPE']]

## Generate counts and percentages

# Overall APOE genotype counts and percentages
counts_df = pd.DataFrame(subset_geno_df['APOE_GENOTYPE'].value_counts().reset_index())
counts_df.columns = ['APOE_GENOTYPE', 'TOTAL_COUNT']
counts_df['TOTAL_PERCENT'] = (counts_df['TOTAL_COUNT'] / subset_geno_df.shape[0]) * 100

# Separate into cases, controls, and missing phenotype
missing_pheno_df = subset_geno_df[subset_geno_df['PHENO'] == -9]
controls_df = subset_geno_df[subset_geno_df['PHENO'] == 1]
cases_df = subset_geno_df[subset_geno_df['PHENO'] == 2]

# Generate counts and percentages for missing phenotypes
missing_pheno_counts_df = pd.DataFrame(missing_pheno_df['APOE_GENOTYPE'].value_counts().reset_index())
missing_pheno_counts_df.columns = ['APOE_GENOTYPE', 'MISSING_PHENO_COUNT']
missing_pheno_counts_df['MISSING_PHENO_PERCENT'] = (missing_pheno_counts_df['MISSING_PHENO_COUNT'] / missing_pheno_df.shape[0]) * 100

# Generate counts and percentages for controls
controls_counts_df = pd.DataFrame(controls_df['APOE_GENOTYPE'].value_counts().reset_index())
controls_counts_df.columns = ['APOE_GENOTYPE', 'CONTROLS_COUNT']
controls_counts_df['CONTROLS_PERCENT'] = (controls_counts_df['CONTROLS_COUNT'] / controls_df.shape[0]) * 100

# Generate counts and percentages for cases
cases_counts_df = pd.DataFrame(cases_df['APOE_GENOTYPE'].value_counts().reset_index())
cases_counts_df.columns = ['APOE_GENOTYPE', 'CASES_COUNT']
cases_counts_df['CASES_PERCENT'] = (cases_counts_df['CASES_COUNT'] / cases_df.shape[0]) * 100

# Merge the dataframes to get a summary
dataframes_tomerge = [counts_df, missing_pheno_counts_df, controls_counts_df, cases_counts_df]
merged_summary_df = reduce(lambda left, right: pd.merge(left, right, on='APOE_GENOTYPE', how='outer'), dataframes_tomerge)

## Export results
complete_df_output = output_name + ".APOE_GENOTYPES.csv"
counts_df_output = output_name + ".APOE_SUMMARY.csv"

# Save the complete genotype file as .csv
subset_geno_df.to_csv(complete_df_output, index=False)
print(f"Your complete genotype file has been saved here: {complete_df_output}")

# Save the summary counts file as .csv
merged_summary_df.to_csv(counts_df_output, index=False)
print(f"The summary counts have been saved here: {counts_df_output}")

# Done!
print("Script execution complete!")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  input_ped_df['APOE_GENOTYPE'].fillna('unknown', inplace=True)


Your complete genotype file has been saved here: chr19_pgen.apoe_snps.apoe_snps_out.ped.APOE_GENOTYPES.csv
The summary counts have been saved here: chr19_pgen.apoe_snps.apoe_snps_out.ped.APOE_SUMMARY.csv
Script execution complete!


In [19]:
subset_geno_df['APOE_GENOTYPE'] = subset_geno_df['APOE_GENOTYPE'].replace({
    'e4/e4': 2,
    'e3/e4': 1
}).map({2: 2, 1: 1}).fillna(0).astype(int)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_geno_df['APOE_GENOTYPE'] = subset_geno_df['APOE_GENOTYPE'].replace({


In [20]:
subset_geno_df['APOE_GENOTYPE'] = subset_geno_df['APOE_GENOTYPE'].replace({
    'e4/e4': 2,
    'e3/e4': 1
}).map({2: 2, 1: 1}).fillna(0).astype(int)

# Display the updated dataframe
subset_geno_df.to_csv("APOE_Status.txt", sep='\t', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_geno_df['APOE_GENOTYPE'] = subset_geno_df['APOE_GENOTYPE'].replace({


In [21]:
!dx upload APOE_Status.txt

ID                                file-J13XGy8JbP2qBpJB8yQPvpV8
Class                             file
Project                           project-GkYf2zQJbP2Q3vFgf14863Gf
Folder                            /
Name                              APOE_Status.txt
State                             [33mclosing[0m
Visibility                        visible
Types                             -
Properties                        -
Tags                              -
Outgoing links                    -
Created                           Mon Jun  9 18:22:49 2025
Created by                        vidhu
 via the job                      job-J13QB2jJbP2x0Pz725v7kPj7
Last modified                     Mon Jun  9 18:22:50 2025
Media type                        
archivalState                     "live"
cloudAccount                      "cloudaccount-dnanexus"


In [22]:
!dx upload chr19_pgen.apoe_snps.apoe_snps_out.ped.APOE_GENOTYPES.csv

ID                                file-J13XGz0JbP2ZvK40VyKVyzx3
Class                             file
Project                           project-GkYf2zQJbP2Q3vFgf14863Gf
Folder                            /
Name                              chr19_pgen.apoe_snps.apoe_snps_out.ped.APOE_GENOTYPES.csv
State                             [33mclosing[0m
Visibility                        visible
Types                             -
Properties                        -
Tags                              -
Outgoing links                    -
Created                           Mon Jun  9 18:22:52 2025
Created by                        vidhu
 via the job                      job-J13QB2jJbP2x0Pz725v7kPj7
Last modified                     Mon Jun  9 18:22:53 2025
Media type                        
archivalState                     "live"
cloudAccount                      "cloudaccount-dnanexus"


In [23]:
!dx upload chr19_pgen.apoe_snps.apoe_snps_out.ped.APOE_SUMMARY.csv

ID                                file-J13XGzQJbP2ZvK40VyKVyzx5
Class                             file
Project                           project-GkYf2zQJbP2Q3vFgf14863Gf
Folder                            /
Name                              chr19_pgen.apoe_snps.apoe_snps_out.ped.APOE_SUMMARY.csv
State                             [33mclosing[0m
Visibility                        visible
Types                             -
Properties                        -
Tags                              -
Outgoing links                    -
Created                           Mon Jun  9 18:22:54 2025
Created by                        vidhu
 via the job                      job-J13QB2jJbP2x0Pz725v7kPj7
Last modified                     Mon Jun  9 18:22:55 2025
Media type                        
archivalState                     "live"
cloudAccount                      "cloudaccount-dnanexus"


In [24]:
!head chr19_pgen.apoe_snps.apoe_snps_out.ped.APOE_SUMMARY.csv

APOE_GENOTYPE,TOTAL_COUNT,TOTAL_PERCENT,MISSING_PHENO_COUNT,MISSING_PHENO_PERCENT,CONTROLS_COUNT,CONTROLS_PERCENT,CASES_COUNT,CASES_PERCENT
e2/e2,3125,0.6413163711138793,3125,0.6413163711138793,,,,
e2/e3,59312,12.172082113122052,59312,12.172082113122052,,,,
e2/e4 or e1/e3,12132,2.4897440685931467,12132,2.4897440685931467,,,,
e3/e3,286304,58.755661540924194,286304,58.755661540924194,,,,
e3/e4,114542,23.50645112964031,114542,23.50645112964031,,,,
e4/e4,11583,2.3770776085158607,11583,2.3770776085158607,,,,
unknown,281,0.05766716809056003,281,0.05766716809056003,,,,


In [25]:
!dx download ADRD_noFTD_noPARKINSONISM_noALS_noVD_noHT_noCJD_noPDD.CTRL_inclProxy_60older.plink_pheno.txt

Error: path "/opt/notebooks/ADRD_noFTD_noPARKINSONISM_noALS_noVD_noHT_noCJD_no
PDD.CTRL_inclProxy_60older.plink_pheno.txt" already exists but -f/--overwrite
was not set


In [26]:
import pandas as pd

# Read the two files
file1 = pd.read_csv('APOE_Status.txt', sep='\t')
file2 = pd.read_csv('ADRD_noFTD_noPARKINSONISM_noALS_noVD_noHT_noCJD_noPDD.CTRL_inclProxy_60older.plink_pheno.txt', sep='\t')

# Merge the files on the common 'FID' and 'IID' columns
merged_df = pd.merge(file1, file2, on=['FID', 'IID'], how='inner')

merged_df.to_csv("HT_Alzheimers_APOE_Status.txt", sep='\t', index=False)


In [27]:
!dx download UKB_EUR_UMAP_COVARIATES.txt

Error: path "/opt/notebooks/UKB_EUR_UMAP_COVARIATES.txt" already exists but
-f/--overwrite was not set


In [28]:
import pandas as pd
# Read the two files
file11 = pd.read_csv('HT_Alzheimers_APOE_Status.txt', sep='\t')
file21 = pd.read_csv('UKB_EUR_UMAP_COVARIATES.txt', sep='\t')

# Merge the files on the common 'FID' and 'IID' columns
merged_df1 = pd.merge(file11, file21, on=['FID', 'IID'], how='inner')

merged_df1.to_csv("HT_Alzheimers_APOE_Status_with_PCs.txt", sep='\t', index=False)

In [29]:
# Step 2: Specify the columns you want to extract
columns_to_extract = ['FID', 'IID', 'APOE_GENOTYPE','PC1','PC2','PC3','PC4','PC5','PC6','PC7','PC8','PC9','PC10','GENETIC_SEX','AGE_2024_COV']  # Replace with actual column names

# Step 3: Extract the specified columns
df_subset1 = merged_df1[columns_to_extract]

# Step 4: Save the extracted columns to a new file if needed
df_subset1.to_csv('Covar_HT_Alzheimers_APOE_Status_with_PCs.txt', sep='\t', index=False)

In [30]:
# Step 2: Specify the columns you want to extract
columns_to_extract = ['FID', 'IID', 'ADRD']  # Replace with actual column names

# Step 3: Extract the specified columns
df_subset = merged_df1[columns_to_extract]

# Step 4: Save the extracted columns to a new file if needed
df_subset.to_csv('Pheno_HT_Alzheimers_APOE_Status_with_PCs.txt', sep='\t', index=False)

In [31]:
!dx upload Pheno_HT_Alzheimers_APOE_Status_with_PCs.txt

ID                                file-J13XJ1jJbP2pJGJf4B7QJb3K
Class                             file
Project                           project-GkYf2zQJbP2Q3vFgf14863Gf
Folder                            /
Name                              Pheno_HT_Alzheimers_APOE_Status_with_PCs.txt
State                             [33mclosing[0m
Visibility                        visible
Types                             -
Properties                        -
Tags                              -
Outgoing links                    -
Created                           Mon Jun  9 18:23:03 2025
Created by                        vidhu
 via the job                      job-J13QB2jJbP2x0Pz725v7kPj7
Last modified                     Mon Jun  9 18:23:04 2025
Media type                        
archivalState                     "live"
cloudAccount                      "cloudaccount-dnanexus"


In [32]:
!dx upload Covar_HT_Alzheimers_APOE_Status_with_PCs.txt

ID                                file-J13XJ2QJbP2pJGJf4B7QJb3X
Class                             file
Project                           project-GkYf2zQJbP2Q3vFgf14863Gf
Folder                            /
Name                              Covar_HT_Alzheimers_APOE_Status_with_PCs.txt
State                             [33mclosing[0m
Visibility                        visible
Types                             -
Properties                        -
Tags                              -
Outgoing links                    -
Created                           Mon Jun  9 18:23:06 2025
Created by                        vidhu
 via the job                      job-J13QB2jJbP2x0Pz725v7kPj7
Last modified                     Mon Jun  9 18:23:06 2025
Media type                        
archivalState                     "live"
cloudAccount                      "cloudaccount-dnanexus"


In [34]:
!dx upload HT_Alzheimers_APOE_Status_with_PCs.txt

ID                                file-J13XK2QJbP2VkVF7YY9fZjYy
Class                             file
Project                           project-GkYf2zQJbP2Q3vFgf14863Gf
Folder                            /
Name                              HT_Alzheimers_APOE_Status_with_PCs.txt
State                             [33mclosing[0m
Visibility                        visible
Types                             -
Properties                        -
Tags                              -
Outgoing links                    -
Created                           Mon Jun  9 18:25:14 2025
Created by                        vidhu
 via the job                      job-J13QB2jJbP2x0Pz725v7kPj7
Last modified                     Mon Jun  9 18:25:14 2025
Media type                        
archivalState                     "live"
cloudAccount                      "cloudaccount-dnanexus"


In [43]:
# Use this if the file is CSV
df = pd.read_csv('HT_Alzheimers_APOE_Status_with_PCs.txt', sep='\t', on_bad_lines='skip')

In [44]:
df.to_csv("HT_Alzheimers_APOE_Status_with_PCs_1.txt", sep='\t', index=False)

In [46]:
# Count unique values for each column
count_a = df['ADRD'].value_counts()
count_b = df['ADRD'].value_counts()

print(count_a)
print(count_b)

ADRD
1    87524
2     3290
Name: count, dtype: int64
ADRD
1    87524
2     3290
Name: count, dtype: int64


In [47]:
!dx upload HT_Alzheimers_APOE_Status_with_PCs_1.txt

ID                                file-J13XPZ8JbP2pqfq6B3f9Fjj6
Class                             file
Project                           project-GkYf2zQJbP2Q3vFgf14863Gf
Folder                            /
Name                              HT_Alzheimers_APOE_Status_with_PCs_1.txt
State                             [33mclosing[0m
Visibility                        visible
Types                             -
Properties                        -
Tags                              -
Outgoing links                    -
Created                           Mon Jun  9 18:28:33 2025
Created by                        vidhu
 via the job                      job-J13QB2jJbP2x0Pz725v7kPj7
Last modified                     Mon Jun  9 18:28:34 2025
Media type                        
archivalState                     "live"
cloudAccount                      "cloudaccount-dnanexus"


In [48]:
!dx upload HT_Alzheimers_APOE_Status_with_PCs.txt

ID                                file-J13XPZjJbP2Q2kP0fb5GY44Y
Class                             file
Project                           project-GkYf2zQJbP2Q3vFgf14863Gf
Folder                            /
Name                              HT_Alzheimers_APOE_Status_with_PCs.txt
State                             [33mclosing[0m
Visibility                        visible
Types                             -
Properties                        -
Tags                              -
Outgoing links                    -
Created                           Mon Jun  9 18:28:35 2025
Created by                        vidhu
 via the job                      job-J13QB2jJbP2x0Pz725v7kPj7
Last modified                     Mon Jun  9 18:28:36 2025
Media type                        
archivalState                     "live"
cloudAccount                      "cloudaccount-dnanexus"


In [50]:
import pandas as pd

# Assuming the dataframe is named 'df'
# Example: df = pd.read_csv('HT_Alzheimers_APOE_Status_with_PCs.txt', delim_whitespace=True)

# Extract the FID and IID columns
fid_iid_df = merged_df1[['FID', 'IID']]
fid_iid_df.to_csv("FID_IID_HT_Alzheimers_APOE_Status_with_PCs.txt", sep='\t', index=False)

In [51]:
!dx upload FID_IID_HT_Alzheimers_APOE_Status_with_PCs.txt

ID                                file-J13XPg8JbP2Q2kP0fb5GY44z
Class                             file
Project                           project-GkYf2zQJbP2Q3vFgf14863Gf
Folder                            /
Name                              FID_IID_HT_Alzheimers_APOE_Status_with_PCs.txt
State                             [33mclosing[0m
Visibility                        visible
Types                             -
Properties                        -
Tags                              -
Outgoing links                    -
Created                           Mon Jun  9 18:28:45 2025
Created by                        vidhu
 via the job                      job-J13QB2jJbP2x0Pz725v7kPj7
Last modified                     Mon Jun  9 18:28:46 2025
Media type                        
archivalState                     "live"
cloudAccount                      "cloudaccount-dnanexus"


In [None]:
%%bash

seq 1 1 23 > chr_list.txt

FILE_DIR_A="/Imputed_Genotypes_2023/TOPMed_Plink2"
FILE_DIR_B="/ref_panel/chip_overlaps/"
# FILE_DIR_C="/Imputed_Genotypes_2023/TOPMed_Plink2/extract"

CHR_IDS=$(cat chr_list.txt)


for CHR in $CHR_IDS; do
    dx run swiss-army-knife -iin="chr${CHR}_pgen.pgen" \
    -iin="chr${CHR}_pgen.psam" \
    -iin="chr${CHR}_pgen.pvar" \
    -iin="Pheno_HT_Alzheimers_APOE_Status_with_PCs.txt" \
    -iin="Covar_HT_Alzheimers_APOE_Status_with_PCs.txt" \
    -iin="FID_IID_HT_Alzheimers_APOE_Status_with_PCs.txt" \
    -icmd='plink2 --pfile "$in_prefix" \
    --glm hide-covar firth-fallback cols=+a1freq,+a1freqcc,+a1countcc,+totallelecc,+err,+beta,-test,-nobs,-tz,-orbeta \
    --pheno Pheno_HT_Alzheimers_APOE_Status_with_PCs.txt --maf 0.05 \
    --pheno-name ADRD --covar-variance-standardize \
    --geno 0.05 --mac 20  --memory 12000 \
    --covar Covar_HT_Alzheimers_APOE_Status_with_PCs.txt \
    --covar-name PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,GENETIC_SEX,AGE_2024_COV,APOE_GENOTYPE \
    --keep FID_IID_HT_Alzheimers_APOE_Status_with_PCs.txt \
    --out "$in_prefix".Pheno_New_final_HT_with_APOE_Status_New' -y  --brief --priority normal \
    --instance-type mem3_ssd3_x12 --destination "${projectid}:${FILE_DIR}/" 
done 