In [1]:
import pandas as pd
import subprocess
import sys
import numpy as np
import os
import shutil

In [2]:
def shell_do(command, log=False, return_log=False):
    print(f'Executing: {(" ").join(command.split())}', file=sys.stderr)

    res=subprocess.run(command.split(), stdout=subprocess.PIPE)

    if log:
        print(res.stdout.decode('utf-8'))
    if return_log:
        return(res.stdout.decode('utf-8'), res.stderr.decode('utf-8'))

# Interacting with Swiss Army Knife through Jupyter Notebooks
## What is Swiss Army Knife?
### Swiss Army Knife is a generic app which can be used to perform common file operations for the genotype data housed on DNANexus.
### Contains software such as plink, plink2, bcftools, etc.
#### Below we'll see an example of looping through each chromosome and extracting a set of variants.
##### -iin: input (need to specify each file).
##### -icmd: command to run
##### --instance-type: VM type to run the command on
##### --destination: output folder
#### Note: this will start 23 separate Swiss Army Knife jobs so make sure you are using a reasonable instance type.

In [3]:
import pandas as pd

In [4]:
!dx download ADRD_noFTD_noPARKINSONISM_noALS_noVD_noHT_noCJD_noPDD.CTRL_inclProxy_60older.plink_pheno.txt



In [5]:
df = pd.read_csv("ADRD_noFTD_noPARKINSONISM_noALS_noVD_noHT_noCJD_noPDD.CTRL_inclProxy_60older.plink_pheno.txt", sep = '\t')

In [6]:
count_1 = df['ADRD'].value_counts().get(1, 0)
count_2 = df['ADRD'].value_counts().get(2, 0)

print(f"Count of 1 in 'Pheno': {count_1}")
print(f"Count of 2 in 'Pheno': {count_2}")

Count of 1 in 'Pheno': 87524
Count of 2 in 'Pheno': 3290


In [7]:
!dx download UKB_EUR_UMAP_COVARIATES.txt



In [8]:
df1 = pd.read_csv("UKB_EUR_UMAP_COVARIATES.txt", sep = '\t')

In [9]:
!dx download Systolic_blood_pressure_participant.csv



In [10]:
df2 = pd.read_csv("Systolic_blood_pressure_participant.csv")

In [11]:
# Rename column headers
df4 = df2.rename(columns={'eid': 'FID', 'p4080_i0_a0': 'sbp'})

In [12]:
# assuming df is your DataFrame
df4['IID'] = df4['FID'].copy()

In [13]:
# Rearrange column sequence
df4 = df4[['FID', 'IID', 'sbp']]

In [14]:
# Merging the DataFrames on the 'ID' column
final = pd.merge(df1, df4, on=['FID','IID'], how='inner')

In [15]:
# Merging the DataFrames on the 'ID' column
final1 = pd.merge(final, df, on=['FID','IID'], how='inner')

In [16]:
final1.to_csv("Covar_New_final_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt", sep='\t', index=False)

In [17]:
# Extract only 'FID' and 'IID' column values
final2 = final1[['FID', 'IID']]

In [18]:
final2.to_csv("FID_IID_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt", sep='\t', index=False)

In [19]:
# Merging the DataFrames on the 'ID' column
final4 = pd.merge(final2, df, on=['FID','IID'], how='inner')

In [20]:
count_1 = final4['ADRD'].value_counts().get(1, 0)
count_2 = final4['ADRD'].value_counts().get(2, 0)

print(f"Count of 1 in 'Pheno': {count_1}")
print(f"Count of 2 in 'Pheno': {count_2}")

Count of 1 in 'Pheno': 83205
Count of 2 in 'Pheno': 3105


In [21]:
df.to_csv("Pheno_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt", sep='\t', index=False)

In [35]:
!dx upload Covar_New_final_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt

dxpy.utils.resolver.ResolutionError: Unable to resolve "Covar_New_final_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt" to a data object or folder name in '/'


In [31]:
!dx upload FID_IID_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt

FileNotFoundError: [Errno 2] No such file or directory: 'FID_IID_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt'


In [32]:
!dx upload Pheno_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt

ID                                file-J11Y7GQJbP2X5Zjz1FF5PP8K
Class                             file
Project                           project-GkYf2zQJbP2Q3vFgf14863Gf
Folder                            /
Name                              Pheno_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt
State                             [33mclosing[0m
Visibility                        visible
Types                             -
Properties                        -
Tags                              -
Outgoing links                    -
Created                           Fri Jun  6 18:30:10 2025
Created by                        vidhu
 via the job                      job-J11X038JbP2qK379bzxygGGB
Last modified                     Fri Jun  6 18:30:11 2025
Media type                        
archivalState                     "live"
cloudAccount                      "cloudaccount-dnanexus"


In [53]:
%%bash

seq 1 1 23 > chr_list.txt

FILE_DIR_A="/Imputed_Genotypes_2023/TOPMed_Plink2"
FILE_DIR_B="/ref_panel/chip_overlaps/"
# FILE_DIR_C="/Imputed_Genotypes_2023/TOPMed_Plink2/extract"

CHR_IDS=$(cat chr_list.txt)


for CHR in $CHR_IDS; do
    dx run swiss-army-knife -iin="chr${CHR}_pgen.pgen" \
    -iin="chr${CHR}_pgen.psam" \
    -iin="chr${CHR}_pgen.pvar" \
    -iin="Pheno_New_final_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt" \
    -iin="Covar_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt" \
    -iin="FID_IID_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt" \
    -icmd='plink2 --pfile "$in_prefix" \
    --glm hide-covar firth-fallback cols=+a1freq,+a1freqcc,+a1countcc,+totallelecc,+err,+beta,-test,-nobs,-tz,-orbeta \
    --pheno Pheno_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt --maf 0.01 \
    --pheno-name ADRD --covar-variance-standardize \
    --geno 0.05 --mac 20  --memory 12000 \
    --covar Covar_New_final_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt \
    --covar-name PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,GENETIC_SEX,AGE_2024_COV,sbp \
    --keep FID_IID_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt \
    --out "$in_prefix".Pheno_New_final_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt' -y  --brief --priority normal \
    --instance-type mem3_ssd3_x12 --destination "${projectid}:${FILE_DIR}/" 
done 

job-Gxfvk90JbP2qkqKPv2qkKyyV
job-Gxfvk98JbP2Z91v1b7zfvzKb
job-Gxfvk9QJbP2fG2QQQgfV8kpG
job-Gxfvk9jJbP2Qb2ypGxgj98QG
job-GxfvkB0JbP2fG2QQQgfV8kpQ
job-GxfvkB8JbP2qkqKPv2qkKyyb
job-GxfvkBQJbP2Z91v1b7zfvzKj
job-GxfvkBjJbP2Z91v1b7zfvzKp
job-GxfvkF0JbP2pG9qPGBgPxb44
job-GxfvkF8JbP2fG2QQQgfV8kqQ
job-GxfvkFQJbP2Qb2ypGxgj98QQ
job-GxfvkFjJbP2pG9qPGBgPxb48
job-GxfvkG0JbP2fG2QQQgfV8kqX
job-GxfvkG8JbP2ZZ9jJv5VV29JX
job-GxfvkGjJbP2ZZ9jJv5VV29JZ
job-GxfvkJ0JbP2qzPkKPJ6p9GXy
job-GxfvkJ8JbP2k8Jf4QkVvK3qQ
job-GxfvkJQJbP2qzPkKPJ6p9GY6
job-GxfvkJjJbP2ZZ9jJv5VV29KB
job-GxfvkK0JbP2fG2QQQgfV8kv7
job-GxfvkK8JbP2qzPkKPJ6p9GYB
job-GxfvkKQJbP2k8Jf4QkVvK3qv
job-GxfvkKjJbP2Qb2ypGxgj98V5


In [35]:
%%bash

seq 1 1 23 > chr_list.txt

FILE_DIR_A="/Imputed_Genotypes_2023/TOPMed_Plink2"
FILE_DIR_B="/ref_panel/chip_overlaps/"
# FILE_DIR_C="/Imputed_Genotypes_2023/TOPMed_Plink2/extract"

CHR_IDS=$(cat chr_list.txt)


for CHR in $CHR_IDS; do
    dx run swiss-army-knife -iin="chr${CHR}_pgen.pgen" \
    -iin="chr${CHR}_pgen.psam" \
    -iin="chr${CHR}_pgen.pvar" \
    -iin="Pheno_HT_Alzheimers_APOE_Status_with_PCs.txt" \
    -iin="Covar_HT_Alzheimers_APOE_Status_with_PCs.txt" \
    -iin="FID_IID_HT_Alzheimers_APOE_Status_with_PCs.txt" \
    -icmd='plink2 --pfile "$in_prefix" \
    --glm hide-covar firth-fallback cols=+a1freq,+a1freqcc,+a1countcc,+totallelecc,+err,+beta,-test,-nobs,-tz,-orbeta \
    --pheno Pheno_HT_Alzheimers_APOE_Status_with_PCs.txt --maf 0.05 \
    --pheno-name ADRD --covar-variance-standardize \
    --geno 0.05 --mac 20  --memory 12000 \
    --covar Covar_HT_Alzheimers_APOE_Status_with_PCs.txt \
    --covar-name PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,GENETIC_SEX,AGE_2024_COV,APOE_GENOTYPE \
    --keep FID_IID_HT_Alzheimers_APOE_Status_with_PCs.txt \
    --out "$in_prefix".Pheno_New_final_HT_with_APOE_Status_New' -y  --brief --priority normal \
    --instance-type mem3_ssd3_x12 --destination "${projectid}:${FILE_DIR}/" 
done 

job-GvJ6ZBQJbP2k41kp525BJ4JY
job-GvJ6ZBjJbP2ZFjBQGpfK8QGg
job-GvJ6ZF0JbP2x2gbzKx43x32J
job-GvJ6ZF8JbP2jZbkF1K3Pv0xZ
job-GvJ6ZFQJbP2VzqP7vv41F2QP
job-GvJ6ZFjJbP2k41kp525BJ4Jp
job-GvJ6ZG0JbP2x2gbzKx43x32g
job-GvJ6ZG8JbP2jZbkF1K3Pv0xv
job-GvJ6ZGQJbP2jZbkF1K3Pv0y0
job-GvJ6ZGjJbP2jZbkF1K3Pv0y4
job-GvJ6ZJ0JbP2jZbkF1K3Pv0y6
job-GvJ6ZJ8JbP2k41kp525BJ4K2
job-GvJ6ZJQJbP2ZFjBQGpfK8QJF
job-GvJ6ZJjJbP2jZbkF1K3Pv0yP
job-GvJ6ZK0JbP2k41kp525BJ4KG
job-GvJ6ZK8JbP2k41kp525BJ4KQ
job-GvJ6ZKQJbP2ZFjBQGpfK8QJk
job-GvJ6ZKjJbP2jZbkF1K3Pv0yg
job-GvJ6ZP0JbP2x2gbzKx43x339
job-GvJ6ZP8JbP2VzqP7vv41F2X1
job-GvJ6ZPQJbP2zg51ZbkFP04GK
job-GvJ6ZPjJbP2k41kp525BJ4P1
job-GvJ6ZQ0JbP2jZbkF1K3Pv0z1


In [3]:
%%bash

seq 1 1 23 > chr_list.txt

FILE_DIR_A="/Imputed_Genotypes_2023/TOPMed_Plink2"
FILE_DIR_B="/ref_panel/chip_overlaps/"
# FILE_DIR_C="/Imputed_Genotypes_2023/TOPMed_Plink2/extract"

CHR_IDS=$(cat chr_list.txt)

CHR=19

dx run swiss-army-knife -iin="chr${CHR}_pgen.pgen" \
    -iin="chr${CHR}_pgen.psam" \
    -iin="chr${CHR}_pgen.pvar" \
    -icmd='plink2 --pfile "$in_prefix" \
    --chr 19 \
    --from-bp 45000000 --to-bp 45500000 \
    --make-bed \
    --out "$in_prefix".APOE_LD_region' -y  --brief --priority normal \
    --instance-type mem3_ssd3_x12 --destination "${projectid}:${FILE_DIR}/" 

job-Gx8pYp8JbP2XZ6B8qyz7VkG5


In [8]:
%%bash

seq 1 1 23 > chr_list.txt

FILE_DIR_A="/Imputed_Genotypes_2023/TOPMed_Plink2"
FILE_DIR_B="/ref_panel/chip_overlaps/"
# FILE_DIR_C="/Imputed_Genotypes_2023/TOPMed_Plink2/extract"

CHR_IDS=$(cat chr_list.txt)

CHR=19

dx run swiss-army-knife -iin="chr${CHR}_pgen.pgen" \
    -iin="chr${CHR}_pgen.psam" \
    -iin="chr${CHR}_pgen.pvar" \
    -icmd='plink2 --pfile "$in_prefix" \
    --ld rs429358 rs7412 \
    --out "$in_prefix".APOE_LD_results' -y  --brief --priority normal \
    --instance-type mem3_ssd3_x12 --destination "${projectid}:${FILE_DIR}/"

job-Gx8qp5jJbP2v1V8Vkpzj3Xg4


In [7]:
%%bash

seq 1 1 23 > chr_list.txt

FILE_DIR_A="/Imputed_Genotypes_2023/TOPMed_Plink2"
FILE_DIR_B="/ref_panel/chip_overlaps/"
# FILE_DIR_C="/Imputed_Genotypes_2023/TOPMed_Plink2/extract"

CHR_IDS=$(cat chr_list.txt)

for CHR in $CHR_IDS; do
    dx run swiss-army-knife -iin="chr${CHR}_pgen.pgen" \
    -iin="chr${CHR}_pgen.psam" \
    -iin="chr${CHR}_pgen.pvar" \
    -icmd='plink2 --pfile "$in_prefix" \
    --snps 19:44908684:T:C,19:44908822:C:T --make-bed \
    --out "$in_prefix".apoe_snps' -y  --brief --priority normal \
    --instance-type mem3_ssd3_x12 --destination "${projectid}:${FILE_DIR}/" 
done 

job-Gv7x9XQJbP2zp8bfjxZ66gb2
job-Gv7x9XjJbP2QbXGyJJ3Kj73g
job-Gv7x9Y0JbP2QFy0BjyY8YpzB
job-Gv7x9Y8JbP2QbXGyJJ3Kj73q
job-Gv7x9YQJbP2f3JZ7Qfq71JYQ
job-Gv7x9YjJbP2zp8bfjxZ66gbK
job-Gv7x9Z0JbP2zp8bfjxZ66gbQ
job-Gv7x9Z8JbP2jVx8fjKpKbq2Y
job-Gv7x9ZQJbP2QbXGyJJ3Kj73x
job-Gv7x9ZjJbP2zp8bfjxZ66gbZ
job-Gv7x9b0JbP2jVx8fjKpKbq2k
job-Gv7x9b8JbP2QbXGyJJ3Kj743
job-Gv7x9bQJbP2jVx8fjKpKbq2z
job-Gv7x9bQJbP2vZGPBQF9906vV
job-Gv7x9bjJbP2f3JZ7Qfq71JYx
job-Gv7x9f0JbP2f7j70VB6Gx9Vj
job-Gv7x9f8JbP2QbXGyJJ3Kj74g
job-Gv7x9fQJbP2f3JZ7Qfq71JZ7
job-Gv7x9fjJbP2zp8bfjxZ66gby
job-Gv7x9g0JbP2QFy0BjyY8Ypzp
job-Gv7x9g8JbP2vZGPBQF9906vj
job-Gv7x9gQJbP2vZGPBQF9906vv
job-Gv7x9gjJbP2f7j70VB6GxB9y


In [4]:
!dx download chr1_pgen.pvar



In [5]:
!head chr1_pgen.pvar

#CHROM	POS	ID	REF	ALT
1	10500	1:10500:G:T	G	T
1	10598	1:10598:G:A	G	A
1	10599	1:10599:C:G	C	G
1	10612	1:10612:A:C	A	C
1	10894	1:10894:G:A	G	A
1	10915	1:10915:G:A	G	A
1	10930	1:10930:G:A	G	A
1	10989	1:10989:G:A	G	A
1	11171	1:11171:CCTTG:C	CCTTG	C


In [1]:
%%bash

CHR=2

FILE_DIR_A="/Imputed_Genotypes_2023/TOPMed_Plink2"
FILE_DIR_B="/ref_panel/chip_overlaps/"
# FILE_DIR_C="/Imputed_Genotypes_2023/TOPMed_Plink2/extract"

dx run swiss-army-knife -iin="chr${CHR}_pgen.pgen" \
    -iin="chr${CHR}_pgen.psam" \
    -iin="chr${CHR}_pgen.pvar" \
    -icmd='plink2 --pfile "$in_prefix" \
    --snps 2:36408804:T:C --make-bed \
    --out "$in_prefix".crim1_snps' -y  --brief --priority normal \
    --instance-type mem3_ssd3_x12 --destination "${projectid}:${FILE_DIR}/" 


job-GvGQ640JbP2gjBZ6Y9q5F3Qx


In [1]:
%%bash

CHR=2

FILE_DIR_A="/Imputed_Genotypes_2023/TOPMed_Plink2"
FILE_DIR_B="/ref_panel/chip_overlaps/"
# FILE_DIR_C="/Imputed_Genotypes_2023/TOPMed_Plink2/extract"
dx run swiss-army-knife -iin="chr${CHR}_pgen.crim1_snps.bed" \
    -iin="chr${CHR}_pgen.crim1_snps.bim" \
    -iin="chr${CHR}_pgen.crim1_snps.fam" \
    -icmd='plink --bfile "$in_prefix" \
    --allow-no-sex \
    --recodeA \
    --out "$in_prefix".apoe_snps_1' -y  --brief --priority normal \
    --instance-type mem3_ssd3_x12 --destination "${projectid}:${FILE_DIR}/"

job-GvGYfzQJbP2VxbvVQyp58B37


In [3]:
!dx download chr2_pgen.crim1_snps.apoe_snps_1.raw



In [172]:
%%bash

CHR=19

FILE_DIR_A="/Imputed_Genotypes_2023/TOPMed_Plink2"
FILE_DIR_B="/ref_panel/chip_overlaps/"
# FILE_DIR_C="/Imputed_Genotypes_2023/TOPMed_Plink2/extract"
dx run swiss-army-knife -iin="$chr${CHR}_pgen.pgen" \
    -iin="$chr${CHR}_pgen.psam" \
    -iin="$chr${CHR}_pgen.pvar" \
    -iin="DEM_I10.plink.txt" \
    -iin="UKB_EUR_UMAP_COVARIATES.txt"\
    -icmd='plink2 --pfile "$in_prefix" \
    --glm hide-covar firth-fallback cols=+a1freq,+a1freqcc,+a1countcc,+totallelecc,+err,+beta,-test,-nobs,-tz,-orbeta \
    --pheno-name Pheno --covar-variance-standardize \
    --pheno Pheno_HT_Dem_Vs_HT_TOWNSEND_remove_ADPD_new.txt --maf 0.05 \
    --geno 0.05 --mac 20  --memory 12000 \
    --covar Pheno_HT_Dem_Vs_HT_TOWNSEND_remove_ADPD_new.txt \
    --covar-name GENETIC_SEX,AGE_OF_RECRUIT,TOWNSEND,PC1,PC2,PC3,PC4,PC5 \
    --keep Pheno_HT_Dem_Vs_HT_TOWNSEND_remove_ADPD_new.txt \
    --out "$in_prefix".HT+DEM_VS_HT_TOWNSEND_remove_ADPD_new' -y  --brief --priority normal \
    --instance-type mem3_ssd3_x12 --destination "${projectid}:${FILE_DIR}/" 


job-GkQy25jJgKp5jZkgZBpFF14z


In [46]:
%%bash

CHR=19

FILE_DIR_A="/Imputed_Genotypes_2023/TOPMed_Plink2"
FILE_DIR_B="/ref_panel/chip_overlaps/"
# FILE_DIR_C="/Imputed_Genotypes_2023/TOPMed_Plink2/extract"
dx run swiss-army-knife -iin="chr${CHR}_pgen.pgen" \
    -iin="chr${CHR}_pgen.psam" \
    -iin="chr${CHR}_pgen.pvar" \
    -iin="Pheno_HT_Alzheimers_APOE_Status_with_PCs.txt" \
    -iin="Covar_HT_Alzheimers_APOE_Status_with_PCs.txt" \
    -iin="FID_IID_HT_Alzheimers_APOE_Status_with_PCs.txt" \
    -icmd='plink2 --pfile "$in_prefix" \
    --glm hide-covar firth-fallback cols=+a1freq,+a1freqcc,+a1countcc,+totallelecc,+err,+beta,-test,-nobs,-tz,-orbeta \
    --pheno-name ADRD --covar-variance-standardize \
    --pheno Pheno_HT_Alzheimers_APOE_Status_with_PCs.txt --maf 0.05 \
    --geno 0.05 --mac 20  --memory 12000 \
    --covar Covar_HT_Alzheimers_APOE_Status_with_PCs.txt \
    --covar-name PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,GENETIC_SEX,AGE_2024_COV,APOE_GENOTYPE \
    --keep FID_IID_HT_Alzheimers_APOE_Status_with_PCs.txt \
    --out "$in_prefix".Pheno_New_final_HT_with_APOE_Status_New' -y  --brief --priority normal \
    --instance-type mem3_ssd3_x12 --destination "${projectid}:${FILE_DIR}/" 

job-GvJ94Z0JbP2XGqyqBP9Fq17Y


In [54]:
%%bash

CHR=20

FILE_DIR_A="/Imputed_Genotypes_2023/TOPMed_Plink2"
FILE_DIR_B="/ref_panel/chip_overlaps/"
# FILE_DIR_C="/Imputed_Genotypes_2023/TOPMed_Plink2/extract"
dx run swiss-army-knife -iin="chr${CHR}_pgen.pgen" \
    -iin="chr${CHR}_pgen.psam" \
    -iin="chr${CHR}_pgen.pvar" \
    -iin="Pheno_HT_Alzheimers_APOE_Status_with_PCs.txt" \
    -iin="Covar_HT_Alzheimers_APOE_Status_with_PCs.txt" \
    -iin="FID_IID_HT_Alzheimers_APOE_Status_with_PCs.txt" \
    -icmd='plink2 --pfile "$in_prefix" \
    --glm hide-covar firth-fallback cols=+a1freq,+a1freqcc,+a1countcc,+totallelecc,+err,+beta,-test,-nobs,-tz,-orbeta \
    --pheno-name ADRD --covar-variance-standardize \
    --pheno Pheno_HT_Alzheimers_APOE_Status_with_PCs.txt --maf 0.05 \
    --geno 0.05 --mac 20  --memory 12000 \
    --covar Covar_HT_Alzheimers_APOE_Status_with_PCs.txt \
    --covar-name PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,GENETIC_SEX,AGE_2024_COV,APOE_GENOTYPE \
    --keep FID_IID_HT_Alzheimers_APOE_Status_with_PCs.txt \
    --out "$in_prefix".Pheno_New_final_HT_with_APOE_Status_New' -y  --brief --priority normal \
    --instance-type mem3_ssd3_x12 --destination "${projectid}:${FILE_DIR}/" 

job-GvJ9KvjJbP2x2gbzKx442KXP


In [6]:
%%bash

CHR=19

FILE_DIR_A="/Imputed_Genotypes_2023/TOPMed_Plink2"
FILE_DIR_B="/ref_panel/chip_overlaps/"
# FILE_DIR_C="/Imputed_Genotypes_2023/TOPMed_Plink2/extract"
dx run swiss-army-knife -iin="chr${CHR}_pgen.pgen" \
    -iin="chr${CHR}_pgen.psam" \
    -iin="chr${CHR}_pgen.pvar" \
    -icmd='plink2 --pfile "$in_prefix" \
    --snps 19:44908684:T:C,19:44908822:C:T --make-bed \
    --out "$in_prefix".apoe_snps' -y  --brief --priority normal \
    --instance-type mem3_ssd3_x12 --destination "${projectid}:${FILE_DIR}/" 

job-Gv7x2ZjJbP2Z9b1XYY489P95


In [12]:
%%bash

CHR=19

FILE_DIR_A="/Imputed_Genotypes_2023/TOPMed_Plink2"
FILE_DIR_B="/ref_panel/chip_overlaps/"
# FILE_DIR_C="/Imputed_Genotypes_2023/TOPMed_Plink2/extract"
dx run swiss-army-knife -iin="chr${CHR}_pgen.apoe_snps.bed" \
    -iin="chr${CHR}_pgen.apoe_snps.bim" \
    -iin="chr${CHR}_pgen.apoe_snps.fam" \
    -icmd='plink2 --bfile "$in_prefix" \
    --recode compound-genotypes \
    --out "$in_prefix".apoe_snps' -y  --brief --priority normal \
    --instance-type mem3_ssd3_x12 --destination "${projectid}:${FILE_DIR}/"

job-Gv7y30QJbP2ZV68ypFK6G93j


In [103]:
!dx download chr19_pgen.apoe_snps.apoe_snps.ped

dxpy.utils.resolver.ResolutionError: Unable to resolve "chr19_pgen.apoe_snps.apoe_snps.ped" to a data object or folder name in '/'


In [104]:
import pandas as pd

# Define the header row
headers = ["FID", "IID", "PAT", "MAT", "SEX", "PHENO", "rs429358", "rs7412"]

# Specify the input .ped file and output .ped file paths
input_file = 'chr19_pgen.apoe_snps.apoe_snps.ped'
output_file = 'out_chr19_pgen.apoe_snps.apoe_snps.ped'

# Load the .ped file into a pandas DataFrame (assuming it's space-delimited)
df = pd.read_csv(input_file, delim_whitespace=True, header=None)

# Add the headers to the DataFrame
df.columns = headers

# Save the DataFrame to a new .ped file with the headers included (space-delimited)
df.to_csv(output_file, sep='\t', index=False)

print(f"Headers added successfully to {output_file}")


FileNotFoundError: [Errno 2] No such file or directory: 'chr19_pgen.apoe_snps.apoe_snps.ped'

In [None]:
!dx upload out_chr19_pgen.apoe_snps.apoe_snps.ped

In [None]:
 # Import necessary packages
import numpy as np
import pandas as pd
from functools import reduce

# Define input and output file names directly in the notebook
input_file = 'chr19_pgen.apoe_snps.apoe_snps.ped'  # Replace with the actual path to your .ped file
output_name = 'chr19_pgen.apoe_snps.apoe_snps_out.ped'     # Replace with the desired output file name (without suffix)

# Read in the .ped file and force column names
header_text = ["FID", "IID", "PAT", "MAT", "SEX", "PHENO", "rs429358", "rs7412"]
input_ped_df = pd.read_csv(input_file, sep='\t', header=None, names=header_text)

# Make a combined column by concatenating rs429358 and rs7412 genotypes
input_ped_df['rs429358_rs7412'] = input_ped_df['rs429358'].astype(str) + '_' + input_ped_df['rs7412']

# Initialize a dictionary to map the combined genotypes to APOE genotypes
apoe_genotypes_dict = {
    'CC_TT': 'e1/e1',
    'CT_TT': 'e1/e2',
    'TC_TT': 'e1/e2',
    'CC_CT': 'e1/e4',
    'CC_TC': 'e1/e4',
    'TT_TT': 'e2/e2',
    'TT_TC': 'e2/e3',
    'TT_CT': 'e2/e3',
    'TC_TC': 'e2/e4 or e1/e3',
    'CT_CT': 'e2/e4 or e1/e3',
    'TC_CT': 'e2/e4 or e1/e3',
    'CT_TC': 'e2/e4 or e1/e3',
    'TT_CC': 'e3/e3',
    'TC_CC': 'e3/e4',
    'CT_CC': 'e3/e4',
    'CC_CC': 'e4/e4'
}

# Map the combined genotype column to APOE genotypes
input_ped_df['APOE_GENOTYPE'] = input_ped_df['rs429358_rs7412'].map(apoe_genotypes_dict)

# Handle NaN values by marking unknown genotypes
input_ped_df['APOE_GENOTYPE'].fillna('unknown', inplace=True)

# Create a new DataFrame with the relevant columns
subset_geno_df = input_ped_df[['FID', 'IID', 'SEX', 'PHENO', 'APOE_GENOTYPE']]

## Generate counts and percentages

# Overall APOE genotype counts and percentages
counts_df = pd.DataFrame(subset_geno_df['APOE_GENOTYPE'].value_counts().reset_index())
counts_df.columns = ['APOE_GENOTYPE', 'TOTAL_COUNT']
counts_df['TOTAL_PERCENT'] = (counts_df['TOTAL_COUNT'] / subset_geno_df.shape[0]) * 100

# Separate into cases, controls, and missing phenotype
missing_pheno_df = subset_geno_df[subset_geno_df['PHENO'] == -9]
controls_df = subset_geno_df[subset_geno_df['PHENO'] == 1]
cases_df = subset_geno_df[subset_geno_df['PHENO'] == 2]

# Generate counts and percentages for missing phenotypes
missing_pheno_counts_df = pd.DataFrame(missing_pheno_df['APOE_GENOTYPE'].value_counts().reset_index())
missing_pheno_counts_df.columns = ['APOE_GENOTYPE', 'MISSING_PHENO_COUNT']
missing_pheno_counts_df['MISSING_PHENO_PERCENT'] = (missing_pheno_counts_df['MISSING_PHENO_COUNT'] / missing_pheno_df.shape[0]) * 100

# Generate counts and percentages for controls
controls_counts_df = pd.DataFrame(controls_df['APOE_GENOTYPE'].value_counts().reset_index())
controls_counts_df.columns = ['APOE_GENOTYPE', 'CONTROLS_COUNT']
controls_counts_df['CONTROLS_PERCENT'] = (controls_counts_df['CONTROLS_COUNT'] / controls_df.shape[0]) * 100

# Generate counts and percentages for cases
cases_counts_df = pd.DataFrame(cases_df['APOE_GENOTYPE'].value_counts().reset_index())
cases_counts_df.columns = ['APOE_GENOTYPE', 'CASES_COUNT']
cases_counts_df['CASES_PERCENT'] = (cases_counts_df['CASES_COUNT'] / cases_df.shape[0]) * 100

# Merge the dataframes to get a summary
dataframes_tomerge = [counts_df, missing_pheno_counts_df, controls_counts_df, cases_counts_df]
merged_summary_df = reduce(lambda left, right: pd.merge(left, right, on='APOE_GENOTYPE', how='outer'), dataframes_tomerge)

## Export results
complete_df_output = output_name + ".APOE_GENOTYPES.csv"
counts_df_output = output_name + ".APOE_SUMMARY.csv"

# Save the complete genotype file as .csv
subset_geno_df.to_csv(complete_df_output, index=False)
print(f"Your complete genotype file has been saved here: {complete_df_output}")

# Save the summary counts file as .csv
merged_summary_df.to_csv(counts_df_output, index=False)
print(f"The summary counts have been saved here: {counts_df_output}")

# Done!
print("Script execution complete!")


In [105]:
subset_geno_df.head()

NameError: name 'subset_geno_df' is not defined

In [106]:
subset_geno_df['APOE_GENOTYPE'] = subset_geno_df['APOE_GENOTYPE'].replace({
    'e4/e4': 2,
    'e3/e4': 1
}).map({2: 2, 1: 1}).fillna(0).astype(int)

# Display the updated dataframe
print(subset_geno_df[['FID', 'IID', 'APOE_GENOTYPE']].head())

NameError: name 'subset_geno_df' is not defined

In [107]:
print(subset_geno_df)

NameError: name 'subset_geno_df' is not defined

In [11]:
subset_geno_df['APOE_GENOTYPE'] = subset_geno_df['APOE_GENOTYPE'].replace({
    'e4/e4': 2,
    'e3/e4': 1
}).map({2: 2, 1: 1}).fillna(0).astype(int)

# Display the updated dataframe
print(subset_geno_df[['FID', 'IID', 'APOE_GENOTYPE']].head())
subset_geno_df.to_csv("APOE_Status.txt", sep='\t', index=False)

       FID      IID  APOE_GENOTYPE
0  1194431  1194431              0
1  3658697  3658697              0
2  5547865  5547865              0
3  1063796  1063796              1
4  4779244  4779244              0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_geno_df['APOE_GENOTYPE'] = subset_geno_df['APOE_GENOTYPE'].replace({


In [12]:
!dx upload APOE_Status.txt

ID                                file-GvJ4k40JbP2Y94jBvF66jf7B
Class                             file
Project                           project-GkYf2zQJbP2Q3vFgf14863Gf
Folder                            /
Name                              APOE_Status.txt
State                             [33mclosing[0m
Visibility                        visible
Types                             -
Properties                        -
Tags                              -
Outgoing links                    -
Created                           Thu Oct 24 13:33:04 2024
Created by                        vidhu
 via the job                      job-GvJ44y0JbP2xg8kByKY6vPfB
Last modified                     Thu Oct 24 13:33:05 2024
Media type                        
archivalState                     "live"
cloudAccount                      "cloudaccount-dnanexus"


In [13]:
!dx upload chr19_pgen.apoe_snps.apoe_snps_out.ped.APOE_GENOTYPES.csv

ID                                file-GvJ4k4QJbP2QJQZ0vbqF77fx
Class                             file
Project                           project-GkYf2zQJbP2Q3vFgf14863Gf
Folder                            /
Name                              chr19_pgen.apoe_snps.apoe_snps_out.ped.APOE_GENOTYPES.csv
State                             [33mclosing[0m
Visibility                        visible
Types                             -
Properties                        -
Tags                              -
Outgoing links                    -
Created                           Thu Oct 24 13:33:06 2024
Created by                        vidhu
 via the job                      job-GvJ44y0JbP2xg8kByKY6vPfB
Last modified                     Thu Oct 24 13:33:07 2024
Media type                        
archivalState                     "live"
cloudAccount                      "cloudaccount-dnanexus"


In [14]:
!dx upload chr19_pgen.apoe_snps.apoe_snps_out.ped.APOE_SUMMARY.csv

ID                                file-GvJ4k50JbP2jzpfXb2K4KVB3
Class                             file
Project                           project-GkYf2zQJbP2Q3vFgf14863Gf
Folder                            /
Name                              chr19_pgen.apoe_snps.apoe_snps_out.ped.APOE_SUMMARY.csv
State                             [33mclosing[0m
Visibility                        visible
Types                             -
Properties                        -
Tags                              -
Outgoing links                    -
Created                           Thu Oct 24 13:33:08 2024
Created by                        vidhu
 via the job                      job-GvJ44y0JbP2xg8kByKY6vPfB
Last modified                     Thu Oct 24 13:33:09 2024
Media type                        
archivalState                     "live"
cloudAccount                      "cloudaccount-dnanexus"


In [15]:
!head chr19_pgen.apoe_snps.apoe_snps_out.ped.APOE_SUMMARY.csv

APOE_GENOTYPE,TOTAL_COUNT,TOTAL_PERCENT,MISSING_PHENO_COUNT,MISSING_PHENO_PERCENT,CONTROLS_COUNT,CONTROLS_PERCENT,CASES_COUNT,CASES_PERCENT
e3/e3,286304,58.755661540924194,286304,58.755661540924194,,,,
e3/e4,114542,23.50645112964031,114542,23.50645112964031,,,,
e2/e3,59312,12.172082113122052,59312,12.172082113122052,,,,
e2/e4 or e1/e3,12132,2.4897440685931467,12132,2.4897440685931467,,,,
e4/e4,11583,2.3770776085158607,11583,2.3770776085158607,,,,
e2/e2,3125,0.6413163711138793,3125,0.6413163711138793,,,,
unknown,281,0.05766716809056003,281,0.05766716809056003,,,,


In [16]:
!dx download ADRD_noFTD_noPARKINSONISM_noALS_noVD_noHT_noCJD_noPDD.CTRL_inclProxy_60older.plink_pheno.txt



In [17]:
import pandas as pd

# Read the two files
file1 = pd.read_csv('APOE_Status.txt', sep='\t')
file2 = pd.read_csv('ADRD_noFTD_noPARKINSONISM_noALS_noVD_noHT_noCJD_noPDD.CTRL_inclProxy_60older.plink_pheno.txt', sep='\t')

# Merge the files on the common 'FID' and 'IID' columns
merged_df = pd.merge(file1, file2, on=['FID', 'IID'], how='inner')

# Display the merged dataframe
print(merged_df.head())

merged_df.to_csv("HT_Alzheimers_APOE_Status.txt", sep='\t', index=False)


       FID      IID  SEX  PHENO  APOE_GENOTYPE  ADRD
0  1194431  1194431    1     -9              0     1
1  5547865  5547865    1     -9              0     1
2  1382977  1382977    1     -9              0     1
3  5209497  5209497    2     -9              0     1
4  4911647  4911647    2     -9              0     1


In [18]:
!dx download UKB_EUR_UMAP_COVARIATES.txt



In [19]:
import pandas as pd
# Read the two files
file11 = pd.read_csv('HT_Alzheimers_APOE_Status.txt', sep='\t')
file21 = pd.read_csv('UKB_EUR_UMAP_COVARIATES.txt', sep='\t')

# Merge the files on the common 'FID' and 'IID' columns
merged_df1 = pd.merge(file11, file21, on=['FID', 'IID'], how='inner')

# Display the merged dataframe
print(merged_df1.head())

merged_df1.to_csv("HT_Alzheimers_APOE_Status_with_PCs.txt", sep='\t', index=False)

       FID      IID  SEX  PHENO  APOE_GENOTYPE  ADRD  TOWNSEND GENETIC_SEX  \
0  1194431  1194431    1     -9              0     1      2.90        Male   
1  5547865  5547865    1     -9              0     1     -3.12        Male   
2  1382977  1382977    1     -9              0     1      1.53        Male   
3  5209497  5209497    2     -9              0     1     -4.52      Female   
4  4911647  4911647    2     -9              0     1     -4.08      Female   

   ARRAY  AGE_OF_RECRUIT  ...       PC1       PC2       PC3       PC4  \
0  Axiom              52  ... -0.000347  0.000093 -0.000733 -0.000249   
1  Axiom              64  ...  0.000870 -0.001548  0.000196  0.000373   
2  Axiom              67  ...  0.001523 -0.000267 -0.001473 -0.001191   
3  Axiom              57  ...  0.000312 -0.000684 -0.000755  0.000198   
4  Axiom              63  ...  0.002434 -0.003084  0.001510 -0.001225   

        PC5       PC6       PC7       PC8       PC9      PC10  
0 -0.001082 -0.001730 -0.000

In [20]:
# Step 2: Specify the columns you want to extract
columns_to_extract = ['FID', 'IID', 'APOE_GENOTYPE','PC1','PC2','PC3','PC4','PC5','PC6','PC7','PC8','PC9','PC10','GENETIC_SEX','AGE_2024_COV']  # Replace with actual column names

# Step 3: Extract the specified columns
df_subset1 = merged_df1[columns_to_extract]

# Step 4: Save the extracted columns to a new file if needed
df_subset1.to_csv('Covar_HT_Alzheimers_APOE_Status_with_PCs.txt', sep='\t', index=False)

In [21]:
# Step 2: Specify the columns you want to extract
columns_to_extract = ['FID', 'IID', 'ADRD']  # Replace with actual column names

# Step 3: Extract the specified columns
df_subset = merged_df1[columns_to_extract]

# Step 4: Save the extracted columns to a new file if needed
df_subset.to_csv('Pheno_HT_Alzheimers_APOE_Status_with_PCs.txt', sep='\t', index=False)

In [22]:
!dx upload Pheno_HT_Alzheimers_APOE_Status_with_PCs.txt

ID                                file-GvJ4k7QJbP2jzpfXb2K4KVBQ
Class                             file
Project                           project-GkYf2zQJbP2Q3vFgf14863Gf
Folder                            /
Name                              Pheno_HT_Alzheimers_APOE_Status_with_PCs.txt
State                             [33mclosing[0m
Visibility                        visible
Types                             -
Properties                        -
Tags                              -
Outgoing links                    -
Created                           Thu Oct 24 13:33:18 2024
Created by                        vidhu
 via the job                      job-GvJ44y0JbP2xg8kByKY6vPfB
Last modified                     Thu Oct 24 13:33:18 2024
Media type                        
archivalState                     "live"
cloudAccount                      "cloudaccount-dnanexus"


In [23]:
!dx upload Covar_HT_Alzheimers_APOE_Status_with_PCs.txt

ID                                file-GvJ4k7jJbP2jx82QJY2Bj8qg
Class                             file
Project                           project-GkYf2zQJbP2Q3vFgf14863Gf
Folder                            /
Name                              Covar_HT_Alzheimers_APOE_Status_with_PCs.txt
State                             [33mclosing[0m
Visibility                        visible
Types                             -
Properties                        -
Tags                              -
Outgoing links                    -
Created                           Thu Oct 24 13:33:19 2024
Created by                        vidhu
 via the job                      job-GvJ44y0JbP2xg8kByKY6vPfB
Last modified                     Thu Oct 24 13:33:20 2024
Media type                        
archivalState                     "live"
cloudAccount                      "cloudaccount-dnanexus"


In [4]:
# Use this if the file is CSV
df = pd.read_csv('HT_Alzheimers_APOE_Status_with_PCs.txt', delimiter="\t", error_bad_lines=False)



  df = pd.read_csv('HT_Alzheimers_APOE_Status_with_PCs.txt', delimiter="\t", error_bad_lines=False)


FileNotFoundError: [Errno 2] No such file or directory: 'HT_Alzheimers_APOE_Status_with_PCs.txt'

In [3]:
df.to_csv("HT_Alzheimers_APOE_Status_with_PCs_1.txt", sep='\t', index=False)

NameError: name 'df' is not defined

In [None]:
# Count unique values for each column
count_a = df['A'].value_counts()
count_b = df['B'].value_counts()

print(count_a)
print(count_b)

In [26]:
!dx upload HT_Alzheimers_APOE_Status_with_PCs_1.txt

ID                                file-GvJ4k8QJbP2Y94jBvF66jf7P
Class                             file
Project                           project-GkYf2zQJbP2Q3vFgf14863Gf
Folder                            /
Name                              HT_Alzheimers_APOE_Status_with_PCs_1.txt
State                             [33mclosing[0m
Visibility                        visible
Types                             -
Properties                        -
Tags                              -
Outgoing links                    -
Created                           Thu Oct 24 13:33:22 2024
Created by                        vidhu
 via the job                      job-GvJ44y0JbP2xg8kByKY6vPfB
Last modified                     Thu Oct 24 13:33:23 2024
Media type                        
archivalState                     "live"
cloudAccount                      "cloudaccount-dnanexus"


In [27]:
!dx upload HT_Alzheimers_APOE_Status_with_PCs.txt

ID                                file-GvJ4k90JbP2fgGYy1K83qJB3
Class                             file
Project                           project-GkYf2zQJbP2Q3vFgf14863Gf
Folder                            /
Name                              HT_Alzheimers_APOE_Status_with_PCs.txt
State                             [33mclosing[0m
Visibility                        visible
Types                             -
Properties                        -
Tags                              -
Outgoing links                    -
Created                           Thu Oct 24 13:33:24 2024
Created by                        vidhu
 via the job                      job-GvJ44y0JbP2xg8kByKY6vPfB
Last modified                     Thu Oct 24 13:33:25 2024
Media type                        
archivalState                     "live"
cloudAccount                      "cloudaccount-dnanexus"


In [28]:
import pandas as pd

# Assuming the dataframe is named 'df'
# Example: df = pd.read_csv('HT_Alzheimers_APOE_Status_with_PCs.txt', delim_whitespace=True)

# Extract the FID and IID columns
fid_iid_df = merged_df1[['FID', 'IID']]

# Display the extracted columns
print(fid_iid_df)
fid_iid_df.to_csv("FID_IID_HT_Alzheimers_APOE_Status_with_PCs.txt", sep='\t', index=False)

           FID      IID
0      1194431  1194431
1      5547865  5547865
2      1382977  1382977
3      5209497  5209497
4      4911647  4911647
...        ...      ...
90809  6018335  6018335
90810  1434909  1434909
90811  4303056  4303056
90812  1547256  1547256
90813  5294387  5294387

[90814 rows x 2 columns]


In [29]:
!dx upload FID_IID_HT_Alzheimers_APOE_Status_with_PCs.txt

ID                                file-GvJ4k9QJbP2jx82QJY2Bj8qk
Class                             file
Project                           project-GkYf2zQJbP2Q3vFgf14863Gf
Folder                            /
Name                              FID_IID_HT_Alzheimers_APOE_Status_with_PCs.txt
State                             [33mclosing[0m
Visibility                        visible
Types                             -
Properties                        -
Tags                              -
Outgoing links                    -
Created                           Thu Oct 24 13:33:26 2024
Created by                        vidhu
 via the job                      job-GvJ44y0JbP2xg8kByKY6vPfB
Last modified                     Thu Oct 24 13:33:27 2024
Media type                        
archivalState                     "live"
cloudAccount                      "cloudaccount-dnanexus"


In [30]:
!dx pwd

REGARDS_New:/


In [31]:
!head chr19_pgen.psam

head: cannot open 'chr19_pgen.psam' for reading: No such file or directory


In [6]:
!dx download chr19_pgen.psam



In [None]:
chr22_pgen.Pheno_New_final_Diabetes_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_Diabetes_1.ADRD.glm.logistic.hybrid

In [1]:
!dx download chr1_pgen.Pheno_New_final_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt.ADRD.glm.logistic.hybrid



In [2]:
!dx download chr2_pgen.Pheno_New_final_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt.ADRD.glm.logistic.hybrid



In [3]:
!dx download chr3_pgen.Pheno_New_final_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt.ADRD.glm.logistic.hybrid



In [4]:
!dx download chr4_pgen.Pheno_New_final_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt.ADRD.glm.logistic.hybrid



In [5]:
!dx download chr5_pgen.Pheno_New_final_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt.ADRD.glm.logistic.hybrid



In [6]:
!dx download chr6_pgen.Pheno_New_final_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt.ADRD.glm.logistic.hybrid



In [7]:
!dx download chr7_pgen.Pheno_New_final_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt.ADRD.glm.logistic.hybrid



In [8]:
!dx download chr8_pgen.Pheno_New_final_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt.ADRD.glm.logistic.hybrid



In [9]:
!dx download chr9_pgen.Pheno_New_final_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt.ADRD.glm.logistic.hybrid



In [10]:
!dx download chr10_pgen.Pheno_New_final_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt.ADRD.glm.logistic.hybrid



In [11]:
!dx download chr11_pgen.Pheno_New_final_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt.ADRD.glm.logistic.hybrid



In [12]:
!dx download chr12_pgen.Pheno_New_final_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt.ADRD.glm.logistic.hybrid



In [13]:
!dx download chr13_pgen.Pheno_New_final_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt.ADRD.glm.logistic.hybrid



In [14]:
!dx download chr14_pgen.Pheno_New_final_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt.ADRD.glm.logistic.hybrid



In [15]:
!dx download chr15_pgen.Pheno_New_final_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt.ADRD.glm.logistic.hybrid



In [16]:
!dx download chr16_pgen.Pheno_New_final_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt.ADRD.glm.logistic.hybrid



In [17]:
!dx download chr17_pgen.Pheno_New_final_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt.ADRD.glm.logistic.hybrid



In [18]:
!dx download chr18_pgen.Pheno_New_final_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt.ADRD.glm.logistic.hybrid



In [19]:
!dx download chr19_pgen.Pheno_New_final_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt.ADRD.glm.logistic.hybrid



In [20]:
!dx download chr20_pgen.Pheno_New_final_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt.ADRD.glm.logistic.hybrid



In [21]:
!dx download chr21_pgen.Pheno_New_final_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt.ADRD.glm.logistic.hybrid



In [22]:
!dx download chr22_pgen.Pheno_New_final_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt.ADRD.glm.logistic.hybrid



In [33]:
!dx download

usage: dx download [-h] [--env-help] [-o OUTPUT] [-f] [-r] [-a]
                   [--no-progress] [--lightweight]
                   [--symlink-max-tries SYMLINK_MAX_TRIES] [--unicode]
                   path [path ...]

Download the contents of a file object or multiple objects. Use "-o -" to
direct the output to stdout.

positional arguments:
  path                  Data object ID or name, or folder to download

optional arguments:
  -h, --help            show this help message and exit
  --env-help            Display help message for overriding environment
                        variables
  -o OUTPUT, --output OUTPUT
                        Local filename or directory to be used ("-" indicates
                        stdout output); if not supplied or a directory is
                        given, the object's name on the platform will be used,
                        along with any applicable extensions
  -f, --overwrite       Resume an interupted download if the local and remote


In [34]:
%%bash
head chr20_pgen.HT+DEM_VS_HT_TOWNSEND.Pheno.glm.logistic.hybrid

head: cannot open 'chr20_pgen.HT+DEM_VS_HT_TOWNSEND.Pheno.glm.logistic.hybrid' for reading: No such file or directory


CalledProcessError: Command 'b'head chr20_pgen.HT+DEM_VS_HT_TOWNSEND.Pheno.glm.logistic.hybrid\n'' returned non-zero exit status 1.

In [23]:
import pandas as pd
for CHR in range(1,23):
    print(CHR)
    AGE_SEX_MATCH_TEMP = pd.read_csv(f'chr{CHR}_pgen.Pheno_New_final_HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt.ADRD.glm.logistic.hybrid',
        delim_whitespace=True
    )
    if CHR==1:
        AGE_SEX_MATCH = AGE_SEX_MATCH_TEMP
    else:
        AGE_SEX_MATCH = pd.concat([AGE_SEX_MATCH,AGE_SEX_MATCH_TEMP])

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22


In [142]:
%%bash
head chr19_pgen.HT+DEM_VS_HT_TOWNSEND.Pheno.glm.logistic.hybrid

#CHROM	POS	ID	REF	ALT	A1	A1_CASE_CT	A1_CTRL_CT	CASE_ALLELE_CT	CTRL_ALLELE_CT	A1_FREQ	A1_CASE_FREQ	A1_CTRL_FREQ	FIRTH?	BETA	SE	P	ERRCODE
19	119135	19:119135:C:G	C	G	G	138.75	2153.68	11944	188162	0.0114561	0.0116167	0.0114459	N	0.0294587	0.135091	0.827378	.
19	125962	19:125962:T:A	T	A	A	141.168	2218.75	11944	188162	0.0117933	0.0118192	0.0117917	N	0.00873895	0.134434	0.94817	.
19	133816	19:133816:GTTCTC:G	GTTCTC	G	G	181.187	2684.76	11944	188162	0.0143222	0.0151697	0.0142684	N	0.137173	0.112182	0.221416	.
19	140292	19:140292:C:T	C	T	T	126.385	2043.92	11944	188162	0.0108458	0.0105815	0.0108626	N	-0.0638601	0.145922	0.661653	.
19	225487	19:225487:C:T	C	T	T	129.598	2225.85	11944	188162	0.011771	0.0108504	0.0118295	N	-0.102351	0.0961457	0.287085	.
19	230429	19:230429:C:T	C	T	T	176.327	2733.76	11944	188162	0.0145427	0.0147628	0.0145288	N	0.0256584	0.0935784	0.783938	.
19	231076	19:231076:TAAAAAG:T	TAAAAAG	T	T	494.923	7670.91	11944	188162	0.0408075	0.0414369	0.0407676	N	0.0188984	0.0504384	0.707

In [24]:
AGE_SEX_MATCH.to_csv("HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt", sep='\t', index=False)

In [25]:
!dx upload HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt

ID                                file-GxgGvk0JbP2VG03037f9Q24Q
Class                             file
Project                           project-GkYf2zQJbP2Q3vFgf14863Gf
Folder                            /
Name                              HT_Dem_remove_PD_VD_FTD_Hu_CJD_Vs_HT_New_sbp.txt
State                             [33mclosing[0m
Visibility                        visible
Types                             -
Properties                        -
Tags                              -
Outgoing links                    -
Created                           Fri Dec 27 15:57:24 2024
Created by                        vidhu
 via the job                      job-GxgF3f0JbP2pzzZF6vYjjb0v
Last modified                     Fri Dec 27 15:57:26 2024
Media type                        
archivalState                     "live"
cloudAccount                      "cloudaccount-dnanexus"


In [38]:
import pandas as pd

# Initialize an empty DataFrame to store the concatenated data
concatenated_df = pd.DataFrame()

# Loop through chromosome numbers 1 to 22
for i in range(1, 23):
    # Define the file name based on the chromosome number
    file_name = f'chr{i}_pgen.psam'
    
    # Read the .psam file into a DataFrame
    df = pd.read_csv(file_name, delim_whitespace=True)
    
    # Concatenate the current DataFrame with the accumulated DataFrame
    concatenated_df = pd.concat([concatenated_df, df], ignore_index=True)

# Save the concatenated DataFrame to a new .psam file
concatenated_df.to_csv('concatenated_chr1_22.psam', sep='\t', index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'chr1_pgen.psam'

In [32]:
!dx upload concatenated_chr1_22.psam

ID                                file-GpgpYzQJbP2ZQfxxZyXP2p1y
Class                             file
Project                           project-GkYf2zQJbP2Q3vFgf14863Gf
Folder                            /
Name                              concatenated_chr1_22.psam
State                             [33mclosing[0m
Visibility                        visible
Types                             -
Properties                        -
Tags                              -
Outgoing links                    -
Created                           Sun Aug  4 16:19:10 2024
Created by                        vidhu
 via the job                      job-Gpgjk40JbP2y77p042179KF3
Last modified                     Sun Aug  4 16:19:12 2024
Media type                        
archivalState                     "live"
cloudAccount                      "cloudaccount-dnanexus"


In [36]:
!grep 3684138 concatenated_chr1_22.psam

3684138	3684138	2.0
3684138	3684138	2.0
3684138	3684138	2.0
3684138	3684138	2.0
3684138	3684138	2.0
3684138	3684138	2.0
3684138	3684138	2.0
3684138	3684138	2.0
3684138	3684138	2.0
3684138	3684138	2.0
3684138	3684138	2.0
3684138	3684138	2.0
3684138	3684138	2.0
3684138	3684138	2.0
3684138	3684138	2.0
3684138	3684138	2.0
3684138	3684138	2.0
3684138	3684138	2.0
3684138	3684138	2.0
3684138	3684138	2.0
3684138	3684138	2.0
3684138	3684138	2.0
