In [96]:
import pandas as pd
import os

In [109]:
# old phenotypes from Nadav's original run of PWAS

specs = [
    {
        'name': 'Height',
        'source': 'field',
        'field_id': 50,
        'field_type': 'continuous',
    },
    {
        'name': 'BMI',
        'source': 'field',
        'field_id': 21001,
        'field_type': 'continuous',
    },
    {
        'name': 'Waist circumference',
        'source': 'field',
        'field_id': 48,
        'field_type': 'continuous',
    },
    {
        'name': 'Hip circumference',
        'source': 'field',
        'field_id': 49,
        'field_type': 'continuous',
    },
    {
        'name': 'Diastolic blood pressure',
        'source': 'field',
        'field_id': 4079,
        'field_type': 'continuous',
    },
    {
        'name': 'Systolic blood pressure',
        'source': 'field',
        'field_id': 4080,
        'field_type': 'continuous',
    },
    {
        'name': 'Menarche (age at onset)',
        'source': 'field',
        'field_id': 2714,
        'field_type': 'continuous',
        'sex_filter': 'F',
    },
    {
        'name': 'Menopause (age at onset)',
        'source': 'field',
        'field_id': 3581,
        'field_type': 'continuous',
        'sex_filter': 'F',
    },
    {
        'name': 'Intraocular pressure',
        'source': 'aggregation',
        'subspecs': [
            {
                'name': 'Intraocular pressure (right)',
                'source': 'field',
                'field_id': 5254,
                'field_type': 'continuous',
            },
            {
                'name': 'Intraocular pressure (left)',
                'source': 'field',
                'field_id': 5262,
                'field_type': 'continuous',
            },
        ],
        'aggregation_function': lambda right, left: pd.concat([right, left], axis = 1).max(axis = 1), # We take the maximum between the two fields.
    },
    {
        'name': 'Hand grip strength',
        'source': 'aggregation',
        'subspecs': [
            {
                'name': 'Hand grip strength (left)',
                'source': 'field',
                'field_id': 46,
                'field_type': 'continuous',
            },
            {
                'name': 'Hand grip strength (right)',
                'source': 'field',
                'field_id': 47,
                'field_type': 'continuous',
            },
        ],
        'aggregation_function': lambda left, right: pd.concat([left, right], axis = 1).max(axis = 1), # We take the maximum between the two fields.
    },
    {
        'name': 'Male-pattern baldness',
        'source': 'field',
        'field_id': 2395,
        'field_type': 'set',
        'one_hot_encoding': True,
        'sex_filter': 'M',
    },
    {
        'name': 'Platelet count',
        'source': 'field',
        'field_id': 30080,
        'field_type': 'continuous',
    },
    {
        'name': 'Monocyte count',
        'source': 'field',
        'field_id': 30130,
        'field_type': 'continuous',
    },
    {
        'name': 'Red blood cell count',
        'source': 'field',
        'field_id': 30010,
        'field_type': 'continuous',
    },
    {
        'name': 'White blood cell count',
        'source': 'field',
        'field_id': 30000,
        'field_type': 'continuous',
    },
    {
        'name': 'High light scatter reticulocyte count',
        'source': 'field',
        'field_id': 30300,
        'field_type': 'continuous',
    },
    {
        'name': 'Eosinophil counts',
        'source': 'field',
        'field_id': 30150,
        'field_type': 'continuous',
    },
    {
        'name': 'Reticulocyte count',
        'source': 'field',
        'field_id': 30250,
        'field_type': 'continuous',
    },
    {
        'name': 'Lymphocyte counts',
        'source': 'field',
        'field_id': 30120,
        'field_type': 'continuous',
    },
    {
        'name': 'Mean platelet volume',
        'source': 'field',
        'field_id': 30100,
        'field_type': 'continuous',
    },
    {
        'name': 'Mean corpuscular volume',
        'source': 'field',
        'field_id': 30040,
        'field_type': 'continuous',
    },
    {
        'name': 'Mean corpuscular hemoglobin',
        'source': 'field',
        'field_id': 30050,
        'field_type': 'continuous',
    },
    {
        'name': 'Neutrophil count',
        'source': 'field',
        'field_id': 30140,
        'field_type': 'continuous',
    },
    {
        'name': 'Red cell distribution width',
        'source': 'field',
        'field_id': 30070,
        'field_type': 'continuous',
    },
    {
        'name': 'Platelet distribution width',
        'source': 'field',
        'field_id': 30110,
        'field_type': 'continuous',
    },
    {
        'name': 'High light scatter reticulocyte percentage of red cells',
        'source': 'field',
        'field_id': 30290,
        'field_type': 'continuous',
    },
    {
        'name': 'Breast cancer',
        'source': 'ICD-10',
        'codings': ['C50'],
        'sex_filter': 'F',
    },
    {
        'name': 'Epithelial ovarian cancer',
        'source': 'ICD-10',
        'codings': ['C56'],
        'sex_filter': 'F',
    },
    {
        'name': 'Prostate cancer',
        'source': 'ICD-10',
        'codings': ['C61'],
        'sex_filter': 'M',
    },
    {
        'name': 'Colorectal cancer',
        'source': 'ICD-10',
        'codings': ['C18'],
    },
    {
        'name': 'Lung cancer',
        'source': 'ICD-10',
        'codings': ['C34'],
    },
    {
        'name': 'Chronic lymphocytic leukemia',
        'source': 'ICD-10',
        'codings': ['C91'],
    },
    {
        'name': 'Pancreatic cancer',
        'source': 'ICD-10',
        'codings': ['C25'],
    },
    {
        'name': 'Melanoma',
        'source': 'ICD-10',
        'codings': ['C43'],
    },
    {
        'name': 'Schizophrenia',
        'source': 'ICD-10',
        'codings': ['F20'],
    },
    {
        'name': 'Bipolar disorder',
        'source': 'ICD-10',
        'codings': ['F31'],
    },
    {
        'name': 'Major depressive disorder',
        'source': 'ICD-10',
        'codings': ['F33'],
    },
    {
        'name': 'Parkinson\'s disease',
        'source': 'ICD-10',
        'codings': ['G20'],
    },
    {
        'name': 'Stroke',
        'source': 'ICD-10',
        'codings': ['I63'],
    },
    {
        'name': 'Hypertension',
        'source': 'ICD-10',
        'codings': ['I10'],
    },
    {
        'name': 'Sudden cardiac arrest',
        'source': 'ICD-10',
        'codings': ['I46'],
    },
    {
        'name': 'Type 1 diabetes',
        'source': 'ICD-10',
        'codings': ['E10'],
    },
    {
        'name': 'Type 2 diabetes',
        'source': 'ICD-10',
        'codings': ['E11'],
    },
    {
        'name': 'Systemic sclerosis',
        'source': 'ICD-10',
        'codings': ['M34'],
    },
    {
        'name': 'Multiple sclerosis',
        'source': 'ICD-10',
        'codings': ['G35'],
    },
    {
        'name': 'Systemic lupus erythematosus',
        'source': 'ICD-10',
        'codings': ['M32'],
    },
    {
        'name': 'Rheumatoid arthritis',
        'source': 'ICD-10',
        'codings': ['M05', 'M06'],
    },
    {
        'name': 'Asthma',
        'source': 'ICD-10',
        'codings': ['J45'],
    },
    {
        'name': 'Crohn\'s and colitis',
        'source': 'ICD-10',
        'codings': ['K50', 'K51'],
    },
]


In [122]:
PROJECT_PATH = "/cs/labs/michall/roeizucker/amos/continuous_run/"

In [110]:
# Convert the phenotype specs so they contain Sex filtering data

final_specs = []
new_specs = list(filter(lambda x: x["source"] == 'field',specs))
for spec in new_specs:
    if "baldness" in spec["name"]:
        continue
    spec["name"] = spec["name"].replace(" ","_")
    spec["name"] = spec["name"].replace("(","")
    spec["name"] = spec["name"].replace(")","")
    male = spec.copy()
    male['sex_filter'] = "M"
    male['name']+="_M"
    female = spec.copy()
    female['sex_filter'] = "F"
    female['name']+="_F"
    both = spec.copy()
    final_specs.append(male)
    final_specs.append(female)
    final_specs.append(both)
# new_specs[0].copy()["s"] = "b"
# new_specs[0]

In [111]:
len(final_specs)

69

In [112]:
# Copy the output to the phenotype_specs.py file
final_specs

[{'name': 'Height_M',
  'source': 'field',
  'field_id': 50,
  'field_type': 'continuous',
  'sex_filter': 'M'},
 {'name': 'Height_F',
  'source': 'field',
  'field_id': 50,
  'field_type': 'continuous',
  'sex_filter': 'F'},
 {'name': 'Height',
  'source': 'field',
  'field_id': 50,
  'field_type': 'continuous'},
 {'name': 'BMI_M',
  'source': 'field',
  'field_id': 21001,
  'field_type': 'continuous',
  'sex_filter': 'M'},
 {'name': 'BMI_F',
  'source': 'field',
  'field_id': 21001,
  'field_type': 'continuous',
  'sex_filter': 'F'},
 {'name': 'BMI',
  'source': 'field',
  'field_id': 21001,
  'field_type': 'continuous'},
 {'name': 'Waist_circumference_M',
  'source': 'field',
  'field_id': 48,
  'field_type': 'continuous',
  'sex_filter': 'M'},
 {'name': 'Waist_circumference_F',
  'source': 'field',
  'field_id': 48,
  'field_type': 'continuous',
  'sex_filter': 'F'},
 {'name': 'Waist_circumference',
  'source': 'field',
  'field_id': 48,
  'field_type': 'continuous'},
 {'name': 'Hi

In [19]:
! cp /cs/labs/michall/roeizucker/amos/continuous_run/phenotype_specs.py /cs/labs/michall/roeizucker/amos/continuous_run/phenotype_specs_old.py

In [114]:
# Create scripts for running PWAS for each phenotype in the new specs. Script output can be used to run on the cluster

for spec in final_specs:
    dir_name = "/cs/labs/michall/roeizucker/amos/continuous_run/" +spec["name"]
    if not os.path.exists(dir_name):
        !mkdir '{dir_name}'
    file_count = len([name for name in os.listdir(dir_name) if os.path.isfile(os.path.join(dir_name, name))])
    if file_count == 17924:
        continue
    file_content = 'source ~/my_storage/temp_virt_env/bin/activate\npwas_test_genes --resolve-quasi-complete-covariate-separation --remove-multicollinear-covariate --dataset-file=/cs/labs/michall/roeizucker/amos/continuous_run/ukbb_dataset.csv --gene-effect-scores-dir=/cs/labs/michall/roeizucker/virt_env_install_test/ukbb_imputation_gene_effect_scores/ --per-gene-pwas-results-dir=/cs/labs/michall/roeizucker/amos/continuous_run/'+ spec["name"] + ' --sample-id-col=eid --phenotype-col=\''+ spec["name"] + '\' --covariate-cols-json-file=/cs/labs/michall/roeizucker/amos/continuous_run/ukbb_covariate_columns.json --task-index-env-variable=SLURM_ARRAY_TASK_ID --total-tasks-env-variable=SLURM_ARRAY_TASK_COUNT\n'
    bash_file_name = "/cs/labs/michall/roeizucker/amos/continuous_run/run_" +spec["name"] + ".sh"
    with open(bash_file_name,'w') as bash_file:
        bash_file.write(file_content)
    !chmod 744 '{bash_file_name}'
    print("#",bash_file_name)
    print(f'sbatch --array=0-150 --mem=15g -c1 --time=1-0 --killable --requeue --wrap="{bash_file_name}"')
    print(f"echo {bash_file_name} running")
    print("sleep 60m")


# /cs/labs/michall/roeizucker/amos/continuous_run/run_Hip_circumference_F.sh
sbatch --array=0-150 --mem=15g -c1 --time=1-0 --killable --requeue --wrap="/cs/labs/michall/roeizucker/amos/continuous_run/run_Hip_circumference_F.sh"
echo /cs/labs/michall/roeizucker/amos/continuous_run/run_Hip_circumference_F.sh running
sleep 60m
# /cs/labs/michall/roeizucker/amos/continuous_run/run_Hip_circumference.sh
sbatch --array=0-150 --mem=15g -c1 --time=1-0 --killable --requeue --wrap="/cs/labs/michall/roeizucker/amos/continuous_run/run_Hip_circumference.sh"
echo /cs/labs/michall/roeizucker/amos/continuous_run/run_Hip_circumference.sh running
sleep 60m
# /cs/labs/michall/roeizucker/amos/continuous_run/run_Diastolic_blood_pressure_M.sh
sbatch --array=0-150 --mem=15g -c1 --time=1-0 --killable --requeue --wrap="/cs/labs/michall/roeizucker/amos/continuous_run/run_Diastolic_blood_pressure_M.sh"
echo /cs/labs/michall/roeizucker/amos/continuous_run/run_Diastolic_blood_pressure_M.sh running
sleep 60m
# /cs/l

# /cs/labs/michall/roeizucker/amos/continuous_run/run_White_blood_cell_count.sh
sbatch --array=0-150 --mem=15g -c1 --time=1-0 --killable --requeue --wrap="/cs/labs/michall/roeizucker/amos/continuous_run/run_White_blood_cell_count.sh"
echo /cs/labs/michall/roeizucker/amos/continuous_run/run_White_blood_cell_count.sh running
sleep 60m
# /cs/labs/michall/roeizucker/amos/continuous_run/run_High_light_scatter_reticulocyte_count_M.sh
sbatch --array=0-150 --mem=15g -c1 --time=1-0 --killable --requeue --wrap="/cs/labs/michall/roeizucker/amos/continuous_run/run_High_light_scatter_reticulocyte_count_M.sh"
echo /cs/labs/michall/roeizucker/amos/continuous_run/run_High_light_scatter_reticulocyte_count_M.sh running
sleep 60m
# /cs/labs/michall/roeizucker/amos/continuous_run/run_High_light_scatter_reticulocyte_count_F.sh
sbatch --array=0-150 --mem=15g -c1 --time=1-0 --killable --requeue --wrap="/cs/labs/michall/roeizucker/amos/continuous_run/run_High_light_scatter_reticulocyte_count_F.sh"
echo /cs/la

# /cs/labs/michall/roeizucker/amos/continuous_run/run_Red_cell_distribution_width_M.sh
sbatch --array=0-150 --mem=15g -c1 --time=1-0 --killable --requeue --wrap="/cs/labs/michall/roeizucker/amos/continuous_run/run_Red_cell_distribution_width_M.sh"
echo /cs/labs/michall/roeizucker/amos/continuous_run/run_Red_cell_distribution_width_M.sh running
sleep 60m
# /cs/labs/michall/roeizucker/amos/continuous_run/run_Red_cell_distribution_width_F.sh
sbatch --array=0-150 --mem=15g -c1 --time=1-0 --killable --requeue --wrap="/cs/labs/michall/roeizucker/amos/continuous_run/run_Red_cell_distribution_width_F.sh"
echo /cs/labs/michall/roeizucker/amos/continuous_run/run_Red_cell_distribution_width_F.sh running
sleep 60m
# /cs/labs/michall/roeizucker/amos/continuous_run/run_Red_cell_distribution_width.sh
sbatch --array=0-150 --mem=15g -c1 --time=1-0 --killable --requeue --wrap="/cs/labs/michall/roeizucker/amos/continuous_run/run_Red_cell_distribution_width.sh"
echo /cs/labs/michall/roeizucker/amos/contin

In [130]:
# combine results
for spec in final_specs:
    dir_name = "/cs/labs/michall/roeizucker/amos/continuous_run/" +spec["name"]
    results_dir = PROJECT_PATH + "results/"
#     if not os.path.exists(dir_name):
#         !mkdir '{dir_name}'
    print(f"combine_pwas_results --genes-file=/cs/usr/roeizucker/my_storage/virt_env_install_test/genes_hg19.csv --per-gene-pwas-results-dir={dir_name} --results-file={results_dir + spec['name']}.csv")
#     file_count = len([name for name in os.listdir(dir_name) if os.path.isfile(os.path.join(dir_name, name))])
#     if file_count == 17924:
#         continue
#     file_content = 'source ~/my_storage/temp_virt_env/bin/activate\npwas_test_genes --resolve-quasi-complete-covariate-separation --remove-multicollinear-covariate --dataset-file=/cs/labs/michall/roeizucker/amos/continuous_run/ukbb_dataset.csv --gene-effect-scores-dir=/cs/labs/michall/roeizucker/virt_env_install_test/ukbb_imputation_gene_effect_scores/ --per-gene-pwas-results-dir=/cs/labs/michall/roeizucker/amos/continuous_run/'+ spec["name"] + ' --sample-id-col=eid --phenotype-col=\''+ spec["name"] + '\' --covariate-cols-json-file=/cs/labs/michall/roeizucker/amos/continuous_run/ukbb_covariate_columns.json --task-index-env-variable=SLURM_ARRAY_TASK_ID --total-tasks-env-variable=SLURM_ARRAY_TASK_COUNT\n'
#     bash_file_name = "/cs/labs/michall/roeizucker/amos/continuous_run/run_" +spec["name"] + ".sh"
#     with open(bash_file_name,'w') as bash_file:
#         bash_file.write(file_content)
#     !chmod 744 '{bash_file_name}'
#     print("#",bash_file_name)
#     print(f'sbatch --array=0-150 --mem=15g -c1 --time=1-0 --killable --requeue --wrap="{bash_file_name}"')
#     print(f"echo {bash_file_name} running")
#     print("sleep 60m")


combine_pwas_results --genes-file=/cs/usr/roeizucker/my_storage/virt_env_install_test/genes_hg19.csv --per-gene-pwas-results-dir=/cs/labs/michall/roeizucker/amos/continuous_run/Height_M --results-file=/cs/labs/michall/roeizucker/amos/continuous_run/results/Height_M.csv
combine_pwas_results --genes-file=/cs/usr/roeizucker/my_storage/virt_env_install_test/genes_hg19.csv --per-gene-pwas-results-dir=/cs/labs/michall/roeizucker/amos/continuous_run/Height_F --results-file=/cs/labs/michall/roeizucker/amos/continuous_run/results/Height_F.csv
combine_pwas_results --genes-file=/cs/usr/roeizucker/my_storage/virt_env_install_test/genes_hg19.csv --per-gene-pwas-results-dir=/cs/labs/michall/roeizucker/amos/continuous_run/Height --results-file=/cs/labs/michall/roeizucker/amos/continuous_run/results/Height.csv
combine_pwas_results --genes-file=/cs/usr/roeizucker/my_storage/virt_env_install_test/genes_hg19.csv --per-gene-pwas-results-dir=/cs/labs/michall/roeizucker/amos/continuous_run/BMI_M --results-f

In [129]:
!head /cs/usr/roeizucker/my_storage/virt_env_install_test/genes_hg19.csv

,uniprot_id,symbol,name,refseq_ids,chr,cds_start,cds_end
0,A0A075B6H7,IGKV3-7,immunoglobulin kappa variable 3-7 (non-functional),['NG_000834'],2,89277987,89278503
1,A0A075B6H9,IGLV4-69,immunoglobulin lambda variable 4-69,['NG_000002'],22,22385392,22385870
2,A0A075B6I0,IGLV8-61,immunoglobulin lambda variable 8-61,['NG_000002'],22,22453156,22453622
3,A0A075B6I1,IGLV4-60,immunoglobulin lambda variable 4-60,['NG_000002'],22,22516592,22517074
4,A0A075B6I4,IGLV10-54,immunoglobulin lambda variable 10-54,['NG_000002'],22,22569197,22569660
5,A0A075B6I6,IGLV1-50,immunoglobulin lambda variable 1-50 (non-functional),['NG_000002'],22,22681709,22682172
6,A0A075B6J1,IGLV5-37,immunoglobulin lambda variable 5-37,['NG_000002'],22,22781881,22782371
7,A0A075B6J6,IGLV3-22,immunoglobulin lambda variable 3-22,['NG_000002'],22,23046814,23047307
8,A0A075B6K5,IGLV3-9,immunoglobulin lambda variable 3-9,['NG_000002'],22,23161622,23162253


In [45]:
count = 0
for spec in final_specs:
    dir_name = "/cs/labs/michall/roeizucker/amos/continuous_run/" +spec["name"]
    file_count = len([name for name in os.listdir(dir_name) if os.path.isfile(os.path.join(dir_name, name))])
    if file_count == 17924:
        count+=1
print(count)

10


In [115]:
len(final_specs)

69

## create plink format files

### create pheotype files

In [118]:
PHEN_START = 2
PHEN_END = 83
MAX_JOBS_BEFORE_SUBMITTING_NEW_ARRAY = 200
# TODO: add const of array length
PLINK_PATH = "/cs/usr/nadavb/third_party/plink2"
priority_gwas_dir = "/cs/labs/michall/roeizucker/amos/continuous_run/GWAS/"
data_file_path = priority_gwas_dir + "data_file_2.csv"
mediator_script_path = priority_gwas_dir + "mediator.py"
master_script_path = priority_gwas_dir + "master.sh"
df = pd.read_csv(DISCRIPTION_FILE)
BATCH_LENGTH = 250


In [121]:
!head {data_file_path} -n 5

,unsorted_counter,phenotype_path,output_path,partial_chromosome_file_path,covariates,chr
0,0,/cs/labs/michall/roeizucker/amos/continuous_run/GWAS/phenotypes/menarche_age_at_onset_m.txt,/cs/labs/michall/roeizucker/amos/continuous_run/GWAS/results/menarche_age_at_onset_m_chr1,/cs/labs/michall/roeizucker/plink_results/reduced_snps2/small_ch1,/cs/labs/michall/roeizucker/amos/continuous_run/GWAS/covariates.txt,1
1,23,/cs/labs/michall/roeizucker/amos/continuous_run/GWAS/phenotypes/menopause_age_at_onset_m.txt,/cs/labs/michall/roeizucker/amos/continuous_run/GWAS/results/menopause_age_at_onset_m_chr1,/cs/labs/michall/roeizucker/plink_results/reduced_snps2/small_ch1,/cs/labs/michall/roeizucker/amos/continuous_run/GWAS/covariates.txt,1
2,1,/cs/labs/michall/roeizucker/amos/continuous_run/GWAS/phenotypes/menarche_age_at_onset_m.txt,/cs/labs/michall/roeizucker/amos/continuous_run/GWAS/results/menarche_age_at_onset_m_chr2,/cs/labs/michall/roeizucker/plink_results/reduced_snps2/small_ch2,/cs/labs/

In [79]:
def create_files(gwas_path,phen_path,phenotypes):
    !mkdir {gwas_path}
    !mkdir {gwas_path}/results
    !mkdir {gwas_path}/phenotypes
    !cp /cs/labs/michall/roeizucker/10krun/runs/0:11/GWAS_delete_me/covariates.txt {gwas_path}/covariates.txt
    !cd {gwas_path}
    dataset = pd.read_csv(phen_path)
    for phenotype_col in phenotypes:
        print(phenotype_col)
        file_name = phenotype_col.lower().replace(' ', '_').replace('-', '_') + '.txt'
        values = dataset[['eid', 'eid', phenotype_col]].dropna()
        values.to_csv(os.path.join(f"{gwas_path}/phenotypes", file_name), \
            header = False, index = False, sep = '\t')
phen_path = "/cs/labs/michall/roeizucker/amos/continuous_run/ukbb_dataset.csv"
gwas_path = "/cs/labs/michall/roeizucker/amos/continuous_run/GWAS/"
phenotype_cols = list(pd.read_csv(phen_path))[PHEN_START:PHEN_END]
phenotype_cols
create_files(gwas_path,phen_path,phenotype_cols)

mkdir: cannot create directory ‘/cs/labs/michall/roeizucker/amos/continuous_run/GWAS/’: File exists
mkdir: cannot create directory ‘/cs/labs/michall/roeizucker/amos/continuous_run/GWAS//results’: File exists
mkdir: cannot create directory ‘/cs/labs/michall/roeizucker/amos/continuous_run/GWAS//phenotypes’: File exists
Height_M
Height_F
Height
BMI_M
BMI_F
BMI
Waist_circumference_M
Waist_circumference_F
Waist_circumference
Hip_circumference_M
Hip_circumference_F
Hip_circumference
Diastolic_blood_pressure_M
Diastolic_blood_pressure_F
Diastolic_blood_pressure
Systolic_blood_pressure_M
Systolic_blood_pressure_F
Systolic_blood_pressure
Menarche_age_at_onset_M
Menarche_age_at_onset_F
Menarche_age_at_onset
Menopause_age_at_onset_M
Menopause_age_at_onset_F
Menopause_age_at_onset
Male-pattern_baldness_M (1.0)
Male-pattern_baldness_M (2.0)
Male-pattern_baldness_M (3.0)
Male-pattern_baldness_M (4.0)
Male-pattern_baldness_F (1.0)
Male-pattern_baldness_F (2.0)
Male-pattern_baldness_F (3.0)
Male-patte

### create the data file

In [103]:
# An array is created with all the values needed for running GWAS.
# The array will be sorted in batches for better file accessing
values = []
last_counter = 0
cur_phen_counter = 0
counter = 0
for spec in final_specs:
    base_path =  "/cs/labs/michall/roeizucker/amos/continuous_run/GWAS/"
    for j in range(1,23):
        if os.path.exists(base_path + f"results/{spec['name'].lower()}_chr{j}.PHENO1.glm.linear"):
            continue
        values.append([counter,base_path+ "phenotypes/"+ spec['name'].lower() + ".txt",
                       base_path + f"results/{spec['name'].lower()}_chr{j}",
                       f"/cs/labs/michall/roeizucker/plink_results/reduced_snps2/small_ch{j}"
                       ,base_path +"covariates.txt",j])
        counter+=1
    cur_phen_counter+=1
    if cur_phen_counter%BATCH_LENGTH == 0:
        values[last_counter:counter] = sorted(values[last_counter:counter],key = lambda x:x[-1])
        last_counter = counter
values[last_counter:counter] = sorted(values[last_counter:counter],key = lambda x:x[-1])
df = pd.DataFrame(values, columns=["unsorted_counter","phenotype_path","output_path","partial_chromosome_file_path", "covariates","chr"])
df.to_csv(data_file_path)

In [105]:
values

[[0,
  '/cs/labs/michall/roeizucker/amos/continuous_run/GWAS/phenotypes/menarche_age_at_onset_m.txt',
  '/cs/labs/michall/roeizucker/amos/continuous_run/GWAS/results/menarche_age_at_onset_m_chr1',
  '/cs/labs/michall/roeizucker/plink_results/reduced_snps2/small_ch1',
  '/cs/labs/michall/roeizucker/amos/continuous_run/GWAS/covariates.txt',
  1],
 [23,
  '/cs/labs/michall/roeizucker/amos/continuous_run/GWAS/phenotypes/menopause_age_at_onset_m.txt',
  '/cs/labs/michall/roeizucker/amos/continuous_run/GWAS/results/menopause_age_at_onset_m_chr1',
  '/cs/labs/michall/roeizucker/plink_results/reduced_snps2/small_ch1',
  '/cs/labs/michall/roeizucker/amos/continuous_run/GWAS/covariates.txt',
  1],
 [1,
  '/cs/labs/michall/roeizucker/amos/continuous_run/GWAS/phenotypes/menarche_age_at_onset_m.txt',
  '/cs/labs/michall/roeizucker/amos/continuous_run/GWAS/results/menarche_age_at_onset_m_chr2',
  '/cs/labs/michall/roeizucker/plink_results/reduced_snps2/small_ch2',
  '/cs/labs/michall/roeizucker/amos

In [92]:
# script_file.write(f"{PLINK_PATH} --bed /cs/labs/michall/roeizucker/plink_results/reduced_snps2/small_ch{j}.bed --bim /cs/labs/michall/roeizucker/plink_results/reduced_snps2/small_ch{j}.bim --fam /cs/labs/michall/roeizucker/plink_results/reduced_snps2/small_ch{j}.fam --pheno {GWAS_PATH}/phenotypes/{phenotypes[i].lower()}.txt --covar {GWAS_PATH}/covariates.txt --out {GWAS_PATH}/results/{phenotypes[i]}_chr{j} --1 --glm hide-covar --mac 20 --covar-variance-standardize --freq --threads 10 --memory 10000\n")
val = df.loc[3]["unsorted_counter"]

print(f"{PLINK_PATH} --bed {df.loc[3].partial_chromosome_file_path}.bed --bim {df.loc[3].partial_chromosome_file_path}.bim --fam {df.loc[3].partial_chromosome_file_path}.fam --pheno {df.loc[3].phenotype_path} --covar {df.loc[3].covariates} --out {df.loc[3].output_path} --1 --glm hide-covar --mac 20 --covar-variance-standardize --freq --threads 10 --memory 10000\n")
df

/cs/usr/nadavb/third_party/plink2 --bed /cs/labs/michall/roeizucker/plink_results/reduced_snps2/small_ch1.bed --bim /cs/labs/michall/roeizucker/plink_results/reduced_snps2/small_ch1.bim --fam /cs/labs/michall/roeizucker/plink_results/reduced_snps2/small_ch1.fam --pheno /cs/labs/michall/roeizucker/amos/continuous_run/GWAS/phenotypes/bmi_m.txt --covar /cs/labs/michall/roeizucker/amos/continuous_run/GWAS/covariates.txt --out /cs/labs/michall/roeizucker/amos/continuous_run/GWAS/results/bmi_m_chr1 --1 --glm hide-covar --mac 20 --covar-variance-standardize --freq --threads 10 --memory 10000



Unnamed: 0,unsorted_counter,phenotype_path,output_path,partial_chromosome_file_path,covariates,chr
0,0,/cs/labs/michall/roeizucker/amos/continuous_ru...,/cs/labs/michall/roeizucker/amos/continuous_ru...,/cs/labs/michall/roeizucker/plink_results/redu...,/cs/labs/michall/roeizucker/amos/continuous_ru...,1
1,22,/cs/labs/michall/roeizucker/amos/continuous_ru...,/cs/labs/michall/roeizucker/amos/continuous_ru...,/cs/labs/michall/roeizucker/plink_results/redu...,/cs/labs/michall/roeizucker/amos/continuous_ru...,1
2,44,/cs/labs/michall/roeizucker/amos/continuous_ru...,/cs/labs/michall/roeizucker/amos/continuous_ru...,/cs/labs/michall/roeizucker/plink_results/redu...,/cs/labs/michall/roeizucker/amos/continuous_ru...,1
3,66,/cs/labs/michall/roeizucker/amos/continuous_ru...,/cs/labs/michall/roeizucker/amos/continuous_ru...,/cs/labs/michall/roeizucker/plink_results/redu...,/cs/labs/michall/roeizucker/amos/continuous_ru...,1
4,88,/cs/labs/michall/roeizucker/amos/continuous_ru...,/cs/labs/michall/roeizucker/amos/continuous_ru...,/cs/labs/michall/roeizucker/plink_results/redu...,/cs/labs/michall/roeizucker/amos/continuous_ru...,1
...,...,...,...,...,...,...
1513,1429,/cs/labs/michall/roeizucker/amos/continuous_ru...,/cs/labs/michall/roeizucker/amos/continuous_ru...,/cs/labs/michall/roeizucker/plink_results/redu...,/cs/labs/michall/roeizucker/amos/continuous_ru...,22
1514,1451,/cs/labs/michall/roeizucker/amos/continuous_ru...,/cs/labs/michall/roeizucker/amos/continuous_ru...,/cs/labs/michall/roeizucker/plink_results/redu...,/cs/labs/michall/roeizucker/amos/continuous_ru...,22
1515,1473,/cs/labs/michall/roeizucker/amos/continuous_ru...,/cs/labs/michall/roeizucker/amos/continuous_ru...,/cs/labs/michall/roeizucker/plink_results/redu...,/cs/labs/michall/roeizucker/amos/continuous_ru...,22
1516,1495,/cs/labs/michall/roeizucker/amos/continuous_ru...,/cs/labs/michall/roeizucker/amos/continuous_ru...,/cs/labs/michall/roeizucker/plink_results/redu...,/cs/labs/michall/roeizucker/amos/continuous_ru...,22


### create scripts

In [93]:
with open(mediator_script_path, "w") as mediator_file:
    mediator_file.write(f'''import pandas as pd
import sys
import os
import os.path
butch_num = int(sys.argv[1]) 
curr_task = int(sys.argv[2])
BATCH_LENGTH = {BATCH_LENGTH}
SKIP_POINT = {MAX_JOBS_BEFORE_SUBMITTING_NEW_ARRAY}
PLINK_PATH = "/cs/usr/nadavb/third_party/plink2"

data_file_path = "{data_file_path}"
df = pd.read_csv(data_file_path)
print(df.loc[butch_num * BATCH_LENGTH + curr_task])
val = df.loc[butch_num * BATCH_LENGTH + curr_task]
location = butch_num * BATCH_LENGTH + curr_task
if curr_task == SKIP_POINT and location < 30500:
    new_batch  = butch_num + 1
    if not os.path.isfile("{priority_gwas_dir}_" + str(new_batch) + "_flag"):
        os.system("sbatch --array=0-"+str(BATCH_LENGTH - 1)+" --mem=12g -c10 --time=3-0 --killable --requeue --wrap=\\"{priority_gwas_dir}master.sh "+str(new_batch)+"\\"")
if location < 30500:
    print(PLINK_PATH + " --bed " + df.loc[location].partial_chromosome_file_path + ".bed --bim " + df.loc[location].partial_chromosome_file_path + ".bim --fam "+df.loc[location].partial_chromosome_file_path + ".fam --pheno " + df.loc[location].phenotype_path + " --covar " + df.loc[location].covariates + "  --out  " + df.loc[location].output_path + " --1 --glm hide-covar --mac 20 --covar-variance-standardize --freq --threads 10 --memory 10000")
    os.system(PLINK_PATH + " --bed " + df.loc[location].partial_chromosome_file_path + ".bed --bim " + df.loc[location].partial_chromosome_file_path + ".bim --fam "+df.loc[location].partial_chromosome_file_path + ".fam --pheno " + df.loc[location].phenotype_path + " --covar " + df.loc[location].covariates + "  --out  " + df.loc[location].output_path + " --1 --glm hide-covar --mac 20 --covar-variance-standardize --freq --threads 10 --memory 10000")
''')

    




In [107]:
print(PLINK_PATH + " --bed " + df.loc[0].partial_chromosome_file_path + ".bed --bim " + df.loc[0].partial_chromosome_file_path + ".bim --fam "+df.loc[0].partial_chromosome_file_path + ".fam --pheno " + df.loc[0].phenotype_path + " --covar " + df.loc[0].covariates + "  --out  " + df.loc[0].output_path + " --1 --glm hide-covar --mac 20 --covar-variance-standardize --freq --threads 10 --memory 10000")

/cs/usr/nadavb/third_party/plink2 --bed /cs/labs/michall/roeizucker/plink_results/reduced_snps2/small_ch1.bed --bim /cs/labs/michall/roeizucker/plink_results/reduced_snps2/small_ch1.bim --fam /cs/labs/michall/roeizucker/plink_results/reduced_snps2/small_ch1.fam --pheno /cs/labs/michall/roeizucker/amos/continuous_run/GWAS/phenotypes/menarche_age_at_onset_m.txt --covar /cs/labs/michall/roeizucker/amos/continuous_run/GWAS/covariates.txt  --out  /cs/labs/michall/roeizucker/amos/continuous_run/GWAS/results/menarche_age_at_onset_m_chr1 --1 --glm hide-covar --mac 20 --covar-variance-standardize --freq --threads 10 --memory 10000


In [94]:
with open(master_script_path, "w") as master_file:
    master_file.write(f'''
FILE={priority_gwas_dir}_$1_flag
if ! test "$FILE" 
then 
    touch {priority_gwas_dir}_$1_flag
fi
python {mediator_script_path} $1 $SLURM_ARRAY_TASK_ID
''')
!chmod 744 {master_script_path}

In [None]:
# how to run
# sbatch --array=0-263 --mem=12g -c10 --time=3-0 --requeue --killable --wrap="/cs/labs/michall/roeizucker/amos/continuous_run/GWAS/master.sh 0"

In [156]:
master_script_path

'/cs/labs/michall/roeizucker/amos/continuous_run/GWAS/master.sh'

In [145]:
# combine_gwas_results
# !ls {/cs/labs/michall/roeizucker/amos/continuous_run/GWAS/results}
import glob, os
path = "/cs/labs/michall/roeizucker/amos/continuous_run/GWAS/results/"
os.chdir(path)
files = []
for file in glob.glob("*.glm.linear"):
    files.append(file)


In [162]:
phens = set()
for file in files:
#     print(file.split("_chr")[0])
    phens.add(file.split("_chr")[0])
for phen in phens:
    if os.path.exists(f"{path+phen}.csv"):
#         print(f"{path+phen}.csv")
        continue
    stopped = False
    dataframes = []
    for j in range(1,23):
        file_name = os.path.join(f"{path}",f"{phen}_chr{j}.PHENO1.glm.linear")
        if not os.path.exists(file_name):
            print(file_name)
            stopped = True
            break
        df = pd.read_csv(file_name,sep="\t")
        dataframes.append(df)
#         print(df.head())
    if stopped:
        continue
    master_df = pd.concat(dataframes)
    master_df.to_csv(f"{path+phen}.csv")
#     print(master_df.head())
#     input()
    
#         print(f"{path}/{phen}_chr{j}.PHENO1.glm.linear")

/cs/labs/michall/roeizucker/amos/continuous_run/GWAS/results/menopause_age_at_onset_f_chr3.PHENO1.glm.linear
/cs/labs/michall/roeizucker/amos/continuous_run/GWAS/results/menarche_age_at_onset_chr3.PHENO1.glm.linear
/cs/labs/michall/roeizucker/amos/continuous_run/GWAS/results/menopause_age_at_onset_chr4.PHENO1.glm.linear


In [141]:
!head /cs/labs/michall/roeizucker/amos/continuous_run/GWAS/results/mean_corpuscular_hemoglobin_m_chr9.PHENO1.glm.linear

#CHROM	POS	ID	REF	ALT	A1	TEST	OBS_CT	BETA	SE	T_STAT	P
9	116837	rs528874495	A	G	A	ADD	150222	0.0904957	0.161734	0.559534	0.575798
9	116841	rs550119004	A	G	A	ADD	150340	NA	NA	NA	NA
9	116848	rs532235124	TA	T	TA	ADD	149953	-0.724427	0.363151	-1.99484	0.0460623
9	116861	rs143746049	A	G	A	ADD	150318	NA	NA	NA	NA
9	116897	rs559143484	T	G	T	ADD	150331	-0.236314	0.710789	-0.332467	0.739537
9	116931	rs535147968	A	G	A	ADD	150339	-0.470826	0.870426	-0.540914	0.588568
9	116973	rs148746035	A	C	A	ADD	149946	-0.0153754	0.12511	-0.122895	0.902191
9	117050	rs528293604	T	G	T	ADD	150336	1.05097	0.778559	1.34989	0.177052
9	117125	rs201298466	G	T	G	ADD	150336	0.0714161	1.23098	0.0580157	0.953736
