In [2]:
import pandas as pd
import os


In [20]:
# input files
PHEN_PATH = "/cs/labs/michall/roeizucker/non_caucasian_run_2/african_ukbb_dataset.csv"
GWAS_PATH = "/cs/labs/michall/roeizucker/non_caucasian_run_2/GWAS"
PHENOTYPES = ["J45_african"]
RESULT_SUFFIX = "PHENO1.glm.logistic"

In [13]:
!ls {GWAS_PATH}

ls: cannot access '/cs/labs/michall/roeizucker/non_caucasian_run_2/GWAS_asian': No such file or directory


In [14]:
# script body (will be transferred to another file)

# script consts
PHEN_START = 2
VALUE_AT_END_OF_PHENOTYPES = 'const'
PLINK_PATH = "/cs/usr/nadavb/third_party/plink2"
BATCH_LENGTH = 250
MAX_JOBS_BEFORE_SUBMITTING_NEW_ARRAY = 200

# TODO: add verbous argument, and better explain the output
def create_files(gwas_path,phen_path,phenotypes):
    !mkdir {gwas_path}
    !mkdir {gwas_path}/results
    !mkdir {gwas_path}/phenotypes
    !cp /cs/labs/michall/roeizucker/10krun/runs/0:11/GWAS_delete_me/covariates.txt {gwas_path}/covariates.txt
    !cd {gwas_path}
    dataset = pd.read_csv(phen_path)
    for phenotype_col in phenotypes:
        print(phenotype_col)
        file_name = phenotype_col.lower().replace(' ', '_').replace('-', '_') + '.txt'
        values = dataset[['eid', 'eid', phenotype_col]].dropna()
        values.to_csv(os.path.join(f"{gwas_path}/phenotypes", file_name), \
            header = False, index = False, sep = '\t')

        # An array is created with all the values needed for running GWAS.
# The array will be sorted in batches for better file accessing
def create_the_data_file(phenotype_cols,gwas_path,data_file_path):
    values = []
    last_counter = 0
    cur_phen_counter = 0
    counter = 0
    for phen in phenotype_cols:
        base_path =  gwas_path
        for j in range(1,23):
            if os.path.exists(os.path.join(base_path ,f"results/{phen.lower()}_chr{j}.PHENO1.glm.linear")):
                continue
            # TODO: change location of plink files to be const
            values.append([counter,os.path.join(base_path, "phenotypes/"+ phen.lower() + ".txt"),
                           os.path.join(base_path, f"results/{phen.lower()}_chr{j}"),
                           f"/cs/labs/michall/roeizucker/plink_results/reduced_snps2/small_ch{j}"
                           ,os.path.join(base_path ,"covariates.txt"),j])
            counter+=1
        cur_phen_counter+=1
        if cur_phen_counter%BATCH_LENGTH == 0:
            values[last_counter:counter] = sorted(values[last_counter:counter],key = lambda x:x[-1])
            last_counter = counter
    values[last_counter:counter] = sorted(values[last_counter:counter],key = lambda x:x[-1])
    df = pd.DataFrame(values, columns=["unsorted_counter","phenotype_path","output_path","partial_chromosome_file_path", "covariates","chr"])
    df.to_csv(data_file_path)


def write_mediator_script(mediator_script_path,data_file_path,gwas_path):
    with open(mediator_script_path, "w") as mediator_file:
        mediator_file.write(f'''import pandas as pd
import sys
import os
import os.path
butch_num = int(sys.argv[1]) 
curr_task = int(sys.argv[2])
BATCH_LENGTH = {BATCH_LENGTH}
SKIP_POINT = {MAX_JOBS_BEFORE_SUBMITTING_NEW_ARRAY}
PLINK_PATH = "/cs/usr/nadavb/third_party/plink2"

data_file_path = "{data_file_path}"
df = pd.read_csv(data_file_path)
print(df.loc[butch_num * BATCH_LENGTH + curr_task])
val = df.loc[butch_num * BATCH_LENGTH + curr_task]
location = butch_num * BATCH_LENGTH + curr_task
if curr_task == SKIP_POINT and location < 30500:
    new_batch  = butch_num + 1
    if not os.path.isfile("{gwas_path}_" + str(new_batch) + "_flag"):
        os.system("sbatch --array=0-"+str(BATCH_LENGTH - 1)+" --mem=12g -c10 --time=3-0 --killable --requeue --wrap=\\"{gwas_path}master.sh "+str(new_batch)+"\\"")
if location < 30500:
    print(PLINK_PATH + " --bed " + df.loc[location].partial_chromosome_file_path + ".bed --bim " + df.loc[location].partial_chromosome_file_path + ".bim --fam "+df.loc[location].partial_chromosome_file_path + ".fam --pheno " + df.loc[location].phenotype_path + " --covar " + df.loc[location].covariates + "  --out  " + df.loc[location].output_path + " --1 --glm hide-covar --mac 20 --covar-variance-standardize --freq --threads 10 --memory 10000")
    os.system(PLINK_PATH + " --bed " + df.loc[location].partial_chromosome_file_path + ".bed --bim " + df.loc[location].partial_chromosome_file_path + ".bim --fam "+df.loc[location].partial_chromosome_file_path + ".fam --pheno " + df.loc[location].phenotype_path + " --covar " + df.loc[location].covariates + "  --out  " + df.loc[location].output_path + " --1 --glm hide-covar --mac 20 --covar-variance-standardize --freq --threads 10 --memory 10000")
    print("done!")
''')

def write_master_script(master_script_path,mediator_script_path,gwas_path):
    with open(master_script_path, "w") as master_file:
        master_file.write(f'''
FILE={gwas_path}_$1_flag
if ! test "$FILE" 
then 
    touch {gwas_path}_$1_flag
fi
python {mediator_script_path} $1 $SLURM_ARRAY_TASK_ID
''')
    !chmod 744 {master_script_path}

# set input 
# TODO: change so it is accepted as params
phen_path = PHEN_PATH
gwas_path = GWAS_PATH

# do stuff
phenotype_cols = list(pd.read_csv(PHEN_PATH))
phenotype_cols = phenotype_cols[2:phenotype_cols.index(VALUE_AT_END_OF_PHENOTYPES)]
if len (PHENOTYPES) > 0:
    phenotype_cols = PHENOTYPES
# TODO: chnage so names are consts
data_file_path = os.path.join(gwas_path, "data_file.csv")
mediator_script_path = os.path.join(gwas_path , "mediator.py")
master_script_path = os.path.join(gwas_path , "master.sh")

create_files(gwas_path,phen_path,phenotype_cols)
create_the_data_file(phenotype_cols,gwas_path,data_file_path)
write_mediator_script(mediator_script_path,data_file_path,gwas_path)
write_master_script(master_script_path,mediator_script_path,gwas_path)

# TODO: add output folder
print(f'''how to run:
sbatch --array=0-263 --mem=12g -c10 --time=3-0 --requeue --killable --wrap="{master_script_path} 0"''')

J45_asian
how to run:
sbatch --array=0-263 --mem=12g -c10 --time=3-0 --requeue --killable --wrap="/cs/labs/michall/roeizucker/non_caucasian_run_2/GWAS_asian/master.sh 0"


In [4]:
# master_script_path = priority_gwas_dir + "master.sh"

# (mediator_script_path,data_file_path,gwas_path)
master_script_path

'/cs/labs/michall/roeizucker/non_caucasian_run_2/GWAS_caucasian_reduced/master.sh'

In [5]:
print(f'''how to run:
sbatch --array=0-263 --mem=12g -c10 --time=3-0 --requeue --killable --wrap="{master_script_path} 0"''')

how to run:
sbatch --array=0-263 --mem=12g -c10 --time=3-0 --requeue --killable --wrap="/cs/labs/michall/roeizucker/non_caucasian_run_2/GWAS_caucasian_reduced/master.sh 0"


## combines GWAS results

In [19]:
import glob, os
# path = "/cs/labs/michall/roeizucker/10krun/runs/48:59/GWAS/results"
# os.chdir(path)
path = os.path.join(GWAS_PATH,"results")
# files = []
# for file in glob.glob("*.glm.*"):
#     files.append(file)
phens = PHENOTYPES
for phen in phens:
    dataframes = []
    for j in range(1,23):
#         change so logistic/liner is detrmined generically
        file_name = os.path.join(path,f"{phen}_chr{j}.{RESULT_SUFFIX}")
        df = pd.read_csv(file_name,sep="\t")
        dataframes.append(df)
#         print(df.head())
    master_df = pd.concat(dataframes)
    master_df.to_csv(f"{os.path.join(path,phen)}.csv",index=False)
    print(f"{os.path.join(path,phen)}.csv")

/cs/labs/michall/roeizucker/10krun/runs/84:95/GWAS/results/J45.csv


In [None]:
PHENOTYPES