In [35]:
import os

import pandas as pd

In [36]:
def prep_IBD_PC_for_plink(filepath):
    """
    Prepare an IBD (inflammatory bowel disease) or PC (prostate cancer)
    summary-statistics file for PLINK.

    Steps:
      1. read the whitespace-delimited file at ``filepath``;
      2. remove all rows that do not have an ``hm_variant_id``;
      3. remove all columns containing only NaN's;
      4. keep only the needed columns and rename them to
         CHR, POS, SNP, BETA, SE and P;
      5. write the result, tab-separated and without the index, to
         ``<name>_prepped.txt`` in the same directory as the input, where
         ``<name>`` is the input file name up to its first dot.

    Rows that survive step 2 are written even if one of the kept columns
    holds a missing value for them (it is written as an empty field).

    Parameters
    ----------
    filepath : str
        Path to the summary-statistics file.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``hm_variant_id`` or one of the needed columns is absent from the
        file (or is removed in step 3 because it is completely empty).
    """
    column_names = {
        "chromosome": "CHR",
        "base_pair_location": "POS",
        "variant_id": "SNP",
        "beta": "BETA",
        "standard_error": "SE",
        "p_value": "P",
    }

    # Raw string: "\s" is not a valid escape sequence in a plain str literal.
    data = pd.read_csv(filepath, sep=r"\s+")
    data = data[data["hm_variant_id"].notna()]
    # how="all" drops only completely empty columns. The default (how="any")
    # would drop a needed column as soon as it held a single NaN, and the
    # column selection below would then fail with a KeyError.
    data = data.dropna(axis=1, how="all")
    data = data[list(column_names)].rename(columns=column_names)

    # Strip the extension(s) from the file name only; splitting the whole path
    # on "." breaks for paths such as "./file.tsv" or directories with a dot.
    directory, filename = os.path.split(filepath)
    stem = filename.split(".", 1)[0]
    data.to_csv(os.path.join(directory, stem + "_prepped.txt"), sep="\t", index=False)

All files are run through PLINK with the following flags:

plink
--bfile 1000G/1000G.EUR
--clump filepath
--clump-p1 5e-8
--clump-r2 0.10
--clump-kb 1000
--out (IBD/PC/Height)_5e8_1000kb_r2_01


In [37]:
def prep_for_depict(filepath):
    """
    Reduce a whitespace-delimited file to its ``SNP`` column, as needed for
    Depict. Run for the IBD, PC and Height files.

    The SNP identifiers are written one per line, without header or index, to
    ``<name>_prepped4depict.txt`` in the same directory as the input, where
    ``<name>`` is the input file name up to its first dot.

    Parameters
    ----------
    filepath : str
        Path to the input file; it must contain a ``SNP`` column.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If the file has no ``SNP`` column.
    """
    # Raw string: "\s" is not a valid escape sequence in a plain str literal.
    data = pd.read_csv(filepath, sep=r"\s+")
    snps = data["SNP"]

    # Strip the extension(s) from the file name only; splitting the whole path
    # on "." breaks for paths such as "./file.clumped" or dotted directories.
    directory, filename = os.path.split(filepath)
    stem = filename.split(".", 1)[0]
    snps.to_csv(
        os.path.join(directory, stem + "_prepped4depict.txt"),
        sep="\t",
        index=False,
        header=False,
    )

In [38]:
def resample_Height(filepath, n=200, random_state=None):
    """
    Take a random subset of SNPs from the Height SNP list for use in Depict.

    The Height SNP file is too large for Depict; it was found that Depict
    could not handle more than 200 SNPs. This function draws ``n`` random rows
    (without replacement) from the file and saves them, tab-separated and
    without header or index, to ``Height_<n>.txt`` (``Height_200.txt`` by
    default) in the same directory as the input.

    Parameters
    ----------
    filepath : str
        Path to a tab-separated SNP list without a header line, such as the
        ``*_prepped4depict.txt`` file written by ``prep_for_depict``.
    n : int, default 200
        Number of SNPs to draw. If the file holds fewer than ``n`` rows, all
        rows are written (in shuffled order) instead of raising an error.
    random_state : int or None, default None
        Seed passed to ``DataFrame.sample``; set it to make the draw
        reproducible.

    Returns
    -------
    None
    """
    # header=None: the SNP list has no header row, so the default would
    # consume the first SNP as a column name and it could never be sampled.
    data = pd.read_csv(filepath, sep="\t", header=None)
    data = data.sample(n=min(n, len(data)), axis=0, random_state=random_state)

    # Join directory and file name properly; plain string concatenation lost
    # the separator ("dir" + "Height_200.txt"). For a bare file name the
    # directory is "" and the output lands in the working directory.
    out_path = os.path.join(os.path.dirname(filepath), "Height_%d.txt" % n)
    data.to_csv(out_path, sep="\t", index=False, header=False)

In [39]:
# Input summary-statistics files for IBD and PC, passed to
# prep_IBD_PC_for_plink in the cells below.
# NOTE(review): these are absolute, machine-specific paths; point them at your
# own copy of the data before running the notebook.
IBD = "C:/Users/Pin/Desktop/Execute_methods/editsnpfiles/28067908-GCST004131-EFO_0003767.h.tsv"
PC = "C:/Users/Pin/Desktop/Execute_methods/editsnpfiles/29892016-GCST006085-EFO_0001663.h.tsv"

In [40]:
# Prepare the IBD file for PLINK; writes
# 28067908-GCST004131-EFO_0003767_prepped.txt next to the input file.
prep_IBD_PC_for_plink(IBD)

In [6]:
# Prepare the PC file for PLINK; writes
# 29892016-GCST006085-EFO_0001663_prepped.txt next to the input file.
prep_IBD_PC_for_plink(PC)

  prep_IBD_PC_for_plink(PC)


In [8]:
# Clumped PLINK results per trait: the `--out` prefix used in the PLINK run
# plus the `.clumped` extension. These are relative paths, so the files are
# looked up in the notebook's working directory.
IBD2 = "IBD_5e8_1000kb_r2_01.clumped"
PC2 = "PC_5e8_1000kb_r2_01.clumped"
Height = "Height_5e8_1000kb_r2_01.clumped"

In [18]:
# Reduce each clumped PLINK result to a bare SNP list for Depict
# (one *_prepped4depict.txt file per trait).
for clumped_path in (IBD2, PC2, Height):
    prep_for_depict(clumped_path)

In [19]:
# SNP list written by prep_for_depict(Height) in the previous cell.
Height2 = "Height_5e8_1000kb_r2_01_prepped4depict.txt"

In [None]:
# Draw the random subset of Height SNPs for Depict; writes Height_200.txt.
# The draw is random, so re-running this cell produces a different subset.
resample_Height(Height2)