#19_MOSAIC_loc_anc.py
#"imputes" local ancestry between markers to use local ancestry as a covariate in GEMMA on a SNP-by-SNP basis
#example input - python 19_loc_anc_to_covar.py --loc_anc local_anc_1_HCHS.csv --snpfile /home/angela/px_his_chol/ancestry_pipeline/HCHS/no_NativeAmerican-h/PrediXcan_SNPs/sep_pops/snpfile.1
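#expected inputs, as assumed by the code below:
#  --loc_anc: CSV with at least the columns 'bp', 'haplotype', and 'anc', where haplotype IDs end in "_A"/"_B" and anc is one of "NAT", "IBS", or "YRI"
#  --snpfile: whitespace-delimited, no header, columns rs, chr, cM, bp, A1, A2, with a file name ending in ".<chromosome>"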
import argparse
import numpy as np
import pandas as pd
import os
#during testing, comment out the argparse block below and use the hard-coded test inputs further down
parser = argparse.ArgumentParser()
parser.add_argument("--loc_anc", type = str, action = "store", dest = "loc_anc", required = True, help = "Path to file containing local ancestry converted from MOSAIC (output of 18_convert_MOSAIC_output.R)")
parser.add_argument("--output_prefix", type = str, action = "store", dest = "output_prefix", required = False, default = "MOSAIC_for_GEMMA", help = "Prefix for 'imputed' ancestry file")
parser.add_argument("--snpfile", type = str, action = "store", dest = "snpfile", required = True, help = "Path to snpfile used in HAPI-UR")
parser.add_argument("--sig_genes", type = str, action = "store", dest = "sig_genes", required = False, help = "List of genes to prune the list of SNPs around (1 Mb before and after)")
args = parser.parse_args()
print("Reading input files.")
local_anc = pd.read_csv(args.loc_anc, dtype={'bp':float})#, engine='python')
snpfile = pd.read_table(args.snpfile, delim_whitespace = True, header = None)
output_prefix = args.output_prefix
chr = int(args.snpfile.split(".")[-1]) #chromosome number is the suffix after the final "." in the snpfile name, e.g. snpfile.1
if args.sig_genes is None:
    sig_genes = [] #keep all SNPs. SIGNIFICANTLY SLOWER.
    print("No significant gene file provided, so all SNPs will be kept. This is significantly slower.")
else:
    sig_genes = np.atleast_1d(np.loadtxt(args.sig_genes, dtype = str)) #np.atleast_1d keeps a one-gene file from collapsing to a 0-d array
gene_start_end = pd.read_csv("/home/angela/px_his_chol/ancestry_pipeline/HCHS/no_NativeAmerican-h/PrediXcan_SNPs/sep_pops/100_ind/gene_start_end.csv") #hard-coded path; expects columns chr, gene, gene_name, start, end
#testing files
#in /home/angela/px_his_chol/ancestry_pipeline/HCHS/no_NativeAmerican-h/PrediXcan_SNPs/sep_pops/100_ind/
'''
local_anc = pd.read_csv("local_anc_1_HCHS_100000.csv", dtype={'bp':float}, engine='python')
snpfile = pd.read_table("snpfile.1", delim_whitespace = True, header = None)
gene_start_end = pd.read_csv("gene_start_end.csv")
output_prefix = "TEST_"
chr = 1
sig_genes = ["PABPC4", "TRIT1"]
'''
local_anc = local_anc[['bp', 'haplotype', 'anc']]
snpfile.columns = ['rs', 'chr', 'cM', 'bp', 'A1', 'A2']
snpfile = snpfile[['rs', 'bp']]
hap_list = local_anc['haplotype'].unique() #keep one of each hap
#or input fam file used to make GEMMA and go from there?
ind_list = []
for hap in hap_list:
    ind_list.append(hap[:-2]) #strip the trailing "_A"/"_B" to get the individual ID
ind_list = sorted(set(ind_list), key = ind_list.index) #remove duplicates but preserve order
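#equivalently, list(dict.fromkeys(ind_list)) also dedups while preserving first-seen order (Python 3.7+) and avoids the O(n^2) .index() lookups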
gene_start_end_chr = gene_start_end.loc[gene_start_end['chr'] == chr] #subset genes to only relevant chr
#prune gene_start_end_chr to just sig_genes
if len(sig_genes) == 0:
    keep_gene_start_end = gene_start_end_chr[['gene_name', 'start', 'end']].copy()
    print("Kept all genes on chr. " + str(chr) + ".")
else:
    keep_gene_start_end = gene_start_end_chr.loc[gene_start_end_chr['gene_name'].isin(sig_genes), ['gene_name', 'start', 'end']].copy()
    print("Kept the " + str(len(keep_gene_start_end.index)) + " significant genes on chr. " + str(chr) + ".")
#keep SNPs in snpfile only if they are w/in 1 Mb of start/end of sig genes
test_SNPs_snpfile = set(snpfile['bp'].tolist()) #set membership is faster than nested loops over a dataframe
if len(sig_genes) == 0:
    keep_SNP = list(test_SNPs_snpfile) #no gene filter, so keep every SNP
else:
    keep_SNP = []
    for SNP in test_SNPs_snpfile:
        for keep_gene_start_end_row in keep_gene_start_end.itertuples():
            if (SNP > (keep_gene_start_end_row[2] - 1000000)) and (SNP < (keep_gene_start_end_row[3] + 1000000)): #base pair of known SNP is w/in 1 Mb of the start or end of a gene
                keep_SNP.append(SNP)
keep_SNP = pd.DataFrame(keep_SNP)
keep_SNP.columns = ['bp']
keep_SNP = keep_SNP.merge(snpfile, on = 'bp', how = 'left')
keep_SNP = keep_SNP.sort_values('bp').drop_duplicates()
print("Kept " + str(len(keep_SNP)) + " SNPs from an original " + str(len(snpfile)) + " SNPs in the snpfile.")
#prune local_anc to relevant SNPs (this wasn't very useful in the example data)
test_local_anc_SNPs = set(local_anc['bp'].tolist()) #make local anc row bps into a set for speed
if len(sig_genes) == 0:
    keep_local_anc = list(test_local_anc_SNPs) #no gene filter, so keep every MOSAIC position
else:
    keep_local_anc = []
    for SNP in test_local_anc_SNPs:
        for keep_gene_start_end_row in keep_gene_start_end.itertuples():
            if (SNP > (keep_gene_start_end_row[2] - 1000000)) and (SNP < (keep_gene_start_end_row[3] + 1000000)): #base pair of MOSAIC SNP is w/in 1 Mb of the start or end of a gene
                keep_local_anc.append(SNP)
keep_local_anc = pd.DataFrame(keep_local_anc)
keep_local_anc.columns = ['bp']
keep_local_anc = keep_local_anc.merge(local_anc, on = 'bp', how = 'left')
keep_local_anc['bp'] = keep_local_anc['bp'].astype(float)
keep_local_anc['anc'] = keep_local_anc['anc'].astype('category')
#impute SNPs using .ffill()
#RESTRUCTURE SO YOU JUST APPEND INSTEAD OF MAKING A LARGE DATA FRAME
print("Starting SNP ancestry imputation and dosage file.")
anc_dosage_write = open(output_prefix + "_" + str(chr) + ".csv", "w") #"w" so a re-run overwrites instead of appending a second header and duplicate rows
anc_dosage_write.write("IID," + ",".join(keep_SNP['rs'].tolist()) + "\n")
progress_landmarks_ind = np.linspace(0, len(ind_list), 21, dtype = int).tolist()
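#np.linspace(0, len(ind_list), 21) gives 21 evenly spaced checkpoints, i.e. one per 5% of individuals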
num_ind = 0
for ind in list(ind_list): #iterate over a copy, since removing inds from ind_list mid-loop would otherwise skip elements (what part in here takes so long?)
    imputed_haplotypes = pd.DataFrame(columns=['anc', 'bp', 'haplotype', 'rs'])
    #FIRST HAPLOTYPE
    hap_A = ind + "_A"
    ind_haplotype_A = keep_local_anc.loc[keep_local_anc['haplotype'] == hap_A]
    if ind_haplotype_A.empty: #remove from ind_list to prevent further issues
        print(hap_A + " did not have sufficient coverage. Skipping haplotype and removing individual from further analyses.")
        ind_list.remove(ind)
        num_ind = num_ind + 1
        continue
    #impute ancestry for SNPs
    hap_SNP_A = pd.concat([ind_haplotype_A, keep_SNP]).sort_values('bp')
    hap_SNP_A['haplotype'] = hap_A
    hap_SNP_A['anc'] = hap_SNP_A['anc'].ffill().bfill() #fills NA from the nearest value before, then from the nearest value after for any leading NAs
    #try to keep it NA in between flips? but I don't know of a method that does that
    #unless I write one I guess - one possibility is sketched below
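    #a sketch of one way to keep NA between ancestry flips (not used by this script): fill forward
    #and backward separately and keep a call only where the two directions agree, so SNPs that sit
    #between two different ancestry tracts stay NA instead of inheriting the preceding tract
    #fwd = hap_SNP_A['anc'].ffill()
    #bwd = hap_SNP_A['anc'].bfill()
    #hap_SNP_A['anc'] = fwd.where(fwd == bwd) #NA wherever the two fills disagree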
    #remove non-SNPs (rows that came from keep_local_anc have NA in 'rs' and are dropped here)
    hap_SNP_A = hap_SNP_A.dropna(how = 'any', axis = 0)
    imputed_haplotypes = pd.concat([imputed_haplotypes, hap_SNP_A])
    #SECOND HAPLOTYPE
    hap_B = ind + "_B"
    ind_haplotype_B = keep_local_anc.loc[keep_local_anc['haplotype'] == hap_B]
    if ind_haplotype_B.empty: #remove from ind_list to prevent further issues
        print(hap_B + " did not have sufficient coverage. Skipping haplotype and removing individual from further analyses.")
        ind_list.remove(ind)
        num_ind = num_ind + 1
        continue
    #impute ancestry for SNPs
    hap_SNP_B = pd.concat([ind_haplotype_B, keep_SNP]).sort_values('bp')
    hap_SNP_B['haplotype'] = hap_B
    hap_SNP_B['anc'] = hap_SNP_B['anc'].ffill().bfill() #fill in the same order as haplotype A for consistency
    #to keep NA between flips, see the sketch under haplotype A
    #remove non-SNPs
    hap_SNP_B = hap_SNP_B.dropna(how = 'any', axis = 0)
    imputed_haplotypes = pd.concat([imputed_haplotypes, hap_SNP_B])
    ind_anc = hap_SNP_A.set_index('rs').join(hap_SNP_B.set_index('rs'), lsuffix = "_A", rsuffix = "_B")
    ind_anc = ind_anc[["anc_A", "anc_B"]]
    #now set ancestry "dosages"
    #first col is NAT, second is IBS, and third is YRI
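    #the if/elif chain below could equivalently count ancestries per SNP (a sketch of an
    #alternative, not what this script does; it also omits the NA case handled below):
    #from collections import Counter
    #counts = Counter([ind_anc_row[1], ind_anc_row[2]])
    #dosage = "\t".join(str(float(counts[anc])) for anc in ("NAT", "IBS", "YRI"))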
    anc_dosage = []
    for ind_anc_row in ind_anc.itertuples():
        if ind_anc_row[1] == "NAT" and ind_anc_row[2] == "NAT":
            anc_dosage.append([ind_anc_row[0], "2.0\t0.0\t0.0"]) #both NAT
        elif ind_anc_row[1] == "IBS" and ind_anc_row[2] == "IBS":
            anc_dosage.append([ind_anc_row[0], "0.0\t2.0\t0.0"]) #both IBS
        elif ind_anc_row[1] == "YRI" and ind_anc_row[2] == "YRI":
            anc_dosage.append([ind_anc_row[0], "0.0\t0.0\t2.0"]) #both YRI
        elif (ind_anc_row[1] == "NAT" and ind_anc_row[2] == "IBS") or (ind_anc_row[1] == "IBS" and ind_anc_row[2] == "NAT"):
            anc_dosage.append([ind_anc_row[0], "1.0\t1.0\t0.0"]) #one NAT, one IBS
        elif (ind_anc_row[1] == "NAT" and ind_anc_row[2] == "YRI") or (ind_anc_row[1] == "YRI" and ind_anc_row[2] == "NAT"):
            anc_dosage.append([ind_anc_row[0], "1.0\t0.0\t1.0"]) #one NAT, one YRI
        elif (ind_anc_row[1] == "YRI" and ind_anc_row[2] == "IBS") or (ind_anc_row[1] == "IBS" and ind_anc_row[2] == "YRI"):
            anc_dosage.append([ind_anc_row[0], "0.0\t1.0\t1.0"]) #one IBS, one YRI
        else:
            anc_dosage.append([ind_anc_row[0], "NA\tNA\tNA"]) #who knows
    anc_dosage_df = pd.DataFrame(anc_dosage)
    anc_dosage_df.columns = ["rs", ind]
    anc_dosage_df = anc_dosage_df.drop_duplicates()
    anc_dosage_list = anc_dosage_df[ind].tolist()
    anc_dosage_write.write(ind + "," + ",".join(anc_dosage_list) + "\n")
    num_ind = num_ind + 1
    if num_ind in progress_landmarks_ind: #print progress in 5% increments
        progress = progress_landmarks_ind.index(num_ind)
        print("SNP ancestry covariate conversion is " + str(progress * 5) + "% complete.")
#write list of SNPs to use in GEMMA (-snps)
anc_dosage_write.close()
#make list of inds to use in future analyses
os.system("cut -d, -f1 " + output_prefix + "_" + str(chr) + ".csv > " + output_prefix + "_" + str(chr) + "_ind.txt")
os.system("sed '1d' " + output_prefix + "_" + str(chr) + "_ind.txt > tmpfile.txt; mv tmpfile.txt " + output_prefix + "_" + str(chr) + "_ind.txt")
print("Completed writing individual and SNP ancestry file to " + output_prefix + "_" + str(chr) + "_ind.txt, and " + output_prefix + "_" + str(chr) + ".csv. Have a nice day!")