In [58]:
#Importing important modules
import pandas as pd
import numpy as np
import Bio
import os
from Bio import Entrez, SeqIO
import itertools
import argparse
import math
from Bio import Entrez
import xmltodict
from pprint import pprint
import torch
from torch import nn
import h5py
from Multi_specto_class import *
from Multi_specto_funcs import *
Entrez.email  = "pradluzog@gmail.com"
Entrez.api_key = "98ad62666b4bd2dc831f1824727d74d67c08"

class SNP:
    
    def __init__(self,rsid,position,chromosome):
        self.rsid = rsid
        self.position = position
        self.chr = chromosome
    

        
    def check_ld_snps(self,dataset,window = 1000):
        start_position = self.position - window + 1
        end_position = self.position + window
        dataset = dataset[dataset['Chromosome'] == self.chr]
        def extract_neighbour_snps(start_position, end_position, dataset):
            neighbour_snps = []
            for index,row in dataset.iterrows():
                if start_position <= dataset.loc[index,'Position'] <= end_position:
                    neighbour_snps.append(dataset.loc[index,'MarkerName'])
                else:
                    continue
            return neighbour_snps
    
        self.snps_in_window = extract_neighbour_snps(start_position,end_position,dataset)
        return self.snps_in_window
    
    def obtain_snp_sequence(self,window = 1000):
        start_position = self.position - window +1
        end_position = self.position + window
        if int(self.chr) < 10:
            id_chr = "".join(["NC_00000",str(self.chr)])
        else:
            id_chr = "".join(["NC_0000",str(self.chr)])

        handle = Entrez.efetch(db="nucleotide",
                        id = id_chr,
                        rettype = "fasta",
                        strand = 1,
                        seq_start = start_position,
                        seq_stop  = end_position)
        record = SeqIO.read(handle,"fasta")
        self.snp_sequence = str(record.seq)
        return self.snp_sequence
    
    def obtain_all_comb_seq(self,dataset,sign_num = 'null', window = 1000):
        
        def all_snp_combinations(a):
            combinations = []
            for k in range(0,len(a)):
                t = list(itertools.combinations(a,k+1))
                combinations.extend(t)
            return combinations
        
        self.combinations = all_snp_combinations(self.snps_in_window)
        comb_names = ['_'.join(x) for x in self.combinations if len(x)> 0]
        comb_names.append('_'.join(['Ref',self.rsid]))
        combination_dataset = dataset[dataset['MarkerName'].isin(self.snps_in_window)]
        if sign_num != 'null':
            combination_dataset = combination_dataset.sort_values('Pvalue')
            combination_dataset = combination_dataset.iloc[0:int(sign_num),:]
        sequences = []
        
        for comb in self.combinations:
            seq_to_change = self.snp_sequence
            start_position = self.position - window + 1
            end_position = self.position + window
            for k in range(0,len(comb)):
                idx = combination_dataset['MarkerName'] == comb[k]
                pos = combination_dataset.loc[idx,'Position']
                allele = str(combination_dataset.loc[idx,'Non_Effect_allele'].values[0])
                net_pos = int(pos) - int(start_position)
                seq_to_change = seq_to_change[:net_pos-1] + allele + seq_to_change[net_pos:]
            sequences.append(seq_to_change)
        sequences.append(self.snp_sequence)
        sequences_named = dict(zip(comb_names,sequences))
        return sequences_named
                
                
    def seq_combination(self,dataset,sign_num = 'null',window = 1000):
        self.check_ld_snps(dataset,window)
        self.obtain_snp_sequence()
        self.combination_seq = self.obtain_all_comb_seq(dataset,sign_num,window)
        return self.combination_seq
        
    
    def __str__(self):
        return "The SNP in object is "+self.rsid

In [59]:
het_file = pd.read_csv('phased_rs429358_196608bp_het.csv')
snp_het_file = pd.read_csv('phased_rs429358_196608bp_het_snp_names.csv')
snp_het_file

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO
0,19,44810411,rs79789351,T,C,,,AC=104;AF=0.0643564;CM=0
1,19,44810426,,G,C,,,AC=2;AF=0.00123762;CM=4.67052e-05
2,19,44810476,rs148392215,C,T,,,AC=1;AF=0.000618812;CM=0.000204432
3,19,44810483,rs143434288,G,A,,,AC=1;AF=0.000618812;CM=0.000227023
4,19,44810485,rs147158872,T,C,,,AC=21;AF=0.012995;CM=0.000233477
...,...,...,...,...,...,...,...,...
3537,19,45006751,,A,T,,,AC=3;AF=0.00185644;CM=0.498073
3538,19,45006823,rs183825584,C,T,,,AC=1;AF=0.000618812;CM=0.498296
3539,19,45006841,,T,C,,,AC=1;AF=0.000618812;CM=0.498352
3540,19,45006863,,T,C,,,AC=1;AF=0.000618812;CM=0.49842


In [46]:
subject_ids = het_file.columns
main_seq_rs429358_196608bp = SNP('rs429358',44908684,19)
main_seq_rs429358_196608bp = main_seq_rs429358_196608bp.obtain_snp_sequence(window = 98304)

In [47]:
subjects = {}
for subj in subject_ids:
    Pat = main_seq_rs429358_196608bp
    Met = main_seq_rs429358_196608bp
    print("Running %s" %(subj))
    het_info = het_file.loc[:,subj]
    for i in range(0,len(het_file)):
        snp_pos = int(snp_het_file.loc[i,'POS']) - 44810381 + 1
        P,M = het_info[i].split('|')
        if P == '1':
            Pat = Pat[:snp_pos-1] + str(snp_het_file.loc[i,'ALT']) + Pat[snp_pos:]
        else:
            Pat = Pat[:snp_pos-1] + str(snp_het_file.loc[i,'REF']) + Pat[snp_pos:]
        if M == '1':
            Met = Met[:snp_pos-1] + str(snp_het_file.loc[i,'ALT']) + Met[snp_pos:]
        else:
            Met = Met[:snp_pos-1] + str(snp_het_file.loc[i,'REF']) + Met[snp_pos:]        
            
    subjects[subj] = {'P':Pat,'M': Met}
    print("Completed %s" %(subj))

Running 003_S_1057
Completed 003_S_1057
Running 003_S_0908
Completed 003_S_0908
Running 003_S_1122
Completed 003_S_1122
Running 136_S_0695
Completed 136_S_0695
Running 136_S_0873
Completed 136_S_0873
Running 130_S_0886
Completed 130_S_0886
Running 012_S_1133
Completed 012_S_1133
Running 003_S_1074
Completed 003_S_1074
Running 037_S_0501
Completed 037_S_0501
Running 027_S_0074
Completed 027_S_0074
Running 031_S_0351
Completed 031_S_0351
Running 051_S_1072
Completed 051_S_1072
Running 052_S_0951
Completed 052_S_0951
Running 016_S_1117
Completed 016_S_1117
Running 002_S_2010
Completed 002_S_2010
Running 128_S_2002
Completed 128_S_2002
Running 127_S_0622
Completed 127_S_0622
Running 014_S_0658
Completed 014_S_0658
Running 014_S_0558
Completed 014_S_0558
Running 002_S_0729
Completed 002_S_0729
Running 128_S_2003
Completed 128_S_2003
Running 127_S_0684
Completed 127_S_0684
Running 031_S_2022
Completed 031_S_2022
Running 072_S_2026
Completed 072_S_2026
Running 014_S_0520
Completed 014_S_0520


Completed 052_S_1250
Running 073_S_2225
Completed 073_S_2225
Running 941_S_1203
Completed 941_S_1203
Running 023_S_1190
Completed 023_S_1190
Running 006_S_1130
Completed 006_S_1130
Running 123_S_1300
Completed 123_S_1300
Running 129_S_2332
Completed 129_S_2332
Running 031_S_0294
Completed 031_S_0294
Running 072_S_0315
Completed 072_S_0315
Running 072_S_4007
Completed 072_S_4007
Running 073_S_2264
Completed 073_S_2264
Running 005_S_0448
Completed 005_S_0448
Running 116_S_1271
Completed 116_S_1271
Running 072_S_1380
Completed 072_S_1380
Running 020_S_1288
Completed 020_S_1288
Running 052_S_1352
Completed 052_S_1352
Running 130_S_0232
Completed 130_S_0232
Running 109_S_2278
Completed 109_S_2278
Running 099_S_0352
Completed 099_S_0352
Running 027_S_2336
Completed 027_S_2336
Running 029_S_1318
Completed 029_S_1318
Running 130_S_0285
Completed 130_S_0285
Running 052_S_1346
Completed 052_S_1346
Running 098_S_0269
Completed 098_S_0269
Running 022_S_1351
Completed 022_S_1351
Running 037_S_0467


Completed 033_S_4177
Running 033_S_4179
Completed 033_S_4179
Running 116_S_4195
Completed 116_S_4195
Running 123_S_4170
Completed 123_S_4170
Running 016_S_2284
Completed 016_S_2284
Running 141_S_0767
Completed 141_S_0767
Running 123_S_2055
Completed 123_S_2055
Running 005_S_4168
Completed 005_S_4168
Running 037_S_0539
Completed 037_S_0539
Running 051_S_1123
Completed 051_S_1123
Running 116_S_4209
Completed 116_S_4209
Running 002_S_4213
Completed 002_S_4213
Running 027_S_0835
Completed 027_S_0835
Running 116_S_4199
Completed 116_S_4199
Running 031_S_4194
Completed 031_S_4194
Running 031_S_4203
Completed 031_S_4203
Running 082_S_4208
Completed 082_S_4208
Running 068_S_4174
Completed 068_S_4174
Running 127_S_4198
Completed 127_S_4198
Running 022_S_4173
Completed 022_S_4173
Running 006_S_4150
Completed 006_S_4150
Running 073_S_4216
Completed 073_S_4216
Running 094_S_4162
Completed 094_S_4162
Running 023_S_4035
Completed 023_S_4035
Running 031_S_0830
Completed 031_S_0830
Running 099_S_4157


Completed 128_S_2130
Running 022_S_4291
Completed 022_S_4291
Running 029_S_4279
Completed 029_S_4279
Running 013_S_4395
Completed 013_S_4395
Running 135_S_4446
Completed 135_S_4446
Running 003_S_4354
Completed 003_S_4354
Running 016_S_4353
Completed 016_S_4353
Running 072_S_4391
Completed 072_S_4391
Running 130_S_4415
Completed 130_S_4415
Running 941_S_4376
Completed 941_S_4376
Running 018_S_2133
Completed 018_S_2133
Running 114_S_0378
Completed 114_S_0378
Running 003_S_4373
Completed 003_S_4373
Running 130_S_4405
Completed 130_S_4405
Running 032_S_4386
Completed 032_S_4386
Running 141_S_4426
Completed 141_S_4426
Running 003_S_4441
Completed 003_S_4441
Running 130_S_4417
Completed 130_S_4417
Running 002_S_4447
Completed 002_S_4447
Running 126_S_4458
Completed 126_S_4458
Running 072_S_4445
Completed 072_S_4445
Running 116_S_4453
Completed 116_S_4453
Running 072_S_4394
Completed 072_S_4394
Running 022_S_4266
Completed 022_S_4266
Running 114_S_4404
Completed 114_S_4404
Running 141_S_4438


In [50]:
subjects['003_S_1057']['M']

'GGCAGGTCTCAGCCAGCCTGTCCCTCCCACTTTCCTGAAACCTGCGCAGCCCTGTCCCCCCTCACCGCCCCCACACATCCATCCCGCCTAAGCTCCGGGCCCGGTCGCAGCCCAACACCCCAGCCCCTCACTCATGGGAGAGCGGAAGAGACTCAGGGCCTGCCGGTCTGAGTGGTGCAGGTGCAGAGAGAGGCAGAGACAGGGACAGAGATGGAGACCCAAGAGGAGAGACACACGCAAAGGTGGAGACCAGGATGGAGAGAAACATTTATCCATCATCACAGAGGTGGCCCTGCCCAGGGTGCTGGAGGCAAATGGCTGAGGCAGGCCTGGCCCTGCCCTGGGGGGTGGCGTGTGCTCTGTTGGGATAAGCAAAGGCTGGTGCAGTATTACCCCAGGAGTGTTCAGCGGAGGCCACAGGGCCCTGTGCAGCCCAGAGGAGGAGCCCTGAGCTCTGCAAGTGCAGGGAGAGATGGATTTGAGACCCAGAGATGCAGGGAGAAGGGATGGAGGGCGAAAGGATCTGAGGGAGGAGGGGCTGGGGCCTAGACTCCTGGGTCTGAGGGAGGAGGGGCTGGGGCCTGGACTCCTGGGTCTGAGGGAGGAGGGGCTGGGGCCTTGACTCCTGGGTCTGAGGGAGGAGGGGCTGGGGCCTAGACTCCTGAGTCTGAGGGAGGAGGGGACAGGGGCCCAGATTCCTGGGTCTGAGGGAGGAGGGGCTGGCACCTGGACTCCTGGGTCTGAGGGAGGAGGGGCTGGGACCTGGACGCTTGTGTCCTGGGGGAGAGGGGACCCTGTCTGGAGGGCTCTTCCTGTTGGTGTGGTGGATGATAGCTCAGTTGCTCTCTTGCAGATGCCCAGGCGGAGGTGCGCTTGTCTGTACCCCCGCTGGTGGAGGTGATGCGAGGAAAGTCTGTCATTCTGGAGTGCACCCCTACGGGAACCCACGACCATTATATGCTGGAATGGTTCCTTGTGAGCGCTTGGGGCTGGGGGGCC

In [54]:
seq_info = pd.DataFrame.from_dict(subjects, orient = 'index')

In [57]:
seq_info.to_csv('Sequences_phased_rs429358_196608bp.csv')

In [102]:
#igap = pd.read_csv('IGAP_stage_1.txt',sep='\t')
igap_filtered_snps = igap[igap['MarkerName'].isin(snp_filtered)]
igap_filtered_snps.to_csv('IGAP_Filtered_SNPs_for')

In [83]:
snps = snp_het_file['ID']
snps = [snp for snp in snps if 'rs' in str(snp)]

In [97]:
snp_filtered = [snp for snp in snps if 'rs' in str(snp)]

In [103]:
igap_filtered_snps

Unnamed: 0,Chromosome,Position,MarkerName,Effect_allele,Non_Effect_allele,Beta,SE,Pvalue
6663292,19,44810411,rs79789351,C,T,0.1386,0.0583,0.01740
6663293,19,44810587,rs2722708,G,A,-0.0235,0.0234,0.31650
6663294,19,44810704,rs2609883,A,G,-0.0234,0.0234,0.31700
6663295,19,44813582,rs77701217,T,C,0.0044,0.0408,0.91360
6663296,19,44814100,rs898175,C,T,-0.0179,0.0243,0.46060
...,...,...,...,...,...,...,...,...
6664029,19,45004823,rs2293166,A,G,0.0377,0.0157,0.01659
6664030,19,45005892,rs55882928,G,A,0.0140,0.0201,0.48390
6664031,19,45006534,rs11668999,A,T,0.0150,0.0200,0.45400
6664032,19,45006578,rs76381504,A,C,-0.0685,0.0566,0.22640
