# Expecto Model with multiple variants in input

Working with the Expecto model, but with each input sequence carrying one combination of the SNPs found inside the LD block.

## Working to get sequences by building SNP class

In [1]:
#Importing important modules
import pandas as pd
import numpy as np
import Bio
import os
from Bio import Entrez, SeqIO
import itertools
import argparse
import math
from Bio import Entrez
import xmltodict
from pprint import pprint
import torch
from torch import nn
import h5py
from Multi_specto_class import *
from Multi_specto_funcs import *
# NCBI E-utilities credentials.
# The API key is read from the ENTREZ_API_KEY environment variable so that the
# secret is not stored in (or shared with) the notebook. When the variable is
# unset the key is None, i.e. requests are sent without an API key.
Entrez.email = os.environ.get("ENTREZ_EMAIL", "pradluzog@gmail.com")
Entrez.api_key = os.environ.get("ENTREZ_API_KEY")

In [6]:
# Load the IGAP stage 1 table from a tab-delimited text file in the working
# directory. (On the author's mac the file lives at '~/Downloads/IGAP_stage_1.txt'.)
igap = pd.read_csv(
    'IGAP_stage_1.txt',
    delimiter='\t',
)

In [7]:
# Rank the IGAP SNPs by p-value, smallest first.
igap = igap.sort_values(by = ['Pvalue'], ascending=True)
# Keep the first 20 ranked rows.
# NOTE(review): the variable name says "top 10" but the slice takes 20 rows --
# confirm which is intended.
# .copy() makes this an independent DataFrame, so the later assignments to its
# 'Position' column do not act on a slice of `igap` (the cause of pandas'
# SettingWithCopyWarning).
top_10_igap_snps = igap.iloc[0:20,:].copy()
top_10_igap_snps

Unnamed: 0,Chromosome,Position,MarkerName,Effect_allele,Non_Effect_allele,Beta,SE,Pvalue
6665069,19,45394969,rs184017,G,T,0.9704,0.0208,0.0
6665078,19,45396665,rs59007384,T,G,0.9812,0.0208,0.0
6665060,19,45388500,rs283811,G,A,0.9726,0.0218,0.0
6665154,19,45427125,rs111789331,A,T,1.3811,0.0321,0.0
6665058,19,45388130,rs34342646,A,G,1.1044,0.0246,0.0
6665057,19,45387596,rs12972970,A,G,1.1072,0.0249,0.0
6665056,19,45387459,rs12972156,G,C,1.1399,0.0256,0.0
6665131,19,45415713,rs10414043,A,G,1.2958,0.0265,0.0
6665112,19,45406673,rs10119,A,G,0.822,0.0208,0.0
6665148,19,45422946,rs41377151,G,A,1.3511,0.0317,0.0


In [8]:

# Overwrite each SNP's 'Position' with the coordinate returned by an Entrez
# query against the 'SNP' database.
for index, row in top_10_igap_snps.iterrows():
    handle = Entrez.efetch(db='SNP', id=str(row['MarkerName']))
    try:
        response = handle.read()
    finally:
        # Release the connection even when reading the response fails.
        handle.close()
    # The last character of the payload is dropped before parsing.
    # NOTE(review): presumably a trailing character that breaks the XML parser -- confirm.
    response = response[:-1]
    response_o = xmltodict.parse(response)
    # CHRPOS is split on ':' and the second field is stored as the position.
    pos = response_o['DocumentSummary']['CHRPOS']
    pos = pos.split(':')[1]
    top_10_igap_snps.loc[index,'Position'] = int(pos)

top_10_igap_snps

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


Unnamed: 0,Chromosome,Position,MarkerName,Effect_allele,Non_Effect_allele,Beta,SE,Pvalue
6665069,19,44891712,rs184017,G,T,0.9704,0.0208,0.0
6665078,19,44893408,rs59007384,T,G,0.9812,0.0208,0.0
6665060,19,44885243,rs283811,G,A,0.9726,0.0218,0.0
6665154,19,44923868,rs111789331,A,T,1.3811,0.0321,0.0
6665058,19,44884873,rs34342646,A,G,1.1044,0.0246,0.0
6665057,19,44884339,rs12972970,A,G,1.1072,0.0249,0.0
6665056,19,44884202,rs12972156,G,C,1.1399,0.0256,0.0
6665131,19,44912456,rs10414043,A,G,1.2958,0.0265,0.0
6665112,19,44903416,rs10119,A,G,0.822,0.0208,0.0
6665148,19,44919689,rs41377151,G,A,1.3511,0.0317,0.0


In [9]:
#Defining a SNP class to perform simple LD filtering duties
class SNP:
    """A focal SNP plus helpers that build sequence variants around it.

    Attributes set by the constructor:
        rsid: marker name of the SNP.
        position: coordinate of the SNP on its chromosome.
        chr: chromosome of the SNP.

    Attributes set by later method calls:
        snps_in_window: marker names found by ``check_ld_snps``.
        snp_sequence: sequence string fetched by ``obtain_snp_sequence``.
        combinations: list of marker-name tuples built by ``obtain_all_comb_seq``.
        combination_seq: name -> sequence dict stored by ``seq_combination``.
    """

    def __init__(self,rsid,position,chromosome):
        self.rsid = rsid
        self.position = position
        self.chr = chromosome

    def _window_bounds(self, window):
        """Return the inclusive (start, end) coordinates of the window around the SNP."""
        return self.position - window + 1, self.position + window

    def check_ld_snps(self,dataset,window = 1000):
        """Collect the markers located in the window around this SNP.

        Selection is purely positional: rows of ``dataset`` on the same
        chromosome whose 'Position' lies in
        [position - window + 1, position + window] (both ends included).
        No LD statistic is computed here.

        Args:
            dataset: DataFrame with 'Chromosome', 'Position' and 'MarkerName' columns.
            window: half-width of the window.
        Returns:
            List of marker names in dataset row order (also stored on
            ``self.snps_in_window``).
        """
        start_position, end_position = self._window_bounds(window)
        same_chromosome = dataset[dataset['Chromosome'] == self.chr]
        positions = same_chromosome['Position']
        in_window = (positions >= start_position) & (positions <= end_position)
        self.snps_in_window = same_chromosome.loc[in_window, 'MarkerName'].tolist()
        return self.snps_in_window

    def obtain_snp_sequence(self,window = 1000):
        """Fetch the sequence of the window around this SNP through Entrez.

        The record id is 'NC_' followed by the chromosome number zero-padded to
        six digits, requested from the 'nucleotide' database on strand 1 between
        position - window + 1 and position + window.

        Args:
            window: half-width of the window.
        Returns:
            The sequence as a string (also stored on ``self.snp_sequence``).
        """
        start_position, end_position = self._window_bounds(window)
        id_chr = "NC_{:06d}".format(int(self.chr))

        handle = Entrez.efetch(db="nucleotide",
                        id = id_chr,
                        rettype = "fasta",
                        strand = 1,
                        seq_start = start_position,
                        seq_stop  = end_position)
        try:
            record = SeqIO.read(handle,"fasta")
        finally:
            # Release the connection even when parsing fails.
            handle.close()
        self.snp_sequence = str(record.seq)
        return self.snp_sequence

    def obtain_all_comb_seq(self,dataset,sign_num = 'null', window = 1000):
        """Build one sequence per non-empty combination of the SNPs in the window.

        Requires ``self.snps_in_window`` and ``self.snp_sequence`` to be set
        (see ``check_ld_snps`` and ``obtain_snp_sequence``), both obtained with
        the same ``window``.

        For every SNP of a combination the base at that SNP's position is
        replaced by its 'Non_Effect_allele' value from ``dataset``.
        NOTE(review): a one-character allele is assumed; a longer allele would
        change the sequence length and shift later substitutions.

        Args:
            dataset: DataFrame with 'MarkerName', 'Position',
                'Non_Effect_allele' and 'Pvalue' columns.
            sign_num: 'null' to use every SNP in the window, otherwise the
                number of SNPs with the smallest 'Pvalue' to keep.
            window: half-width of the window used to fetch ``self.snp_sequence``.
        Returns:
            Dict mapping '_'-joined marker names to the modified sequence, plus
            a final 'Ref_<rsid>' entry holding the unmodified sequence.
        Raises:
            ValueError: if a SNP position falls outside the fetched sequence.
        """
        combination_dataset = dataset[dataset['MarkerName'].isin(self.snps_in_window)]
        if sign_num != 'null':
            combination_dataset = combination_dataset.sort_values('Pvalue')
            combination_dataset = combination_dataset.iloc[0:int(sign_num),:]

        # marker -> (position, replacement allele); the first row wins if a
        # marker is listed more than once.
        substitutions = {}
        for marker, pos, allele in zip(combination_dataset['MarkerName'],
                                       combination_dataset['Position'],
                                       combination_dataset['Non_Effect_allele']):
            substitutions.setdefault(marker, (int(pos), str(allele)))

        # Only SNPs that still have a row after the optional p-value filter can
        # be substituted, so combinations are built from those alone.
        usable_snps = [snp for snp in self.snps_in_window if snp in substitutions]
        # The number of combinations is 2**n - 1 for n usable SNPs.
        self.combinations = [
            combo
            for size in range(1, len(usable_snps) + 1)
            for combo in itertools.combinations(usable_snps, size)
        ]

        start_position, _ = self._window_bounds(window)
        reference = self.snp_sequence
        sequences_named = {}
        for comb in self.combinations:
            seq_to_change = reference
            for marker in comb:
                pos, allele = substitutions[marker]
                # Index 0 of the fetched sequence is start_position, so the SNP
                # itself sits at index pos - start_position.
                net_pos = pos - int(start_position)
                if not 0 <= net_pos < len(seq_to_change):
                    raise ValueError(
                        "SNP %s at position %d lies outside the fetched sequence"
                        % (marker, pos))
                seq_to_change = seq_to_change[:net_pos] + allele + seq_to_change[net_pos + 1:]
            sequences_named['_'.join(comb)] = seq_to_change
        sequences_named['_'.join(['Ref',self.rsid])] = reference
        return sequences_named

    def seq_combination(self,dataset,sign_num = 'null',window = 1000):
        """Run the full pipeline: find SNPs in the window, fetch the sequence, build variants.

        The same ``window`` is used for all three steps so that SNP positions
        and the fetched sequence refer to the same interval.

        Returns:
            The dict produced by ``obtain_all_comb_seq`` (also stored on
            ``self.combination_seq``).
        """
        self.check_ld_snps(dataset,window)
        self.obtain_snp_sequence(window)
        self.combination_seq = self.obtain_all_comb_seq(dataset,sign_num,window)
        return self.combination_seq

    def __str__(self):
        return "The SNP in object is "+self.rsid
        
        
        
        
        

## Remodelling Chromatin.py from Expecto

In [10]:
#Important library calls
import argparse
import math
import torch
from torch import nn
import numpy as np
import pandas as pd
import h5py

In [11]:
# Settings for the chromatin prediction step. The same values are the default
# arguments of get_predicted_diff.
args_cuda = False   # GPU flag
batchSize = 32      # batch size
inputsize = 2000    # sequence length handed to the encoder
maxshift = 800      # NOTE(review): not used by any code visible in this notebook -- confirm it is still needed

In [12]:
#DL model
class LambdaBase(nn.Sequential):
    """Sequential container that additionally carries a callable.

    Args:
        fn: callable stored as ``self.lambda_func``.
        *args: child modules passed on to ``nn.Sequential``.
    """

    def __init__(self, fn, *args):
        super().__init__(*args)
        self.lambda_func = fn

    def forward_prepare(self, input):
        """Apply every child module to ``input`` independently.

        Returns:
            The list of per-module outputs, or ``input`` itself when the
            container holds no child modules.
        """
        results = [child(input) for child in self._modules.values()]
        if results:
            return results
        return input

class Lambda(LambdaBase):
    """LambdaBase whose forward pass applies ``lambda_func`` to the prepared input."""

    def forward(self, input):
        prepared = self.forward_prepare(input)
        return self.lambda_func(prepared)

class Beluga(nn.Module):
    """Convolutional network: six (1, 8) convolutions, then two linear layers and a sigmoid.

    The layers live in ``self.model`` as Sequential(conv stack, classifier,
    Sigmoid). The nesting and the order of the layers determine the parameter
    names, so they must stay as they are for a saved state dict to load.
    """

    def __init__(self):
        super().__init__()

        def flatten(x):
            # Collapse every dimension after the first into one.
            return x.view(x.size(0), -1)

        def as_row(x):
            # Turn a 1-D tensor into a single-row 2-D tensor; pass others through.
            if len(x.size()) == 1:
                return x.view(1, -1)
            return x

        conv_stack = nn.Sequential(
            nn.Conv2d(4, 320, (1, 8)),
            nn.ReLU(),
            nn.Conv2d(320, 320, (1, 8)),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.MaxPool2d((1, 4), (1, 4)),
            nn.Conv2d(320, 480, (1, 8)),
            nn.ReLU(),
            nn.Conv2d(480, 480, (1, 8)),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.MaxPool2d((1, 4), (1, 4)),
            nn.Conv2d(480, 640, (1, 8)),
            nn.ReLU(),
            nn.Conv2d(640, 640, (1, 8)),
            nn.ReLU(),
        )
        classifier = nn.Sequential(
            nn.Dropout(0.5),
            Lambda(flatten),
            nn.Sequential(Lambda(as_row), nn.Linear(67840, 2003)),
            nn.ReLU(),
            nn.Sequential(Lambda(as_row), nn.Linear(2003, 2002)),
        )
        self.model = nn.Sequential(conv_stack, classifier, nn.Sigmoid())

    def forward(self, x):
        """Run ``x`` through the whole stack and return the sigmoid outputs."""
        return self.model(x)



def encodeSeqs(seqs, inputsize=2000):
    """One-hot encode sequences, cropped around their centre to ``inputsize`` bases.

    Channels are ordered A, G, C, T (upper or lower case). 'N', 'H', 'n' and
    '-' are encoded as all zeros; any other character raises ``KeyError``.
    The result stacks the forward encodings followed by the reverse-complement
    encodings.

    Args:
        seqs: list of sequences (e.g. produced by fetchSeqs)
        inputsize: the number of basepairs to encode in the output
    Returns:
        boolean numpy array of dimension (2 x number of sequences) x 4 x inputsize;
        the first half holds the forward strands, the second half the reverse
        complements.
    """
    blank = np.asarray([0, 0, 0, 0])
    one_hot = {'N': blank, 'H': blank, 'n': blank, '-': blank}
    for channel, base in enumerate('AGCT'):
        column = np.zeros(4, dtype=int)
        column[channel] = 1
        one_hot[base] = column
        one_hot[base.lower()] = column

    encoded = np.zeros((len(seqs), 4, inputsize), np.bool_)
    for row, sequence in enumerate(seqs):
        # Trim the same amount from both ends so the middle part is kept.
        excess = len(sequence) - inputsize
        first = int(math.floor(excess / 2.0))
        last = int(math.floor(len(sequence) - excess / 2.0))
        for col, base in enumerate(sequence[first:last]):
            encoded[row, :, col] = one_hot[base]

    # Reversing the channel axis (A,G,C,T -> T,C,G,A) swaps each base with its
    # complement; reversing the last axis reverses the sequence.
    reverse_complement = encoded[:, ::-1, ::-1]
    return np.concatenate([encoded, reverse_complement], axis=0)

def get_predicted_diff(snp_comb_seq,inputsize = 2000, batchSize = 32, maxshift = 800, args_cuda = False):
    """
    Predict chromatin values for the reference and for every alternate sequence
    and return the log2-odds difference (reference minus alternate).

    Relies on the module-level ``model`` and on ``encodeSeqs``.

    Args:
        snp_comb_seq: A dictionary of sequences as string objects with A,T,G,C
                        characters. Keys containing 'ref' (in any letter case)
                        mark reference sequences; every other key is treated as
                        an alternate (a SNP or a combination of SNPs).
        inputsize: number of bases encoded per sequence (passed to encodeSeqs).
        batchSize: number of encoded sequences per forward pass.
        maxshift: accepted for interface compatibility; not used by this function.
        args_cuda: when True, each batch is moved to the GPU before prediction.
    Return:
            A dictionary mapping every non-reference key to
            log2(ref/(1-ref)) - log2(alt/(1-alt)). Each alternate is encoded as
            two rows (forward and reverse complement), so with a single
            reference sequence each array has two rows and one column per
            model output.
    Raises:
        ValueError: if no key of ``snp_comb_seq`` marks a reference sequence.
    """
    def is_reference(name):
        # One predicate for both the reference and the alternate selection, so
        # a key can never be counted as both.
        return 'ref' in name.lower()

    def predict(sequences):
        # Encode the sequences and run them through the model batch by batch.
        encoded = encodeSeqs(sequences, inputsize=inputsize).astype(np.float32)
        step = int(batchSize)
        outputs = []
        for start in range(0, encoded.shape[0], step):
            batch = torch.from_numpy(encoded[start:start + step,:,:]).unsqueeze(2)
            if args_cuda:
                batch = batch.cuda()
            outputs.append(model.forward(batch).cpu().detach().numpy().copy())
        return np.vstack(outputs)

    def log2_odds(preds):
        return np.log2(preds/(1-preds))

    refseqs = [seq for key, seq in snp_comb_seq.items() if is_reference(key)]
    if not refseqs:
        raise ValueError("snp_comb_seq needs at least one key containing 'Ref' "
                         "to provide the reference sequence")
    # The reference log-odds do not depend on the alternate, so compute them once.
    ref_log_odds = log2_odds(predict(refseqs))

    comb_diff_pred = {}
    for comb_seq, sequence in snp_comb_seq.items():
        if is_reference(comb_seq):
            continue
        comb_diff_pred[comb_seq] = ref_log_odds - log2_odds(predict([sequence]))

    return comb_diff_pred

In [13]:
# Build the network and load the pretrained weights.
# map_location='cpu' lets the checkpoint load on a machine without a GPU even
# if its tensors were saved from CUDA devices (the notebook runs with
# args_cuda = False).
model = Beluga()
model.load_state_dict(torch.load('deepsea.beluga.pth', map_location='cpu'))
# Switch to evaluation mode (dropout layers become inactive).
model.eval()
# Uncomment to move the model to the GPU.
#model.cuda()


Beluga(
  (model): Sequential(
    (0): Sequential(
      (0): Conv2d(4, 320, kernel_size=(1, 8), stride=(1, 1))
      (1): ReLU()
      (2): Conv2d(320, 320, kernel_size=(1, 8), stride=(1, 1))
      (3): ReLU()
      (4): Dropout(p=0.2, inplace=False)
      (5): MaxPool2d(kernel_size=(1, 4), stride=(1, 4), padding=0, dilation=1, ceil_mode=False)
      (6): Conv2d(320, 480, kernel_size=(1, 8), stride=(1, 1))
      (7): ReLU()
      (8): Conv2d(480, 480, kernel_size=(1, 8), stride=(1, 1))
      (9): ReLU()
      (10): Dropout(p=0.2, inplace=False)
      (11): MaxPool2d(kernel_size=(1, 4), stride=(1, 4), padding=0, dilation=1, ceil_mode=False)
      (12): Conv2d(480, 640, kernel_size=(1, 8), stride=(1, 1))
      (13): ReLU()
      (14): Conv2d(640, 640, kernel_size=(1, 8), stride=(1, 1))
      (15): ReLU()
    )
    (1): Sequential(
      (0): Dropout(p=0.5, inplace=False)
      (1): Lambda()
      (2): Sequential(
        (0): Lambda()
        (1): Linear(in_features=67840, out_features

In [15]:
# Disabled draft of the "predict and save" step: the statements are wrapped in a
# string literal, so none of them execute. The same statements appear inside the
# per-SNP loop in the "Running chromatin prediction" section of this notebook.
"""
comb_diff_pred = get_predicted_diff(snp_comb_seq)
f = h5py.File(snp_test +'.diff.h5', 'w')
key_names = list(comb_diff_pred.keys())
for i in key_names:
    f.create_dataset(i, data=comb_diff_pred[i])
f.close()
"""

"\ncomb_diff_pred = get_predicted_diff(snp_comb_seq)\nf = h5py.File(snp_test +'.diff.h5', 'w')\nkey_names = list(comb_diff_pred.keys())\nfor i in key_names:\n    f.create_dataset(i, data=comb_diff_pred[i])\nf.close()\n"

## Running chromatin prediction for the top 10 SNPs and their combinations within the LD blocks

In [14]:
# IGAP table read from 'IGAP_Chr19_Hg38loc.csv'; it is the dataset handed to
# SNP.seq_combination in the prediction loop.
igap_19 = pd.read_csv('IGAP_Chr19_Hg38loc.csv')

# Markers that the prediction loop leaves out.
skip_snps = ['rs59007384', 'rs111789331']
# NOTE(review): apoe_snp is not referenced by any code visible in this notebook.
apoe_snp = ['rs429358']

igap_19

Unnamed: 0.1,Unnamed: 0,Chromosome,Position,MarkerName,Effect_allele,Non_Effect_allele,Beta,SE,Pvalue
0,6552873,19,252541,rs12460337,A,G,0.0098,0.0302,0.7444
1,6552874,19,253938,rs7247199,A,G,-0.0090,0.0216,0.6783
2,6552875,19,254304,rs3929173,A,G,-0.0099,0.0212,0.6414
3,6552876,19,254447,rs1975527,G,A,0.0207,0.0241,0.3904
4,6552877,19,254448,rs1975528,T,C,-0.0203,0.0235,0.3878
...,...,...,...,...,...,...,...,...,...
151294,6704167,19,58588397,rs149705456,G,A,0.0049,0.0184,0.7880
151295,6704168,19,58590603,rs111790549,A,G,-0.0244,0.0455,0.5915
151296,6704169,19,58591012,rs79349307,G,A,-0.0171,0.0227,0.4507
151297,6704170,19,190188248,rs71223699,G,A,-0.0502,0.0556,0.3668


In [15]:

# For every selected SNP that is not in skip_snps: build the variant sequences,
# predict the differences against the reference and store them in
# '<rsid>.diff.h5', one dataset per SNP combination.
for k in range(0,len(top_10_igap_snps)):
    # Columns are addressed by name rather than by position, so a change in
    # column order cannot silently swap rsid, position and chromosome.
    snp_test = top_10_igap_snps['MarkerName'].iloc[k]
    if snp_test in skip_snps:
        continue
    print("Running %s ...."%(snp_test))
    snp_obj = SNP(snp_test,
                  top_10_igap_snps['Position'].iloc[k],
                  top_10_igap_snps['Chromosome'].iloc[k])
    print("Obtaining combinations for %s ...."%(snp_test))
    snp_comb_seq = snp_obj.seq_combination(igap_19)
    print("Predicting the sequence profiles for %s ...."%(snp_test))
    comb_diff_pred = get_predicted_diff(snp_comb_seq)
    key_names = list(comb_diff_pred.keys())
    # The context manager closes the HDF5 file even if writing a dataset fails.
    with h5py.File(snp_test +'.diff.h5', 'w') as f:
        print("Saving the result for %s"%(snp_test))
        for i in key_names:
            f.create_dataset(i, data=comb_diff_pred[i])



Running rs184017 ....
Obtaining combinations for rs184017 ....
Predicting the sequence profiles for rs184017 ....
Saving the result for rs184017
Running rs283811 ....
Obtaining combinations for rs283811 ....
Predicting the sequence profiles for rs283811 ....
Saving the result for rs283811
Running rs34342646 ....
Obtaining combinations for rs34342646 ....
Predicting the sequence profiles for rs34342646 ....
Saving the result for rs34342646
Running rs12972970 ....
Obtaining combinations for rs12972970 ....
Predicting the sequence profiles for rs12972970 ....
Saving the result for rs12972970
Running rs12972156 ....
Obtaining combinations for rs12972156 ....
Predicting the sequence profiles for rs12972156 ....
Saving the result for rs12972156
Running rs10414043 ....
Obtaining combinations for rs10414043 ....
Predicting the sequence profiles for rs10414043 ....
Saving the result for rs10414043
Running rs10119 ....
Obtaining combinations for rs10119 ....
Predicting the sequence profiles for 