In [12]:
# Non-included packages
!pip install biopython



In [13]:
# Imports
import pandas as pd
import numpy as np
import re

#Tools
from sklearn.preprocessing import MinMaxScaler
from Bio import SeqIO

#Configuration
%matplotlib inline

In [None]:
# Check running in colab
# IN_COLAB is True when google.colab can be imported, False otherwise.
# `drive` is only bound when the import succeeds.
try:
  from google.colab import drive
  IN_COLAB = True
except ImportError:
  # Catch only the failed import; a bare `except:` would also swallow
  # unrelated errors such as KeyboardInterrupt.
  IN_COLAB = False

In [14]:
# Choose the data root: the current directory for a local run,
# the mounted Google Drive when running in Colab.
if not IN_COLAB:
  root_dir = "./"
else:
  # force_remount=True re-mounts the drive even if it is already mounted
  drive.mount('/content/drive', force_remount=True)
  root_dir = "/content/drive/My Drive/"

Mounted at /content/drive


In [15]:
# Load hotspots
# Only hotspots of up to 1,500 bp were kept; shorter ones were padded with N's.
# Every FASTA record is read eagerly into a list.
hotspots = [
    record
    for record in SeqIO.parse(
        root_dir + "Data/hotspots/fasta/" + "combined-max-1500-padded-REMOVED-BAD.fasta",
        "fasta",
    )
]

# Load the negative set.
# These random sequences were generated from all parts of the human genome
# with the exact length profile of the hotspots, i.e. hotspots and random
# sequences carry the same amount of padding (up to 1,500 bp).
nohotspots = [
    record
    for record in SeqIO.parse(
        root_dir + "Data/hotspots/fasta/" + "sample-max-1500-padded-REMOVED-BAD.fasta",
        "fasta",
    )
]

In [16]:
# Function to convert a sequence into overlapping k-mer words, default size = 5 (pentamer words)
def getKmers(sequence, size=5):
    """Return every overlapping window of length `size`, lower-cased.

    Parameters
    ----------
    sequence : sliceable sequence whose slices support ``.lower()``
        (e.g. a ``str``).
    size : int
        Window length; defaults to 5.

    Returns
    -------
    list
        The windows in left-to-right order, one per start position.
        Empty when the sequence is shorter than `size`.
    """
    n_windows = len(sequence) - size + 1
    kmers = []
    for start in range(n_windows):
        kmers.append(sequence[start:start + size].lower())
    return kmers

# Which poly's we will be looking for
interesting_polys = ['AAAAAAAAAAAA', 'TTTTTTTTTTTT', 'TGTGTGTGTGTG', 'GTGTGTGTGTGT', 'CACACACACACA', 'ACACACACACAC',
                     'ATATATATATAT', 'TATATATATATA', 'TTAAAAAAAAAA', 'TTTTTTTTTTAA', 'CTGTAATCCCAG', 'CCTGTAATCCCA',
                     'CTGGGATTACAG', 'TGGGATTACAGG', 'TGTAATCCCAGC', 'CCTCAGCCTCCC', 'GCTGGGATTACA', 'GGGAGGCTGAGG',
                     'CCTTTTTTTTTT', 'AAAAAAAAAAGG', 'AAAAAAAGAAAG', 'CTTTCTTTTTTT', 'TAAAAATAAAAA', 'TTTTTATTTTTA',
                     'CCAAAAAAAAAA', 'GCCTCAGCCTCC', 'TTTTTTTTTTGG', 'CTTTTTTTTTTG', 'CAAAAAAAAAAG', 'GGAGGCTGAGGC' ]

# Function to find different poly's in a hotspot and add them as features
def compute_polys(string):
    """Flag which motifs of `interesting_polys` occur in a sequence.

    Matching is case-insensitive: the motifs are stored in upper case, so the
    input is upper-cased before searching. This keeps the motif features
    consistent with the k-mer features, which also ignore case.

    Parameters
    ----------
    string : str
        The sequence text to search.

    Returns
    -------
    numpy.ndarray
        1-D float array with one entry per motif, in the order of
        `interesting_polys`: 1 if the motif occurs at least once, else 0.
        Only presence is recorded, not the number of occurrences.
    """
    # Normalise case once, outside the loop, so lower-case or mixed-case
    # sequence text is not silently reported as containing no motifs.
    haystack = string.upper()
    matched_polys = np.zeros(len(interesting_polys))
    for idx, poly in enumerate(interesting_polys):
        if poly in haystack:
            matched_polys[idx] = 1

    return matched_polys

def compute_hash(string):
    """Encode a lower-case nucleotide word as a base-4 integer.

    Each letter is a base-4 digit (a=0, c=1, g=2, t=3) and the first letter
    is the most significant digit, so a word of length L maps to a value in
    ``[0, 4**L - 1]``.

    Parameters
    ----------
    string : iterable of single-character strings
        The word to encode.

    Returns
    -------
    int
        The base-4 value, or -1 as soon as an 'n' is met. An empty word
        gives 0.

    Raises
    ------
    KeyError
        If a character other than a, c, g, t or n is met (upper-case letters
        included).
    """
    digit_of = {"a":0, "c":1, "g":2, "t":3}
    code = 0
    for base in string:
        if base == 'n':
            return -1
        # Horner form: shift the digits read so far one place left, then add the new one
        code = code * 4 + digit_of[base]
    return code

In [17]:
# Definitions on data
k = 5                            # Kmer size
nmers = 4 ** k                   # number of k-mer feature columns (one per base-4 code of length k)
npolys = len(interesting_polys)  # number of motif feature columns
n_seqs = len(hotspots)           # number of hotspot records (taken from the hotspot set only)
LEN_ITEM = 1500                  # length of a hotspot / no hotspot

In [19]:
def _vectorize_records(records):
    """Build the feature matrix for a list of sequence records.

    Each row holds `nmers` k-mer counts followed by `npolys` motif presence
    flags. The matrix is sized from `records` itself, so the two sets do not
    need to contain the same number of sequences.

    Parameters
    ----------
    records : list
        Records exposing a ``.seq`` attribute.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(records), nmers + npolys)``.
    """
    features = np.zeros((len(records), nmers + npolys))
    for row, seq_record in enumerate(records):
        for kmer in getKmers(seq_record.seq, size=k):
            hashv = compute_hash(kmer)
            # compute_hash returns -1 for k-mers containing 'n'; those are not counted
            if hashv > -1:
                features[row, hashv] += 1
        # Motif flags fill the columns after the k-mer counts
        features[row, nmers:] = compute_polys(str(seq_record.seq))
    return features


hotspots_vector = _vectorize_records(hotspots)
nohotspots_vector = _vectorize_records(nohotspots)

In [20]:
# Class labels, one per feature row: hotspots are labelled 0, non-hotspots 1
labels_hotspots = np.zeros(hotspots_vector.shape[0])
labels_nohotspots = np.ones(nohotspots_vector.shape[0])

In [21]:
# Stack hotspot rows on top of non-hotspot rows, then min-max scale the columns
merged_dataset = np.concatenate((hotspots_vector, nohotspots_vector), axis=0)
labels = np.concatenate((labels_hotspots, labels_nohotspots), axis=0)

scaler = MinMaxScaler()
# NOTE: this rebinds `hotspots` from the list of loaded records to the scaled
# feature matrix. Cells that read `hotspots` as records will not work if they
# are re-run after this one without reloading the data first.
hotspots = scaler.fit_transform(merged_dataset)

# Number of rows, then number of feature columns
print(hotspots.shape[0])
print(hotspots.shape[1])

77168
94


In [22]:
# Persist the scaled feature matrix and its labels next to the FASTA inputs
# (np.save appends the .npy extension to the given path)
np.save(file=root_dir + "Data/hotspots/fasta/hotspots-5k-1polys", arr=hotspots)
np.save(file=root_dir + "Data/hotspots/fasta/labels_hotspots-5k-1polys", arr=labels)