In [1]:
# import module SeqIO from library Bio.
from Bio import SeqIO
# Name of the FASTA record whose sequence is analysed in the rest of the notebook.
target_name = "CM000664.2"
utf8_sequence = None
# Open the file explicitly so the handle is closed even though parsing stops
# as soon as the wanted record has been read.
with open("GCA_000001405.28_GRCh38.p13_genomic.fna") as fasta_handle:
    human_genome = SeqIO.parse(fasta_handle, "fasta")
    for chromosome in human_genome:
        if chromosome.name == target_name:
            # utf8_sequence is a byte string of lower-cased letters encoded by utf8.
            utf8_sequence = str(chromosome.seq).lower().encode('utf8')
            # The record was found: do not parse the remaining records of the file.
            break
# Fail here, with a clear message, rather than with a NameError in a later cell.
if utf8_sequence is None:
    raise ValueError(f"record {target_name!r} was not found in the FASTA file")

In [3]:
# define the function count_contain_2N.
def count_contain_2N(sequence, k=15, max_n=2):
    """Print how many length-``k`` windows of ``sequence`` hold at most ``max_n`` Ns.

    Every window ``sequence[i:i + k]`` that lies completely inside the
    sequence is examined. An N is the lower-case letter ``n``: the integer
    110 when ``sequence`` is a bytes object, or the character ``"n"`` when it
    is a str.

    The N count is kept with a sliding window: moving the window one position
    to the right only removes the base that leaves and adds the base that
    enters, so each position is inspected a constant number of times instead
    of ``k`` times.

    Parameters
    ----------
    sequence : bytes or str
        Lower-cased nucleotide sequence.
    k : int, optional
        Window (k-mer) length. Defaults to 15.
    max_n : int, optional
        Largest number of Ns a window may contain and still be counted.
        Defaults to 2.

    Returns
    -------
    None
        The result is printed, not returned.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # Indexing bytes yields ints while indexing str yields 1-character strings,
    # so accept both spellings of the letter n.
    n_codes = (ord("n"), "n")

    # Number of complete windows; zero or negative when the sequence is shorter than k.
    window_total = len(sequence) - k + 1
    count = 0

    if window_total > 0:
        # Count the Ns of the first window directly.
        count_n = sum(1 for base in sequence[:k] if base in n_codes)
        if count_n <= max_n:
            count += 1

        # Slide the window: drop the base leaving on the left, add the one entering on the right.
        for start in range(1, window_total):
            if sequence[start - 1] in n_codes:
                count_n -= 1
            if sequence[start + k - 1] in n_codes:
                count_n += 1
            if count_n <= max_n:
                count += 1

    print(f"{count} subsequences do not contain more than {max_n}Ns.")


In [215]:
# Report how many 15-mers of the loaded sequence contain at most two Ns.
count_contain_2N(utf8_sequence)

240548031 subsequences do not contain more than 2Ns.


In [10]:
# Modulus applied after the reduced digest has been multiplied by a.
p = 2_549_536_629_329
# Modulus used to reduce the SHA-256 digest (2**48 - 1).
bits_48 = (1 << 48) - 1
# Bit mask applied to the final value (0x07ffffffff == 2**35 - 1), so every
# hash lies between 0 and this value inclusive.
scale = 0x07ffffffff
from hashlib import sha256


def get_ath_hash(a):
    """Return the hash function selected by the multiplier ``a``.

    The returned function takes a bytes object, interprets its SHA-256 digest
    as one big integer, reduces it modulo ``bits_48``, multiplies it by ``a``,
    reduces it modulo ``p`` and finally masks it with ``scale``.
    """
    def my_hash(subseq):
        # Same integer as int(hexdigest, 16): the digest read as a big-endian number.
        digest_value = int.from_bytes(sha256(subseq).digest(), "big")
        reduced = digest_value % bits_48
        return ((reduced * a) % p) & scale

    return my_hash

In [11]:
# Try the estimate with a single hash function: the one built with a = 1.
first_hash = get_ath_hash(1)
sample_subsequence = utf8_sequence[0:15]
# Start the running minimum at the hash of the first 15-mer of the sequence;
# the scan in the following cell lowers it whenever a smaller hash is found.
min_h_1 = first_hash(sample_subsequence)

In [51]:
# Hash every 15-mer of the sequence with first_hash and keep the smallest
# value seen so far in min_h_1.
for i in range(len(utf8_sequence) - 14):
    h = first_hash(utf8_sequence[i:i + 15])
    min_h_1 = min(min_h_1, h)

In [12]:
# Scale the minimum hash value into a number between 0 and 1 by dividing by
# 0x07ffffffff, the mask applied by the hash function.
scale_min_h_1 = min_h_1/0x07ffffffff
# Estimate the number of distinct 15-mers as 1/m - 1, where m is the scaled minimum hash.
# NOTE(review): the execution counts show this cell (In [12]) ran before the
# scan cell (In [51]), so the recorded output likely reflects only the hash of
# the first 15-mer -- re-run the cells in order to confirm.
distinct_num_15_mer_1 = round((1/scale_min_h_1) - 1)
print(f"The number of distinct 15-mers using 1 hash functions is {distinct_num_15_mer_1}.")

The number of distinct 15-mers using 1 hash functions is 2.


In [13]:
# Build the single hash function with multiplier a = 100 (one function, not one hundred).
hundredth_hash = get_ath_hash(100)
# Start the running minimum at the hash of the first 15-mer of the sequence;
# the scan in the following cell lowers it whenever a smaller hash is found.
sample_subsequence = utf8_sequence[0:15]
min_h_100 = hundredth_hash(sample_subsequence)

In [14]:
# Hash every 15-mer of the sequence with hundredth_hash and keep the smallest
# value seen so far in min_h_100.
for i in range(len(utf8_sequence) - 14):
    h = hundredth_hash(utf8_sequence[i:i + 15])
    min_h_100 = min(min_h_100, h)

In [15]:
# Scale the minimum hash value into a number between 0 and 1 by dividing by
# 0x07ffffffff, the mask applied by the hash function.
scale_min_h_100 = min_h_100/0x07ffffffff
# Estimate the number of distinct 15-mers as 1/m - 1, where m is the scaled minimum hash.
# Only the hash function with a = 100 is used here, although the message says "100 hash functions".
distinct_num_15_mer_100 = round((1/scale_min_h_100) - 1)
print(f"The number of distinct 15-mers using 100 hash functions is {distinct_num_15_mer_100}.")

The number of distinct 15-mers using 100 hash functions is 429496729.


In [5]:
# import library numpy.
import numpy as np

In [16]:
# define the function calculate_min_h(), which is used to return the median of the minimum hashes.
def calculate_min_h(sequence,a):
    """Return the median of the minimum 15-mer hashes over hash functions 1..a.

    For every multiplier from 1 to ``a`` the matching hash function is built
    with ``get_ath_hash`` and applied to every 15-mer of ``sequence``; the
    smallest value per function is collected and the median of these minima
    is returned via ``np.median``.
    """
    window_starts = range(len(sequence) - 14)
    minima = []
    for multiplier in range(1, a + 1):
        hash_fn = get_ath_hash(multiplier)
        # Hash of the leading slice: it is the result when the sequence holds
        # no complete 15-mer, and otherwise equals the hash of the first window.
        seed_value = hash_fn(sequence[0:15])
        smallest = min(
            (hash_fn(sequence[pos:pos + 15]) for pos in window_starts),
            default=seed_value,
        )
        minima.append(smallest)
    return np.median(minima)


In [17]:
def distinct_num_15_mer(sequence,a):
    """Print the estimated number of distinct 15-mers in ``sequence``.

    The median minimum hash over hash functions 1..a (from
    ``calculate_min_h``) is divided by 0x07ffffffff to scale it, and the
    estimate is ``round(1/m - 1)`` for that scaled value ``m``. A message is
    printed when ``a`` is 1 or larger; nothing is printed otherwise. The
    function returns None.
    """
    # Scale the median minimum hash by the hash mask value.
    normalised_min = calculate_min_h(sequence, a) / 0x07ffffffff
    estimate = round(1 / normalised_min - 1)

    if a == 1:
        message = f"The number of distinct subsequence of sequence with a in range of {a} is {estimate}."
    elif a > 1:
        message = f"The number of distinct subsequence of sequence with a in range of (1,...,{a}) is {estimate}."
    else:
        # Neither branch of the report applies: stay silent.
        return
    print(message)

In [19]:
# Estimate the number of distinct 15-mers of the loaded sequence with the single hash function a = 1.
distinct_num_15_mer(utf8_sequence,1)

The number of distinct subsequence of sequence with a in range of 1 is 66076419.


In [75]:
# Estimate the number of distinct 15-mers of the loaded sequence from the median over hash functions a = 1..10.
distinct_num_15_mer(utf8_sequence,10)

The number of distinct subsequence of sequence with a in range of (1,...,10) is 138827225.


In [21]:
# Estimate the number of distinct 15-mers of the loaded sequence from the median over hash functions a = 1..100.
distinct_num_15_mer(utf8_sequence,100)

The number of distinct subsequence of sequence with a in range of (1,...,100) is 201523391.


In [154]:
# testing code:
# import library random.
import random
# Seed the random module so the fake sequences built with random.choice are reproducible.
random.seed(1)
# define function random_fake_sequence(), which is used to randomly generate nucleotide sequence with defined length. 
def random_fake_sequence(length):
    """Return a random nucleotide sequence of ``length`` letters as UTF-8 bytes.

    Each letter is drawn with ``random.choice`` from ``a``, ``t``, ``c``,
    ``g`` and ``n``. The result is encoded to bytes so it can be passed to the
    hash functions.

    Parameters
    ----------
    length : int
        Number of letters to generate. Zero (or a negative value) gives ``b""``.

    Returns
    -------
    bytes
        The generated sequence.
    """
    nucleic_acid = ['a','t','c','g','n']
    # One random.choice call per letter, in order, so a given seed produces the
    # same sequence as building the list letter by letter. The letters are
    # joined once, after generation, instead of re-joining on every iteration.
    fake_sequence = "".join(random.choice(nucleic_acid) for _ in range(length))
    # encode fake sequence with utf8, so that it could use hash function.
    return fake_sequence.encode('utf8')

In [155]:
# define function split_into_subsequence(), which is used to split the fake sequence into 15-mers.
def split_into_subsequence(sequence, k=15):
    """Return every length-``k`` window of ``sequence`` as a list of strings.

    Parameters
    ----------
    sequence : bytes
        UTF-8 encoded sequence; it is decoded before being split.
    k : int, optional
        Window (k-mer) length. Defaults to 15.

    Returns
    -------
    list of str
        The windows in order of their start position. The list is empty when
        the sequence is shorter than ``k``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")
    text = sequence.decode('utf8')
    # Slice each window directly instead of collecting it one character at a time.
    return [text[start:start + k] for start in range(len(text) - k + 1)]

In [156]:
# Generate fake sequences that are 1000, 10000 and 100000 letters long, in that order.
fake_sequence_1000, fake_sequence_10000, fake_sequence_100000 = (
    random_fake_sequence(size) for size in (1000, 10000, 100000)
)
# Split each fake sequence into its list of 15-mers.
fake_subsequence_1000, fake_subsequence_10000, fake_subsequence_100000 = map(
    split_into_subsequence,
    (fake_sequence_1000, fake_sequence_10000, fake_sequence_100000),
)

In [157]:
# Estimate the number of distinct 15-mers in fake_sequence_1000 with 1, 10 and 100 hash functions.
for hash_count in (1, 10, 100):
    distinct_num_15_mer(fake_sequence_1000, hash_count)

The number of distinct subsequence of sequence with a in range of 1 is 7239.
The number of distinct subsequence of sequence with a in range of (1,...,10) is 2955.
The number of distinct subsequence of sequence with a in range of (1,...,100) is 1376.


In [158]:
# Estimate the number of distinct 15-mers in fake_sequence_10000 with 1, 10 and 100 hash functions.
for hash_count in (1, 10, 100):
    distinct_num_15_mer(fake_sequence_10000, hash_count)

The number of distinct subsequence of sequence with a in range of 1 is 43241.
The number of distinct subsequence of sequence with a in range of (1,...,10) is 28767.
The number of distinct subsequence of sequence with a in range of (1,...,100) is 15860.


In [159]:
# Estimate the number of distinct 15-mers in fake_sequence_100000 with 1, 10 and 100 hash functions.
for hash_count in (1, 10, 100):
    distinct_num_15_mer(fake_sequence_100000, hash_count)

The number of distinct subsequence of sequence with a in range of 1 is 798320.
The number of distinct subsequence of sequence with a in range of (1,...,10) is 80436.
The number of distinct subsequence of sequence with a in range of (1,...,100) is 139205.


In [132]:
# define function num_distinct_subsequence(), which is used to test the results of function distinct_num_15_mer().
def num_distinct_subsequence(fake_subsequence,length):
    """Print the exact number of distinct items in ``fake_subsequence``.

    ``fake_subsequence`` is a list of 15-mers; duplicates are removed by
    building a set and the size of that set is reported. ``length`` is only
    used in the printed message. The function returns None.
    """
    # A set keeps one copy of each 15-mer, so its size is the distinct count.
    distinct_total = len(frozenset(fake_subsequence))
    print(f"The actual number of distinct subsequence in fake subsequence({length} nucleotides) is {distinct_total}.")

In [160]:
# Show the exact number of distinct 15-mers in fake_sequence_1000, fake_sequence_10000 and fake_sequence_100000.
for kmer_list, nucleotide_total in (
    (fake_subsequence_1000, 1000),
    (fake_subsequence_10000, 10000),
    (fake_subsequence_100000, 100000),
):
    num_distinct_subsequence(kmer_list, nucleotide_total)

The actual number of distinct subsequence in fake subsequence(1000 nucleotides) is 986.
The actual number of distinct subsequence in fake subsequence(10000 nucleotides) is 9986.
The actual number of distinct subsequence in fake subsequence(100000 nucleotides) is 99986.


In [161]:
# Take the first 1000, 10000, 100000 and 1000000 letters of the loaded sequence.
short_sequence_1000 = utf8_sequence[:1000]
short_sequence_10000 = utf8_sequence[:10000]
short_sequence_100000 = utf8_sequence[:100000]
short_sequence_1000000 = utf8_sequence[:1000000]
# Split each prefix into its list of 15-mers.
(
    short_subsequence_1000,
    short_subsequence_10000,
    short_subsequence_100000,
    short_subsequence_1000000,
) = map(
    split_into_subsequence,
    (short_sequence_1000, short_sequence_10000, short_sequence_100000, short_sequence_1000000),
)

In [139]:
# Estimate the number of distinct 15-mers in short_sequence_1000 with 1, 10 and 100 hash functions.
for hash_count in (1, 10, 100):
    distinct_num_15_mer(short_sequence_1000, hash_count)

The number of distinct subsequence of sequence with a in range of 1 is 2.
The number of distinct subsequence of sequence with a in range of (1,...,10) is 2.
The number of distinct subsequence of sequence with a in range of (1,...,100) is 1.


In [140]:
# Estimate the number of distinct 15-mers in short_sequence_10000 with 1, 10 and 100 hash functions.
for hash_count in (1, 10, 100):
    distinct_num_15_mer(short_sequence_10000, hash_count)

The number of distinct subsequence of sequence with a in range of 1 is 2.
The number of distinct subsequence of sequence with a in range of (1,...,10) is 2.
The number of distinct subsequence of sequence with a in range of (1,...,100) is 1.


In [141]:
# Estimate the number of distinct 15-mers in short_sequence_100000 with 1, 10 and 100 hash functions.
for hash_count in (1, 10, 100):
    distinct_num_15_mer(short_sequence_100000, hash_count)

The number of distinct subsequence of sequence with a in range of 1 is 127919.
The number of distinct subsequence of sequence with a in range of (1,...,10) is 78723.
The number of distinct subsequence of sequence with a in range of (1,...,100) is 118780.


In [162]:
# Estimate the number of distinct 15-mers in short_sequence_1000000 with 1, 10 and 100 hash functions.
for hash_count in (1, 10, 100):
    distinct_num_15_mer(short_sequence_1000000, hash_count)

The number of distinct subsequence of sequence with a in range of 1 is 922557.
The number of distinct subsequence of sequence with a in range of (1,...,10) is 1179935.
The number of distinct subsequence of sequence with a in range of (1,...,100) is 1487498.


In [165]:
# Show the exact number of distinct 15-mers in the four prefixes of the loaded sequence.
# The printed message says "fake subsequence" because the reporting function is shared
# with the fake-sequence test; the inputs here are prefixes of the real sequence.
for kmer_list, nucleotide_total in (
    (short_subsequence_1000, 1000),
    (short_subsequence_10000, 10000),
    (short_subsequence_100000, 100000),
    (short_subsequence_1000000, 1000000),
):
    num_distinct_subsequence(kmer_list, nucleotide_total)

The actual number of distinct subsequence in fake subsequence(1000 nucleotides) is 1.
The actual number of distinct subsequence in fake subsequence(10000 nucleotides) is 1.
The actual number of distinct subsequence in fake subsequence(100000 nucleotides) is 87191.
The actual number of distinct subsequence in fake subsequence(1000000 nucleotides) is 931157.
