# Hashmap

In [1]:
import re
import numpy as np
class HashTable:

	# Create empty bucket list of given size
	def __init__(self, size):
		self.size = size
		self.hash_table = self.create_buckets()

	def create_buckets(self):
		return [[] for _ in range(self.size)]

	# Insert values into hash map
	def set_val(self, key, val):
		
		# Get the index from the key
		# using hash function
		hashed_key = hash(key) % self.size
		
		# Get the bucket corresponding to index
		bucket = self.hash_table[hashed_key]

		found_key = False
		for index, record in enumerate(bucket):
			record_key, record_val = record
			
			# check if the bucket has same key as
			# the key to be inserted
			if record_key == key:
				found_key = True
				break

		# If the bucket has same key as the key to be inserted,
		# Update the key value
		# Otherwise append the new key-value pair to the bucket
		if found_key:
			bucket[index] = (key, val)
		else:
			bucket.append((key, val))

	# Return searched value with specific key
	def get_val(self, key):
		
		# Get the index from the key using
		# hash function
		hashed_key = hash(key) % self.size
		
		# Get the bucket corresponding to index
		bucket = self.hash_table[hashed_key]

		found_key = False
		for index, record in enumerate(bucket):
			record_key, record_val = record
			
			# check if the bucket has same key as
			# the key being searched
			if record_key == key:
				found_key = True
				break

		# If the bucket has same key as the key being searched,
		# Return the value found
		# Otherwise indicate there was no record found
		if found_key:
			return record_val
		else:
			return "No"

	# Remove a value with specific key
	def delete_val(self, key):
		
		# Get the index from the key using
		# hash function
		hashed_key = hash(key) % self.size
		
		# Get the bucket corresponding to index
		bucket = self.hash_table[hashed_key]

		found_key = False
		for index, record in enumerate(bucket):
			record_key, record_val = record
			
			# check if the bucket has same key as
			# the key to be deleted
			if record_key == key:
				found_key = True
				break
		if found_key:
			bucket.pop(index)
		return

	# To print the items of hash map
	def __str__(self):
		return "".join(str(item) for item in self.hash_table)


In [2]:
#read list from file baseMatrix.txt
#calculate probability for each element in the list
file=open("baseMatrix.txt","r")
line=file.readline()
baseMatrix=[]
while(line):
    line=line.strip()
    line=line.split()
    baseMatrix.append(line)
    line=file.readline()
file.close()
#convert baseMatrix to float
baseMatrix=[[float(i) for i in j] for j in baseMatrix]
#make baseMatrix a numpy array
baseMatrix=np.array(baseMatrix)
# print(baseMatrix)

# get Insertion counts
file=open("insertion.txt","r")
line=file.readline()
line=line.split()
Ilen=float(line[0])
Icount=float(line[1])
line=file.readline()
insertionCount=line
insertionCount=insertionCount.split()
insertionCount=list(map(float,insertionCount))
insertionCount=np.array(insertionCount)
file.close()

# get deletion counts
file=open("deletion.txt","r")
line=file.readline()
line=line.split()
Dlen=float(line[0])
Dcount=float(line[1])
line=file.readline()
deletionCount=line
deletionCount=deletionCount.split()
deletionCount=list(map(float,deletionCount))
deletionCount=np.array(deletionCount)
file.close()

# get total parsed base count
file=open("parselog.txt","r")
line=file.readline()
Mlen=float(line)

print("Ilen:",Ilen)
print("Icount:",Icount)
print("Dlen:",Dlen)
print("Dcount:",Dcount)
print("Mlen:",Mlen)
InsertionProb=Ilen/(Icount*Mlen)
DeletionProb=Dlen/(Mlen*Dcount)

Ilen: 34190402.0
Icount: 20681987.0
Dlen: 152584408.0
Dcount: 13434376.0
Mlen: 360274608.0


In [3]:
#log10(baseMatrix)
# baseMatrix=10*baseMatrix
baseMatrixLog=np.log10(baseMatrix)

In [4]:
#read list from contigs.txt
fasta_file = "E:\\Studies\\4-1\\CSE 400\\drive\\outputs\\canu\\real_SAMN10819805_pacbio_00_canu_v1.9.fasta"
hash_table_contig=HashTable(100)
contigFile=open(fasta_file,"r")
line=contigFile.readline()
contig=""
contigsCount=0
contigName=""
contigLengthTotal=0
while line:
    if line[0]=='>':
        if contigsCount==0:
            contigName=re.split("\t| ",line)[0][1:].strip()
            contigsCount+=1
            line=contigFile.readline()

        else:
            hash_table_contig.set_val(contigName,contig)
            contigName=re.split("\t| ",line)[0][1:].strip()
            contigsCount+=1
            contigLengthTotal+=len(contig)
            contig=""
            line=contigFile.readline()
        continue
    line=line.strip()
    contig += line
    line=contigFile.readline()
contigFile.close()
hash_table_contig.set_val(contigName,contig)

In [5]:
hash_table = HashTable(50000)
posProbCount=0
mappedFile=open("mappedread.txt","r")
line=mappedFile.readline()
while line:
    line=line.strip()
    line=re.split("\t",line)
    readName=line[0]

    flagg=line[1]

    strand=(int(flagg)&16)>>4
    contigName=line[2]
    contigs1D=hash_table_contig.get_val(contigName)
    if contigs1D=="No":
        print("No contig found")
        break
    position=int(line[3])-1
    ciger=line[4]
    cigerValues=re.split("S|M|I|D|H",ciger)
    cigerPosition=0

    rawRead=line[5]
    rawRead=re.split("\n",rawRead)
    rawRead=rawRead[0]

    baseMatrixI=0
    baseMatrixJ=0

    readIterator=0
    contigPos=position
    probability=np.log10(len(rawRead)/contigLengthTotal)
    if(strand==1):
        rawRead=rawRead[::-1]
        rawRead=list(rawRead) 
        for i in range(len(rawRead)):
            if(rawRead[i]=='A'):
                rawRead[i]='T'
            elif rawRead[i]=='T':
                rawRead[i]='A'
            elif rawRead[i]=='G':
                rawRead[i]= 'C'
            elif rawRead[i]=='C':
                rawRead[i]= 'G'
        rawRead="".join(rawRead)
    for i in range(len(cigerValues)-1):
        value=int(cigerValues[i])
        cigerPosition+=len(cigerValues[i])
        cigerOperation=ciger[cigerPosition]
        cigerPosition+=1

        if cigerOperation=="M":
            for it in range(value):
                
                fr=rawRead[readIterator]
                to=contigs1D[contigPos]
                if fr=="A":
                    baseMatrixI=0
                elif fr=="T":
                    baseMatrixI=1
                elif fr=="G":
                    baseMatrixI=2
                elif fr=="C":
                    baseMatrixI=3

                if to=="A":
                    baseMatrixJ=0
                elif to=="T":
                    baseMatrixJ=1
                elif to=="G":
                    baseMatrixJ=2
                elif to=="C":
                    baseMatrixJ=3

                probability+=baseMatrixLog[baseMatrixI][baseMatrixJ]
                readIterator+=1
                contigPos+=1
        elif cigerOperation=="I":
            readIterator+=value
            probability+=np.log10(insertionCount[value]*InsertionProb)

        elif cigerOperation=="D":
            contigPos+=value
            probability+=np.log10(deletionCount[value]*DeletionProb)
        elif cigerOperation=="S":
            readIterator+=value
            probability+=np.log10(deletionCount[value]*DeletionProb)
        elif cigerOperation=="H":
            readIterator+=value
            probability+=np.log10(deletionCount[value]*DeletionProb)
    prev=hash_table.get_val(readName)
    hash_table.set_val(readName,prev +'\t'+ str(probability))
    line=mappedFile.readline()
mappedFile.close()

In [6]:
hash_table2=HashTable(50000)
mappedFile=open("mappedread.txt","r")
line=mappedFile.readline()
probStr=[]
while line:
    line=line.strip()
    line=re.split("\t",line)
    readName=line[0]
    if(hash_table2.get_val(readName)=="No"):
        hash_table2.set_val(readName,1)
        probStr.append(hash_table.get_val(readName).split('\t'))
    line=mappedFile.readline()
mappedFile.close()

In [7]:
for i in range(len(probStr[0])):
    print(i , ":", probStr[0][i])

0 : No
1 : -409.60040659080613
2 : -217.0169672291557


In [8]:

newArrayWithDuplicates=[]
arrayOfSingleProbability=[]
for i in range(len(probStr)):
    if(len(probStr[i]) > 2):
        newArrayWithDuplicates.append(probStr[i][1:])
    else:
        arrayOfSingleProbability.append(float(probStr[i][1]))
print(len(newArrayWithDuplicates))
print(newArrayWithDuplicates[0])
print(len(arrayOfSingleProbability))
print(arrayOfSingleProbability[0])

#write newArrayWithDuplicates to file
file=open("log10multipleProbForSameRead.txt","w")
for i in range(len(newArrayWithDuplicates)):
    for j in range(len(newArrayWithDuplicates[i])):
        file.write(newArrayWithDuplicates[i][j])
        file.write("\t")
    file.write("\n")
file.close()
file=open("log10singleProb.txt","w")
file.write(str(sum(arrayOfSingleProbability)))
file.close()

6663
['-409.60040659080613', '-217.0169672291557']
44597
-1375.529479704059
