In [2]:
import re
import numpy as np

class HashTable:
    """Fixed-size hash map that resolves collisions by chaining.

    Every bucket is a list of ``(key, value)`` tuples. A key is assigned to
    the bucket at index ``hash(key) % size``.
    """

    def __init__(self, size):
        """Build a table holding ``size`` empty buckets."""
        self.size = size
        self.hash_table = self.create_buckets()

    def create_buckets(self):
        """Return a new list of ``self.size`` independent empty buckets."""
        buckets = []
        for _ in range(self.size):
            buckets.append([])
        return buckets

    def set_val(self, key, val):
        """Store ``val`` under ``key``, overwriting any existing entry."""
        bucket = self.hash_table[hash(key) % self.size]

        # Overwrite in place when the key is already present in its bucket.
        for slot, (stored_key, _) in enumerate(bucket):
            if stored_key == key:
                bucket[slot] = (key, val)
                return

        # The key is new: chain it onto the end of the bucket.
        bucket.append((key, val))

    def get_val(self, key):
        """Return the value stored under ``key``.

        When the key is absent, the string ``"No record found"`` is returned
        instead of raising.
        """
        bucket = self.hash_table[hash(key) % self.size]

        for stored_key, stored_val in bucket:
            if stored_key == key:
                return stored_val

        return "No record found"

    def delete_val(self, key):
        """Remove the entry for ``key``; do nothing when it is absent."""
        bucket = self.hash_table[hash(key) % self.size]

        for slot, (stored_key, _) in enumerate(bucket):
            if stored_key == key:
                bucket.pop(slot)
                return
        return

    def __str__(self):
        """Concatenate the string form of every bucket, in bucket order."""
        rendered = [str(bucket) for bucket in self.hash_table]
        return "".join(rendered)

In [3]:
# Load every contig of the assembly FASTA into hash_table_contig, keyed by
# contig name. The name is the header text between '>' and the first tab or
# space; the sequence is the concatenation of the record's stripped lines.
#
# Results left in the namespace:
#   hash_table_contig  -- HashTable mapping contig name -> sequence string
#   contigsCount       -- number of header lines seen
#   contigLengthTotal  -- summed length of all stored contigs
#   contigName, contig -- name and sequence of the last record read
hash_table_contig = HashTable(100)
contig = ""
contigName = ""
contigsCount = 0
contigLengthTotal = 0
contigChunks = []  # sequence lines of the record currently being read

# The context manager closes the file even if parsing raises part-way through.
with open("../real_SAMN10819801_pacbio_00_canu_v1.9.fasta", "r") as contigFile:
    for line in contigFile:
        if line.startswith(">"):
            # A new header terminates the previous record, if there was one.
            if contigsCount > 0:
                contig = "".join(contigChunks)
                hash_table_contig.set_val(contigName, contig)
                contigLengthTotal += len(contig)
            contigName = re.split("\t| ", line)[0][1:].strip()
            contigsCount += 1
            # Resetting here also drops any text that preceded the first
            # header, so it cannot leak into the first contig.
            contigChunks = []
        else:
            # Joining the chunks once per record avoids repeated string
            # concatenation on very long contigs.
            contigChunks.append(line.strip())

# Flush the final record. Its length is now included in contigLengthTotal,
# and nothing is stored when the file contained no header at all.
if contigsCount > 0:
    contig = "".join(contigChunks)
    hash_table_contig.set_val(contigName, contig)
    contigLengthTotal += len(contig)

In [4]:
# Walk every alignment record and accumulate substitution / indel / clipping
# statistics of the reads against their contigs.
#
# Each input line is tab-separated; the columns read here are
#   [1] flag, [2] contig name, [3] 1-based start position, [4] CIGAR string,
#   [5] read bases.
# NOTE(review): this column order is not the standard SAM layout, so the file
# is presumably a pre-processed extract -- confirm against whatever writes it.
#
# Results left in the namespace:
#   baseMatrix          -- 4x4 counts; row = read base, column = contig base,
#                          both in the order A, T, G, C
#   ambiguousBaseCount  -- aligned positions where either base is not A/T/G/C
#   baseCountInRead     -- 4x4; row k holds the read-base count of A/T/G/C
#                          (every column of a row carries the same number)
#   insertionCount / deletionCount -- histograms indexed by event length
#   Mlen, Icount, Ilen, Dcount, Dlen, Scount, Hcount, totalReadLength,
#   totalReadChecked    -- running totals

BASE_INDEX = {"A": 0, "T": 1, "G": 2, "C": 3}
COMPLEMENT = str.maketrans("ATGC", "TACG")
CIGAR_TOKEN = re.compile(r"(\d+)([A-Z=])")  # (length, operation) pairs
MAX_INDEL_LENGTH = 50000  # histogram size; a longer event raises IndexError

totalReadChecked = 0
baseCountInRead = np.zeros((4, 4), dtype=int)
baseMatrix = np.zeros((4, 4), dtype=int)
insertionCount = np.zeros(MAX_INDEL_LENGTH, dtype=int)
deletionCount = np.zeros(MAX_INDEL_LENGTH, dtype=int)
ambiguousBaseCount = 0
Mlen = 0
Scount = 0
Hcount = 0
Icount = 0
Ilen = 0
Dcount = 0
Dlen = 0
totalReadLength = 0

# The context manager closes the file on normal exit, on break and on error.
with open("unmappedOut.sam", "r") as mappedFile:
    for line in mappedFile:
        line = line.strip()
        if not line:
            # A blank line has no columns to index; skip it.
            continue
        totalReadChecked += 1
        fields = line.split("\t")

        flagg = fields[1]
        contigName = fields[2].strip()
        contigs1D = hash_table_contig.get_val(contigName)
        if contigs1D == "No record found":
            # Stop at the first read whose contig is not in the assembly.
            print(contigName)
            print("No record found")
            break

        strand = (int(flagg) & 16) >> 4  # flag bit 0x10
        position = int(fields[3]) - 1    # convert to 0-based
        ciger = fields[4]
        rawRead = fields[5]

        # Base composition is counted on the read as stored in the file,
        # before any reverse-complementing.
        baseCountInRead[0] += rawRead.count('A')
        baseCountInRead[1] += rawRead.count('T')
        baseCountInRead[2] += rawRead.count('G')
        baseCountInRead[3] += rawRead.count('C')
        totalReadLength += len(rawRead)

        if strand == 1:
            # Reverse-complement; characters other than A/T/G/C are kept.
            rawRead = rawRead[::-1].translate(COMPLEMENT)

        readIterator = 0
        contigPos = position
        differentCigerOperation = []

        # A CIGAR of "*" yields no tokens, so such a read only contributes
        # to the per-read counters above.
        for lengthText, cigerOperation in CIGAR_TOKEN.findall(ciger):
            value = int(lengthText)

            if cigerOperation in ("M", "=", "X"):
                # Aligned columns: advance read and contig together.
                Mlen += value
                for _ in range(value):
                    baseMatrixI = BASE_INDEX.get(rawRead[readIterator])
                    baseMatrixJ = BASE_INDEX.get(contigs1D[contigPos])
                    if baseMatrixI is None or baseMatrixJ is None:
                        # baseMatrix only has rows/columns for A, T, G, C;
                        # anything else (e.g. N) is tallied separately
                        # instead of indexing out of bounds.
                        ambiguousBaseCount += 1
                    else:
                        baseMatrix[baseMatrixI][baseMatrixJ] += 1
                    readIterator += 1
                    contigPos += 1
            elif cigerOperation == "I":
                readIterator += value
                insertionCount[value] += 1
                Icount += 1
                Ilen += value
            elif cigerOperation == "D":
                contigPos += value
                deletionCount[value] += 1
                Dcount += 1
                Dlen += value
            elif cigerOperation == "N":
                # Reference skip: consumes contig positions only.
                contigPos += value
            elif cigerOperation == "S":
                # Clipped bases are not deletions, so only Scount is updated.
                readIterator += value
                Scount += value
            elif cigerOperation == "H":
                # NOTE(review): advancing the read over a hard clip is only
                # right if column [5] holds the full, unclipped read --
                # confirm against the file's producer.
                readIterator += value
                Hcount += value
            else:
                differentCigerOperation.append(cigerOperation)

In [5]:
# Substitution counts accumulated in the previous cell: rows index the read
# base and columns the contig base at each aligned position, both in the
# order A, T, G, C.
print(baseMatrix)

[[534  20  25  26]
 [ 21 644  15  23]
 [ 35  18 797  37]
 [120  24  21 747]]


In [5]:
import re
# Inspect the first record of the alignment file to check its column layout.
# Only one line is needed, so the file is closed straight after reading it.
with open("unmappedOut.sam", "r") as mappedFile:
    line = mappedFile.readline()

line = line.strip()
# Split on tab or space and report how many columns the record has.
line = re.split("\t| ", line)
print(len(line))
readName = line[0]

flagg = line[1]

contigName = line[2]
position = int(line[3])-1  # convert the 1-based start to 0-based
ciger = line[4]
# Numeric run lengths of the CIGAR; the trailing element is an empty string.
cigerValues = re.split("S|M|I|D|H", ciger)
cigerPosition = 0

rawRead = line[5]
rawRead = rawRead.strip()
rawReadLength = len(rawRead)


9
