In [17]:
from aminoacid import AminoAcid
from sequence import Sequence, loadFasta
from align import Align, Aligned
from math import sqrt, log

A protein domain is a part of a protein that has its own structure and function, independent of the rest of the protein. One such domain is the WW domain, which is found in many species and sometimes occurs several times within a single protein. Although a domain plays a similar role in every protein that contains it, its amino acid sequence is not necessarily the same from one protein to the next. The different instances nevertheless share enough amino acids to be identified and located by means of sequence alignment. Our goal in this project is to locate a given domain within a given sequence, if it occurs there one or more times.

In [18]:
# Background amino-acid frequencies over the complete UniProt database,
# taken from the "AMINO ACID COMPOSITION" section of
# http://web.expasy.org/docs/relnotes/relstat.html
# Keys are AminoAcid objects, values are relative frequencies.
uniprob = {
	AminoAcid(code): frequency
	for code, frequency in (
		("Ala", 0.0826),
		("Gln", 0.0393),
		("Leu", 0.0965),
		("Ser", 0.0660),
		("Arg", 0.0553),
		("Glu", 0.0674),
		("Lys", 0.0582),
		("Thr", 0.0535),
		("Asn", 0.0406),
		("Gly", 0.0708),
		("Met", 0.0241),
		("Trp", 0.0109),
		("Asp", 0.0546),
		("His", 0.0227),
		("Phe", 0.0386),
		("Tyr", 0.0292),
		("Cys", 0.0137),
		("Ile", 0.0593),
		("Pro", 0.0472),
		("Val", 0.0687),
	)
}


class PSSM:
	"""
	Position Specific Score Matrix.

	Creates a profile for a series of aligned sequences of equal length, and
	gives a score to each amino acid substitution in a given column.

	Attributes:
		description    -- free-text label given at construction
		seqCount       -- number of sequences added so far
		size           -- common length of the added sequences (None until the first add)
		aaDistribution -- one dict per column mapping amino acid -> occurrence count
		aaCount        -- per column, number of non-gap residues seen
		gapPenalties   -- size + 1 gap penalties, all 0 until setGapPenalty is called
	"""
	def __init__(self, description=""):
		self.description = description
		self.seqCount = 0 #total number of sequences
		self.size = None #all sequences have the same size
		self.aaDistribution = None #amino acid distribution
		self.aaCount = None
		self.gapPenalties = None


	def add(self, sequence):
		"""
		Adds one aligned sequence to the profile.

		The first sequence added fixes the profile size; every later sequence
		must have the same length (AssertionError otherwise). Items of the
		sequence must provide isGap() and be hashable. Gaps are not counted.
		"""
		#the first sequence determines the size and allocates the per-column tables
		if self.size is None:
			self.size = len(sequence)
			self.aaDistribution = [{} for _ in range(self.size)]
			self.aaCount = [0] * self.size
			self.gapPenalties = [0] * (self.size + 1)

		assert len(sequence) == self.size, "all sequences added to a PSSM must have the same length"

		#update amino acid count for each column
		for index in range(self.size):
			residue = sequence[index]
			if not residue.isGap():
				self.aaCount[index] += 1
				column = self.aaDistribution[index]
				#dict.get replaces the former bare except, which hid unrelated errors
				column[residue] = column.get(residue, 0) + 1

		#increase sequence count
		self.seqCount += 1

	def getDescription(self):
		"""Returns the description given at construction."""
		return self.description

	def getScore(self, aminoAcid, columnIndex):
		"""
		Returns the log-odds score log(q / p) of aminoAcid in column columnIndex.

		p is the background frequency from uniprob (0.001 when the amino acid
		is not listed there); q mixes the observed column frequency f with p
		using pseudocount weights alpha and beta:
			q = (alpha * f + beta * p) / (alpha + beta)
		"""
		#pseudocounts; alpha is clamped at 0 so that a column holding only gaps
		#falls back to the background distribution (score 0) instead of getting
		#a negative weight (or a division by zero when seqCount == 1)
		alpha = max(self.aaCount[columnIndex] - 1, 0)
		beta = sqrt(self.seqCount)

		#random probability of amino acid
		p_aa = uniprob.get(aminoAcid, 0.001)

		#evolutionary probability of amino acid (0 when never seen in this column)
		f_aa = self.aaDistribution[columnIndex].get(aminoAcid, 0) / self.seqCount

		q_aa = (alpha * f_aa + beta * p_aa) / (alpha + beta)

		return log(q_aa / p_aa)


	def getGapPenalty(self, columnIndex):
		"""Returns the gap penalty stored at columnIndex."""
		return self.gapPenalties[columnIndex]


	def setGapPenalty(self, penalty, columnIndex=None):
		"""
		Sets the gap penalty at columnIndex, or at every position when
		columnIndex is None. Requires at least one sequence to have been added.
		"""
		if columnIndex is None:
			#gapPenalties holds size + 1 entries; iterate over the list itself
			#so the last entry is no longer left at 0
			for i in range(len(self.gapPenalties)):
				self.gapPenalties[i] = penalty
		else:
			self.gapPenalties[columnIndex] = penalty

	def __len__(self):
		"""Returns the number of columns (0 while no sequence has been added)."""
		return 0 if self.size is None else self.size

	def __repr__(self):
		"""
		Returns one line per column listing "aminoAcid: count(score), " for
		every amino acid observed in that column.
		"""
		#__repr__ must return a string; the text is built here instead of printed
		lines = []
		for i in range(len(self)):
			lines.append("".join(
				"{}: {}({}), ".format(key, count, self.getScore(key, i))
				for key, count in self.aaDistribution[i].items()
			))
		return "\n".join(lines)

**Some explanations**

In [19]:
# Build the profile for the WW domain: every sequence read from the FASTA file
# (presumably a MUSCLE multiple sequence alignment, going by the file name) is
# added to the PSSM, so all of them must have the same aligned length.
pssm = PSSM("WW domain")
for seq in loadFasta(r"msaresults-MUSCLE.fasta"):
	pssm.add(seq)
# No column index is passed, so the penalty of 4 is applied profile-wide
# instead of to a single column.
pssm.setGapPenalty(4)

# Aligner working against the profile built above.
al = Align(pssm)

# Align each test sequence against the profile and print every alignment that
# multiAlign yields (judging by the output below: the best local alignment
# first, followed by suboptimal ones).
for toalign in loadFasta(r"test.fasta"):
	for aligned in al.multiAlign(toalign):
		print(aligned)

---------- Multi-Seq. Alignment ----------
Size       : 59
Type       : local
Score      : 25.92
Gaps       : 28

PSSM : WW domain
Aligned seq. : sp|D6C652|YAP1A_XENLA Transcriptional coactivator YAP1-A OS=Xenopus laevis GN=yap1-a PE=1 SV=1
	28 Gaps, 31 AAs (positions 142 to 173)

142
-LPPGWEMAKT-PS-GQR-YFLN------------------------HIDQTTTWQDPR


---------- Multi-Seq. Alignment ----------
Size       : 60
Type       : local-suboptimal(1)
Score      : 23.73
Gaps       : 28

PSSM : WW domain
Aligned seq. : sp|D6C652|YAP1A_XENLA Transcriptional coactivator YAP1-A OS=Xenopus laevis GN=yap1-a PE=1 SV=1
	28 Gaps, 32 AAs (positions 200 to 232)

200
-LPDGWEQALTPEGEA---YFIN------------------------HKNKSTSWLDPRL


---------- Multi-Seq. Alignment ----------
Size       : 5
Type       : local-suboptimal(2)
Score      : 11.06
Gaps       : 1

PSSM : WW domain
Aligned seq. : sp|D6C652|YAP1A_XENLA Transcriptional coactivator YAP1-A OS=Xenopus laevis GN=yap1-a PE=1 SV=1
	1 Gaps, 4 AAs (positions 254 to 258