# MUTATION-DETECTION:

In [1]:
from Bio import SeqIO

normal_samples = SeqIO.parse("CERVICALDNA.fasta", "fasta") 
affected_samples = SeqIO.parse("CERVICALMutated.fasta", "fasta")

for normal, affected in zip(normal_samples, affected_samples): 
    if normal.id == affected.id: 
        mutations = sum(1 for n, t in zip(str(normal.seq), str(affected.seq)) if n != t) 
        print(f"Found {mutations} mutation(s) for id {normal.id}")

Found 233 mutation(s) for id AAK97314.1


# OPTIONAL:--

# 1.SEQUENTIAL PATTERN MINING

In [2]:
def createCandidateSet(data):
    cand = []
    for row in data:
        for itm in row:
            if [itm] not in cand:
                cand.append([itm])
    cand.sort()
    return list(map(frozenset,cand))

def scanData(data, candidateSet, minSupport):
    subsetCount = {}
    for curSet in data:
        for cand in candidateSet:
            if cand.issubset(curSet):
                if not cand in subsetCount:
                    subsetCount[cand] = 1
                else:
                    subsetCount[cand] += 1
    n = float(len(data))
    valid = []
    for key in subsetCount:
        sup = subsetCount[key]
        if sup >= minSupport:
            valid.insert(0,key)
    return valid, subsetCount

def genSequential(freqSets, k):
    valid = []
    nFreqSets = len(freqSets)
    for i in range(nFreqSets):
        for j in range(i+1, nFreqSets):
            lstCands1 = list(freqSets[i])[:k-2]
            lstCands2 = list(freqSets[j])[:k-2]
            lstCands1.sort()
            lstCands2.sort()
            # if first k-2 elements are equal
            if lstCands1 == lstCands2:
                valid.append(freqSets[i]|freqSets[j]) # union 
    return valid

def Seqtial(data, minSupport):
    candSet = createCandidateSet(data)
    setData = list(map(set,data))
    lstCands, subsetCounts = scanData(setData,candSet,minSupport)
    lstCands = [lstCands]
    k = 2
    while(len(lstCands[k-2]) > 0):
        candSetX = genSequential(lstCands[k-2],k)
        lstCandsX, subsetCountsX = scanData(setData,candSetX, minSupport)
        subsetCounts.update(subsetCountsX)
        lstCands.append(candSetX)
        k += 1
    return lstCands, subsetCounts

# read in data
data = ['ATGC','ACGT','ATCG','TTTT','AGTCG']
dataSetFilename = 'CERVICALDNA.fasta'
with open(dataSetFilename,'r') as file:
    for line in file:
        data.append(line.strip().split(','))


print("What min. support do you want to use? ")
minSupp = input()
minSupp = int(minSupp)

print("\n**** Sequence with minSupport = {} ****".format(minSupp))

# call SPM
sets, counts = Seqtial(data,minSupp)
print("\nSets:\n")
for x in sets:
    for y in x:
        print(y)

print("\nCounts:\n")
for k,v in counts.items():
	print(k, v)

What min. support do you want to use? 
3

**** Sequence with minSupport = 3 ****

Sets:

frozenset({'T'})
frozenset({'G'})
frozenset({'C'})
frozenset({'A'})
frozenset({'T', 'G'})
frozenset({'C', 'T'})
frozenset({'A', 'T'})
frozenset({'C', 'G'})
frozenset({'A', 'G'})
frozenset({'C', 'A'})
frozenset({'C', 'T', 'G'})
frozenset({'C', 'T', 'A'})
frozenset({'A', 'T', 'G'})
frozenset({'C', 'G', 'A'})
frozenset({'C', 'A', 'G', 'T'})

Counts:

frozenset({'A'}) 4
frozenset({'C'}) 4
frozenset({'G'}) 4
frozenset({'T'}) 5
frozenset({'>AAK97314.1 cervical cancer proto-oncogene 2 [Homo sapiens]'}) 1
frozenset({'ATGCAGGCGGTGCGCAACGCGGGCAGCCGCTTTCTGCGCAGCTGGACCTGGCCGCAGACC'}) 1
frozenset({'GCGGGCGTGGTGGCGCGCACCCCGGCGGGCACCATTTGCACCGGCGCGCGCCAGCTGCAG'}) 1
frozenset({'GATGCGGCGGCGAAACAGAAAGTGGAACAGAACGCGGCGCCGAGCCATACCAAATTTAGC'}) 1
frozenset({'ATTTATCCGCCGATTCCGGGCGAAGAAAGCAGCCTGCGCTGGGCGGGCAAAAAATTTGAA'}) 1
frozenset({'GAAATTCCGATTGCGCATATTAAAGCGAGCCATAACAACACCCAGATTCAGGTGGTGAGC'}) 1
frozenset({'GCGAGC

# 2.DNA TO PROTEIN TRANSLATION

In [3]:
#Translates an DNA sequence from a fasta file to an amino acid sequence.
from Bio.Seq import Seq
from Bio import SeqIO

def pad_seq(sequence):
    """ Pad sequence to multiple of 3 with N """

    remainder = len(sequence) % 3

    return sequence if remainder == 0 else sequence + Seq('N' * (3 - remainder))

seq_records = SeqIO.parse('CERVICALDNA.fasta', 'fasta')


amino_acids1 = []
amino_acids2 = []
amino_acids3 = []

for record in seq_records:

    # starting from nucleotide 1
    amino_acids1.append(pad_seq(record).translate())
    print("FIRST")
    print(amino_acids1)
    # ...

    # starting from nucleotide 2
    record2 = record[1:]
    amino_acids2.append(pad_seq(record2).translate())
    print("SECOND")
    print(amino_acids2)
    # ...

    # starting from nucleotide 3
    record3 = record[2:]
    amino_acids3.append(pad_seq(record3).translate())
    print("THIRD")
    print(amino_acids3)
    # ...
    
SeqIO.write(seq_records, "DNA2Protein.fasta", "fasta")

FIRST
[SeqRecord(seq=Seq('MQAVRNAGSRFLRSWTWPQTAGVVARTPAGTICTGARQLQDAAAKQKVEQNAAP...RKL', ExtendedIUPACProtein()), id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[])]
SECOND
[SeqRecord(seq=Seq('CRRCATRAAAFCAAGPGRRPRAWWRAPRRAPFAPARASCRMRRRNRKWNRTRRR...ANX', HasStopCodon(ExtendedIUPACProtein(), '*')), id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[])]
THIRD
[SeqRecord(seq=Seq('AGGAQRGQPLSAQLDLAADRGRGGAHPGGHHLHRRAPAAGCGGETESGTERGAE...QTX', HasStopCodon(ExtendedIUPACProtein(), '*')), id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[])]


0