# Bioinformatika Lab. 1

In [66]:
# Import order is kept as-is on purpose: names imported after the star import
# take precedence over anything bio_utils happens to export.
from Bio import SeqIO
from Bio.Seq import Seq
from bio_utils import *
from Bio import Align
import numpy as np
import glob
import pandas as pd
from scipy.spatial import distance_matrix

In [67]:
# Codons that open and close an open reading frame (read by findOrfs).
start = ["ATG"]
stop = ["TAA", "TAG", "TGA"]
# Base -> complementary base lookup used by complement().
convert = {"C" : "G", "G" : "C", "T" : "A", "A" : "T"}
# Input directory, output directory, and the format name that is used both as
# the file extension to glob for and as the format passed to SeqIO.read.
data_path = "./data/"
output_path = "./out/"
file_format = "fasta"
# Output files: the translated proteins and the distance matrix.
proteins_out = output_path + "proteins.txt"
phylip_out = output_path + "phylip.phy"

# Filled by the loading cell with one sequence per input file.
sequences = []

In [68]:
# Load one sequence from every file in data_path with the configured extension.
# - The list is rebuilt here so that re-running this cell does not append the
#   same records a second time.
# - The paths are sorted because glob.glob returns them in arbitrary order and
#   the ORF numbering further down depends on the order of `sequences`.
sequences = []
for file in sorted(glob.glob(data_path + '*.' + file_format)):
    rec = SeqIO.read(file, file_format)
    sequences.append(rec.seq)

In [69]:
def complement(data):
    """Return the complement of every base in ``data`` as a list.

    Each element of ``data`` is looked up in the module-level ``convert``
    table, so a symbol that is not a key of that table raises ``KeyError``.
    The result keeps the input order and is a list of single characters,
    not a string.
    """
    return [convert[base] for base in data]

In [70]:
def reverse(data):
    """Return the items of ``data`` in reverse order, joined into one string.

    ``data`` may be a string or a list of strings (such as the list returned
    by ``complement``).
    """
    flipped = reversed(data)
    return "".join(flipped)

In [71]:
def findOrfs(dna, st, numb, revert=False, min_length=100):
    """Scan one reading frame of ``dna`` for open reading frames.

    The strand is walked codon by codon starting at offset ``st``. The first
    start codon seen opens a candidate; later start codons are ignored until
    the next stop codon in the same frame closes it. Scanning then continues
    behind that stop codon, so several ORFs can be found in one frame.

    Parameters:
        dna: the sequence to scan; it must support ``len`` and slicing.
        st: index of the first codon, i.e. the reading-frame offset.
        numb: number of ORFs that were already named; the ORFs found here are
            named ``orf<numb+1>``, ``orf<numb+2>``, ...
        revert: when true, the reverse complement of ``dna`` is scanned and
            the frame label gets a ``-`` prefix. Start/stop positions then
            index into the reverse complement, not into ``dna``.
        min_length: an ORF is kept only if its length in bases (stop codon
            included) is strictly greater than this value.

    Returns:
        A list of ``Orf`` objects in the order in which they were found.
    """
    frame = ("-" if revert else "") + str(st)
    # NOTE(review): on the reverse strand `data` is a plain str, on the forward
    # strand it is whatever type `dna` is, so Orf values can differ in type.
    data = reverse(complement(dna)) if revert else dna
    orfs = []
    orfstart = -1  # -1 means "no candidate is currently open"
    for index in range(st, len(data), 3):
        codon = data[index:index+3]
        if codon in start:
            if orfstart == -1:
                orfstart = index
        if codon in stop:
            orfend = index + 3
            if orfend > orfstart and orfstart != -1:
                length = orfend - orfstart
                if length > min_length:
                    name = "orf" + str(len(orfs) + numb + 1)
                    orfs.append(Orf(orfstart, orfend, frame, data[orfstart:orfend], length, name))
                # The candidate is closed even when it was too short to keep.
                orfstart = -1
    return orfs

In [72]:
def getAllOrfs(dna, start=0):
    """Collect the ORFs of all six reading frames of ``dna``.

    The three forward frames are scanned first, then the three frames of the
    reverse complement. ``start`` is the number of ORFs that already exist
    elsewhere; it is only used to continue the running ORF numbering.

    Note that inside this function ``start`` is that integer and hides the
    module-level list of start codons of the same name.
    """
    found = []
    for on_reverse in (False, True):
        for offset in range(3):
            # Every call continues the numbering after the ORFs found so far.
            found.extend(findOrfs(dna, offset, start + len(found), on_reverse))
    return found

In [73]:
# Collect the ORFs of every loaded sequence. Passing len(orfs) as the start
# number keeps the "orfN" names unique and consecutive across sequences.
orfs = []
for seq in sequences:
    orfs.extend(getAllOrfs(seq, len(orfs)))
print("Number of orfs: {}".format(len(orfs)))
# One line per ORF, using whatever string form the Orf objects provide.
for o in orfs:
    print(o)

Number of orfs: 35
orf1 <Start: 1494, Stop: 1812, Length: 318, Frame: 0>
orf2 <Start: 2400, Stop: 2793, Length: 393, Frame: 0>
orf3 <Start: 2799, Stop: 2925, Length: 126, Frame: 0>
orf4 <Start: 3279, Stop: 3462, Length: 183, Frame: 0>
orf5 <Start: 4404, Stop: 4671, Length: 267, Frame: 0>
orf6 <Start: 4677, Stop: 4848, Length: 171, Frame: 0>
orf7 <Start: 4857, Stop: 5007, Length: 150, Frame: 0>
orf8 <Start: 391, Stop: 1264, Length: 873, Frame: 1>
orf9 <Start: 1684, Stop: 2644, Length: 960, Frame: 1>
orf10 <Start: 2863, Stop: 3088, Length: 225, Frame: 1>
orf11 <Start: 533, Stop: 962, Length: 429, Frame: 2>
orf12 <Start: 1946, Stop: 2117, Length: 171, Frame: 2>
orf13 <Start: 2261, Stop: 2372, Length: 111, Frame: 2>
orf14 <Start: 2990, Stop: 3200, Length: 210, Frame: 2>
orf15 <Start: 291, Stop: 393, Length: 102, Frame: -0>
orf16 <Start: 4092, Stop: 4305, Length: 213, Frame: -0>
orf17 <Start: 4962, Stop: 5112, Length: 150, Frame: -0>
orf18 <Start: 5169, Stop: 5280, Length: 111, Frame: -0>
o

In [74]:
# toRNAs is not defined in this notebook; presumably it comes from the
# bio_utils star import and transcribes each ORF to RNA -- TODO confirm.
rnas = toRNAs(orfs)

In [75]:
# toProteins and writeToFile are not defined in this notebook (presumably they
# come from the bio_utils star import -- TODO confirm). The result is written
# to proteins_out, which lives under output_path.
proteins = toProteins(rnas)
writeToFile(proteins_out, proteins)

## Alignment

In [76]:
# One aligner shared by getComparisons() and getScores().
# Opening a gap scores -10 and every extension of an open gap scores -0.5;
# all other scoring parameters are left at Biopython's defaults.
aligner = Align.PairwiseAligner()
aligner.open_gap_score = -10
aligner.extend_gap_score = -0.5

In [77]:
def getComparisons(orfs):
    """Align every unordered pair of ORFs with the shared ``aligner``.

    Each ORF is paired with every ORF that comes after it in ``orfs``, so a
    pair is aligned once and no ORF is aligned with itself.

    Returns a list of ``Comparison(first, second, alignments)`` objects, where
    ``alignments`` is the result of ``aligner.align`` on the two ORF values.
    """
    count = len(orfs)
    return [
        Comparison(orfs[first], orfs[second], aligner.align(orfs[first].value, orfs[second].value))
        for first in range(count)
        for second in range(first + 1, count)
    ]

In [78]:
def getScores(values):
    """Build the square matrix of pairwise alignment scores.

    Entry ``[i, j]`` is ``aligner.score(values[i], values[j])``. Every ordered
    pair is scored, including each value against itself.

    Returns a float numpy array of shape ``(len(values), len(values))``.
    """
    size = len(values)
    arr = np.zeros((size, size))
    for row, left in enumerate(values):
        for col, right in enumerate(values):
            arr[row, col] = aligner.score(left, right)
    return arr

In [79]:
#comparisons = getComparisons(orfs)
#comparisons.sort(key=lambda x: x.alignment.score, reverse=False)
#for c in comparisons:
#    print("{} and {} Score: {}".format(c.firstOrf.name, c.secondOrf.name, c.alignment.score))

In [80]:
# The DNA of every ORF, in the same order as `orfs`.
values = [orf.value for orf in orfs]

In [81]:
# Score every ORF against every ORF; this runs len(values)**2 alignments.
scores = getScores(values)

## Distances

In [82]:
# The alignment scores as a DataFrame: row i holds the scores of ORF i against
# every ORF, in the order of `orfs`. pandas and scipy's distance_matrix are
# imported in the import cell at the top of the notebook.
data = np.asarray(scores)
df = pd.DataFrame(data)

In [83]:
# Each row of the score matrix is treated as a point; distance_matrix gives
# the distance between every pair of rows (scipy's default metric settings).
distances = pd.DataFrame(distance_matrix(df.values, df.values))

In [84]:
# ORF names as a single column (one name per row), so that they can be placed
# next to the distance values.
rows = np.array([orf.name for orf in orfs])[:, np.newaxis]

In [85]:
# Write the distance matrix to phylip_out: the first line is the number of
# ORFs (comments='' stops numpy from prefixing that header line), then one
# line per ORF with its name followed by its distances to all ORFs.
np.savetxt(phylip_out, np.hstack((rows, distances.values)), fmt='%s', header=str(len(rows)),comments='')

## Trees

#### Tree generator tool: http://www.trex.uqam.ca/index.php?action=trex&menuD=1&method=2

### Plasmid tree

![Plasmid tree](./img/plasmid.svg)

### Plasmid and Test tree

![Plasmid and Test tree](./img/plasmidAndTest.svg)