# Bioinformatika Lab. 1

In [80]:
from Bio import SeqIO
from Bio.Seq import Seq
from bio_utils import *
from Bio import Align
import numpy as np
import glob

In [81]:
# Codons that open an ORF; findOrfs tests each in-frame triplet against this list.
start = ["ATG"]
# Codons that close an ORF.
stop = ["TAA", "TAG", "TGA"]
# Base-pairing table used by complement(); only upper-case A/C/G/T are mapped.
convert = {"C" : "G", "G" : "C", "T" : "A", "A" : "T"}
# Directory scanned for input sequence files, and directory for generated files.
# NOTE(review): nothing in this notebook creates output_path — confirm it exists before running.
data_path = "./data/"
output_path = "./out/"
# Format name passed to SeqIO.read; also used as the file extension when globbing.
file_format = "fasta"
proteins_out = output_path + "proteins.txt"
phylip_out = output_path + "phylip.phy"

# Sequences read from data_path; filled by the loading cell that follows.
sequences = []

In [82]:
# Load the single record of every matching file in data_path.
# The list is rebuilt here so that re-running this cell does not append
# duplicates, and the paths are sorted because glob returns them in an
# arbitrary, platform-dependent order (which would otherwise change the
# ORF numbering from one machine to the next).
sequences = []
for file in sorted(glob.glob(data_path + '*.' + file_format)):
    rec = SeqIO.read(file, file_format)
    sequences.append(rec.seq)

In [83]:
def complement(data):
    """Return the element-wise complement of ``data`` as a list.

    Every element of ``data`` is looked up in the module-level ``convert``
    table, so a symbol that is missing from the table raises ``KeyError``.
    The result keeps the input order and is a list, not a string.
    """
    return [convert[base] for base in data]

In [84]:
def reverse(data):
    """Return the items of ``data`` concatenated in reverse order as one string.

    ``data`` must be reversible (for example a string or a list) and its
    items must be strings.
    """
    backwards = reversed(data)
    return "".join(backwards)

In [85]:
def findOrfs(dna, st, numb, revert=False, min_length=100):
    """Collect the open reading frames found in one reading frame of ``dna``.

    The sequence is read in steps of three starting at offset ``st``. The
    first triplet found in the module-level ``start`` list opens an ORF and
    the next triplet found in ``stop`` closes it (the stop codon is part of
    the ORF). Start codons seen while an ORF is already open are ignored.

    Parameters:
        dna: the sequence to scan.
        st: offset of the reading frame (the first codon begins here).
        numb: number of ORFs already named elsewhere; names continue from
            ``"orf" + str(numb + 1)``.
        revert: when true, the scan runs on ``reverse(complement(dna))``
            and the frame label is prefixed with ``"-"``. Reported
            positions then index into that reverse complement, not into
            ``dna`` itself.
        min_length: an ORF is kept only if it is strictly longer than this
            many bases (default 100, the previously hard-coded threshold).

    Returns:
        A list of ``Orf`` objects (``Orf`` is not defined in this notebook;
        presumably it comes from the ``bio_utils`` star import), built from
        start, end, frame label, sequence slice, length and name.
    """
    frame = ("-" if revert else "") + str(st)
    data = reverse(complement(dna)) if revert else dna
    orfs = []
    orfstart = -1  # -1 means no start codon is currently pending
    for index in range(st, len(data), 3):
        codon = data[index:index + 3]
        if orfstart == -1:
            if codon in start:
                orfstart = index
        elif codon in stop:
            orfend = index + 3
            length = orfend - orfstart
            if length > min_length:
                name = "orf" + str(len(orfs) + numb + 1)
                orfs.append(Orf(orfstart, orfend, frame, data[orfstart:orfend], length, name))
            # The stop codon closes the candidate whether or not it was kept.
            orfstart = -1
    return orfs

In [86]:
def getAllOrfs(dna, start=0):
    """Return the ORFs of all six reading frames of ``dna``.

    The three frames of the given strand are scanned first, followed by
    the three frames of the reverse complement. ``start`` is the number of
    ORFs already named by earlier calls, so names continue without gaps.
    Inside this function the parameter shadows the module-level ``start``
    codon list; ``findOrfs`` still reads the module-level one.
    """
    found = []
    offset = start
    for on_reverse in (False, True):
        for frame in range(3):
            found.extend(findOrfs(dna, frame, offset, on_reverse))
            # Keep numbering continuous across frames and strands.
            offset = start + len(found)
    return found

In [87]:
# Gather the ORFs of every loaded sequence into one list. The count of
# ORFs collected so far is passed on so that names stay unique across
# sequences.
orfs = []
for seq in sequences:
    already_named = len(orfs)
    orfs += getAllOrfs(seq, already_named)

print("Number of orfs: {}".format(len(orfs)))
for o in orfs:
    print(o)

Number of orfs: 35
orf1 <Start: 1494, Stop: 1812, Length: 318, Frame: 0>
orf2 <Start: 2400, Stop: 2793, Length: 393, Frame: 0>
orf3 <Start: 2799, Stop: 2925, Length: 126, Frame: 0>
orf4 <Start: 3279, Stop: 3462, Length: 183, Frame: 0>
orf5 <Start: 4404, Stop: 4671, Length: 267, Frame: 0>
orf6 <Start: 4677, Stop: 4848, Length: 171, Frame: 0>
orf7 <Start: 4857, Stop: 5007, Length: 150, Frame: 0>
orf8 <Start: 391, Stop: 1264, Length: 873, Frame: 1>
orf9 <Start: 1684, Stop: 2644, Length: 960, Frame: 1>
orf10 <Start: 2863, Stop: 3088, Length: 225, Frame: 1>
orf11 <Start: 533, Stop: 962, Length: 429, Frame: 2>
orf12 <Start: 1946, Stop: 2117, Length: 171, Frame: 2>
orf13 <Start: 2261, Stop: 2372, Length: 111, Frame: 2>
orf14 <Start: 2990, Stop: 3200, Length: 210, Frame: 2>
orf15 <Start: 291, Stop: 393, Length: 102, Frame: -0>
orf16 <Start: 4092, Stop: 4305, Length: 213, Frame: -0>
orf17 <Start: 4962, Stop: 5112, Length: 150, Frame: -0>
orf18 <Start: 5169, Stop: 5280, Length: 111, Frame: -0>
o

In [88]:
# toRNAs is not defined in this notebook (presumably supplied by the bio_utils star import);
# presumably it converts each ORF's DNA into RNA — TODO confirm against bio_utils.
rnas = toRNAs(orfs)

In [89]:
# toProteins and writeToFile are not defined in this notebook (presumably from bio_utils);
# presumably this translates the RNAs and saves the result — TODO confirm.
# The output goes to proteins_out, which lives under output_path.
proteins = toProteins(rnas)
writeToFile(proteins_out, proteins)

In [90]:
# Print the nucleotide sequence of every ORF, each followed by a blank line.
for o in orfs:
    print("{}\n".format(o.value))

ATGGCGCAAAACCTTTCGCGGTATGGCATGATAGCGCCCGGAAGAGAGTCAATTCAGGGTGGTGAATGTGAAACCAGTAACGTTATACGATGTCGCAGAGTATGCCGGTGTCTCTTATCAGACCGTTTCCCGCGTGGTGAACCAGGCCAGCCACGTTTCTGCGAAAACGCGGGAAAAAGTGGAAGCGGCGATGGCGGAGCTGAATTACATTCCCAACCGCGTGGCACAACAACTGGCGGGCAAACAGTCGTTGCTGATTGGCGTTGCCACCTCCAGTCTGGCCCTGCACGCGCCGTCGCAAATTGTCGCGGCGATTAA

ATGTTATATCCCGCCGTTAACCACCATCAAACAGGATTTTCGCCTGCTGGGGCAAACCAGCGTGGACCGCTTGCTGCAACTCTCTCAGGGCCAGGCGGTGAAGGGCAATCAGCTGTTGCCCGTCTCACTGGTGAAAAGAAAAACCACCCTGGCGCCCAATACGCAAACCGCCTCTCCCCGCGCGTTGGCCGATTCATTAATGCAGCTGGCACGACAGGTTTCCCGACTGGAAAGCGGGCAGTAATTCCGGATCTGCATCGCAGGATGCTGCTGGCTACCCTGTGGAACACCTACATCTGTATTAACGAAGCGCTGGCATTGACCCTGAGTGATTTTTCTCTGGTCCCGCCGCATCCATACCGCCAGTTGTTTACCCTCACAACGTTCCAGTAA

ATGTTCATCATCAGTAACCCGTATCGTGAGCATCCTCTCTCGTTTCATCGGTATCATTACCCCCATGAACAGAAATCCCCCTTACACGGAGGCATCAGTGACCAAACAGGAAAAAACCGCCCTTAA

ATGCGGTGTGAAATACCGCACAGATGCGTAAGGAGAAAATACCGCATCAGGCGCTCTTCCGCTTCCTCGCTCACTGACTCGCTGCGCTCGGTCGTTCGGCTGCGGCGAGCGGTATCAGCTCACTCAAAGGCGGTAATACGGTTATCCACAGAATCAG

## Codon frequencies (alignment-free comparison)

In [91]:
# Zero-initialised counter for all 64 codons. The key order is fixed
# (middle base T, A, C, G; then first base T, C, A, G; then last base
# T, C, A, G) because the frequency vectors built from copies of this
# dict rely on every copy listing the codons in the same order.
CodonsDict = {
    first + middle + last: 0
    for middle in "TACG"
    for first in "TCAG"
    for last in "TCAG"
}

In [92]:
# Count how often each codon occurs in every ORF. Each ORF gets its own
# copy of the zeroed CodonsDict, so all counters share one key order.
all_codon_freqs = []
for o in orfs:
    sequence = str(o.value)
    counts = CodonsDict.copy()
    for offset in range(0, len(sequence), 3):
        counts[sequence[offset:offset + 3]] += 1
    all_codon_freqs.append(counts)

In [93]:
# Show the raw codon counts of the first ORF as a quick sanity check.
print(all_codon_freqs[0])

{'TTT': 0, 'TTC': 2, 'TTA': 2, 'TTG': 0, 'CTT': 1, 'CTC': 2, 'CTA': 0, 'CTG': 0, 'ATT': 1, 'ATC': 0, 'ATA': 2, 'ATG': 2, 'GTT': 2, 'GTC': 2, 'GTA': 1, 'GTG': 0, 'TAT': 1, 'TAC': 0, 'TAA': 1, 'TAG': 0, 'CAT': 1, 'CAC': 1, 'CAA': 3, 'CAG': 3, 'AAT': 1, 'AAC': 3, 'AAA': 1, 'AAG': 0, 'GAT': 3, 'GAC': 1, 'GAA': 5, 'GAG': 1, 'TCT': 1, 'TCC': 1, 'TCA': 2, 'TCG': 1, 'CCT': 1, 'CCC': 1, 'CCA': 2, 'CCG': 2, 'ACT': 1, 'ACC': 1, 'ACA': 3, 'ACG': 0, 'GCT': 2, 'GCC': 1, 'GCA': 2, 'GCG': 3, 'TGT': 4, 'TGC': 3, 'TGA': 0, 'TGG': 1, 'CGT': 5, 'CGC': 3, 'CGA': 1, 'CGG': 2, 'AGT': 2, 'AGC': 1, 'AGA': 2, 'AGG': 0, 'GGT': 3, 'GGC': 8, 'GGA': 4, 'GGG': 1}


In [94]:
# Turn every codon counter into a list of relative frequencies
# (count divided by the total number of codons in that ORF), keeping the
# counter's key order.
all_frequencies = []
for counts in all_codon_freqs:
    total = sum(counts.values())
    all_frequencies.append([count / total for count in counts.values()])

In [95]:
from scipy.spatial import distance
# Spot check: Euclidean distance between the codon-frequency vectors of the first two ORFs.
# NOTE(review): `dst` is not read by any later cell visible here.
dst = distance.euclidean(all_frequencies[0], all_frequencies[1])

In [96]:
def getScores(values):
    """Return the square matrix of pairwise Euclidean distances.

    ``values`` is a sequence of equal-length numeric vectors. Entry
    ``[i, j]`` of the returned NumPy array is the Euclidean distance
    between ``values[i]`` and ``values[j]``; every pair is computed,
    including the diagonal and both halves of the matrix.
    """
    count = len(values)
    matrix = np.zeros((count, count))
    for row, left in enumerate(values):
        for col, right in enumerate(values):
            matrix[row, col] = distance.euclidean(left, right)
    return matrix

In [97]:
# Pairwise Euclidean distances between the codon-frequency vectors of all ORFs.
scores = getScores(all_frequencies)

## Distances

In [98]:
import pandas as pd
from scipy.spatial import distance_matrix

# Wrap the score matrix in a DataFrame. `scores` is already a NumPy array
# (getScores builds it with np.zeros), so np.asarray makes no copy here.
data = np.asarray(scores)
df = pd.DataFrame(data)

In [99]:
# Pairwise distances between the ROWS of df (distance_matrix defaults to the Euclidean norm).
# NOTE(review): df already holds the pairwise codon-frequency distances from getScores,
# so this is a distance between distance profiles — confirm this second pass is intended.
distances = pd.DataFrame(distance_matrix(df.values, df.values))

In [100]:
# ORF names as a single-column array, ready to be placed in front of the
# distance matrix as row labels.
rows = np.array([orf.name for orf in orfs])[:, np.newaxis]

In [101]:
# Write the labelled matrix to phylip_out: the first line is the number of rows
# (comments='' stops savetxt from prefixing the header with '# '), then one line per
# ORF with its name followed by its distances. hstack mixes strings with floats, so
# every cell becomes a string, hence fmt='%s'.
# NOTE(review): names are not padded to a fixed width — confirm the target tool accepts that.
np.savetxt(phylip_out, np.hstack((rows, distances.values)), fmt='%s', header=str(len(rows)),comments='')

## Trees

#### Tree generator tool: http://www.trex.uqam.ca/index.php?action=trex&menuD=1&method=2

### Plasmid tree

![Plasmid tree](./img/plasmid_only.png)

### Plasmid and Test tree

![Plasmid and test tree](./img/with_test.png)