# Randmisation


* import random module which depends on pseudo random num generator() (PRNG)
* PRNG is essentially any algo that generates random but reproducible data.

In [1]:
# random methods

import random 

x = random.random() # no arguments req; generates random float between 0.0-1.0
print(x)
x = random.randint(1,100) # two arguments req, start and stop; generates random int between given integer range
print(x)
x = random.randrange(1,10) # two arguments req, start stop and step; selects random element from given range
print(x)
x = random.choice('computer') # one arguments req; selects random element from given iterable; IF EMPTY--->IndexError
print(x)
x = random.choices([11,22,33,44,55,66],k=3) # two arguments req; selects multiple random elements from given iterable; IF EMPTY--->IndexError; k = number of elements to be chosen
print(x)
x = random.sample([11,22,33,44,55,66], 4) # two arguments req; sampling without replacement from given iterable; second argument defined the number of uique choices
print(x)
x = random.shuffle([11,22,33,44,55,66]) # randomly shuffles the iterable
print(x)


0.8615351860931483
55
6
r
[11, 44, 55]
[11, 66, 44, 55]
None


In [2]:
# seed method
# initialises RNG (intialise by customised seed value) cuz it requires seed value to start random no generation
# by default it uses CURRENT SYSTEM TIME
import random
random.seed(10)
print(random.random())
# output float stayes the same

0.5714025946899135


* ITERATOR FUNCTION

    * implicitly used when dealing with iterables
    * usual method is to use for loop (MOST BASIC LOOP -- WHILE LOOP)
    * syntax: ```iter()```
    * object representing data stream, returns one element at a time
    * ```next()```: returns next element in iterator till end (WHEN NO MORE VALUES----> StopIteration error)
    * Iterable object: object from which we get an iterator

In [3]:
# iter()

l1 = [11,22,33,44,55,66]
l1_it = iter(l1)
# OR U CAN DO

while True:
    try:
        ele = next(l1_it)
        print(ele)
    except StopIteration:
        break

11
22
33
44
55
66


* GENERATOR FUNCTION
    * function returning iterator obj with sequence of values
    * ```yield``` statement is used
    * cant include return keyword; if done FUNCTION TERMINATES
    * diff yeild and return: 
        * yield: gives value and pasues the execution and internal states are maintained
        * return: gives value and terminates the execution
    * ADV: dynamic element generation(next is generated only if previous one is consumed)

In [4]:
# GENERATOR FUNCTION

def mygenerator():
    print("first")
    yield 10
    print("second")
    yield 20
    print("third")
    yield 30

gen = mygenerator()
while True:
    try:
        print(next(gen))
    except StopIteration:
        break


first
10
second
20
third
30


In [5]:
# GENERATOR FUNCTION WITH return
 
def mygenerator():
    print("first")
    yield 10
    return 

    print("second")
    yield 20
    print("third")
    yield 30

gen = mygenerator()
while True:
    try:
        print(next(gen))
    except StopIteration:
        break
# on the second next(gen)
# THIS WILL BE THE ERROR
'''---------------------------------------------------------------------------
StopIteration                             Traceback (most recent call last)
Cell In[21], line 1
----> 1 next(gen)

StopIteration:'''

first
10


'---------------------------------------------------------------------------\nStopIteration                             Traceback (most recent call last)\nCell In[21], line 1\n----> 1 next(gen)\n\nStopIteration:'

In [6]:
# get sequence upto a range

def getsequpto(x):
    for i in range(x):
        yield i

seq = getsequpto(5)
while True:
    try:
        print(next(seq))
    except StopIteration:
        break
next(seq)

0
1
2
3
4


StopIteration: 

In [27]:
# get square of sequence upto a range

def getsequpto(x):
    for i in range(x):
        yield i*i

seq = getsequpto(5)
while True:
    try:
        print(next(seq))
    except StopIteration:
        break


0
1
4
9
16


# Biopython

* INTRO:

    * open source mol bio project for python (http://www.biopython.org)
    * functionality:
        * parsing bioinfo files into usable DS (Fasta, clustalw, GenBank, Pubmed, Medline, UniGene, etc); indexing, iterating each records and accessing done via dictionary interface
        * codes that can deal with online bioinfo destinations, alignments
        * codes that can create and deal with subs matrix 
        * interface to common bioinfo progs like (NCBIs standalone blast, clustalw, EMBOSS command line tool)
        * common operations on seqs(transl, transcrip,weight calc)
        * data classification using ML tools
        * codes that can carryout parallelisation of compatible tasks
        * codes GUI programs for basic sequence manipulation
        * Extensive documentation; cookbook style
        * integration with BioSQL (seq DB schema)
        * creates tree-view type files
    * installation: ```pip install biopython```
    * update: ```pip install --upgrade biopython```
    * goals:
        * provide simple/std/extensive access to bioinfo through python using:
            * high qual/reusable modules and scripts
            * fast array modification (used in Cluster Code, PDC, NaiveBayes, MarkovModel)
            * Genome Data Analysis

        

![image.png](attachment:image.png)

In [None]:
# BASIC OPERATIONS

from Bio.Seq import Seq

s1 = Seq("ATCG") # just like string; doesnt represent any sequence: generic 
s1 # outuput is different when print() is not used

Seq('ATCG')

In [33]:
print(s1)

ATCG


![image.png](attachment:image.png)
![image-2.png](attachment:image-2.png)
![image-3.png](attachment:image-3.png)
![image-4.png](attachment:image-4.png)
![image-5.png](attachment:image-5.png)

In [50]:
# Accessing elements in sequence

from Bio.Seq import Seq
seq_str = Seq("AGCTGATGCT")
print(seq_str[0])
print(seq_str[0:2])
print(seq_str[:3])
print(seq_str[:])

# length and count
print("length:",len(seq_str))
print("Count:", seq_str.count("A"))

# Add two seqs
seq_str_2 = Seq("ATCGCTAGCT")
print(seq_str+seq_str_2)

# case change
seq_str_3 = Seq("atctaggctagct")
print(seq_str_3.upper())

# can use membership and identity operator

# find the letter
print(seq_str_2.find("T")) # first occurance returned

# can use split and strip

# Complement and reverse compliment
print("OG sequence: ",seq_str_2)
print("Complement of OG:",seq_str_2.complement())
print("reverse complement of OG:",seq_str_2.reverse_complement())

# GC content by GC function
from Bio.SeqUtils import gc_fraction
print("GC content is: ",gc_fraction(seq_str)*100)

# transcribe and reverse transcribe
from Bio.Seq import transcribe
rna_seq_str = transcribe(seq_str)
print("transcribed DNA",rna_seq_str)
print("reverse transcribed RNA",rna_seq_str.back_transcribe())


A
AG
AGC
AGCTGATGCT
length: 10
Count: 2
AGCTGATGCTATCGCTAGCT
ATCTAGGCTAGCT
1
OG sequence:  ATCGCTAGCT
Complement of OG: TAGCGATCGA
reverse complement of OG: AGCTAGCGAT
GC content is:  50.0
transcribed DNA AGCUGAUGCU
reverse transcribed RNA AGCTGATGCT


![image.png](attachment:image.png)
![image-2.png](attachment:image-2.png)
![image-3.png](attachment:image-3.png)
![image-4.png](attachment:image-4.png)
![image-5.png](attachment:image-5.png)

* Parsing different file formats

- DB file formats: FASTA, GenBank, PDB
- Sequence Alignment File Formats: BLAST, Clustal-W


In [10]:
#  DB file format parsing

#FASTA: id and seq

from Bio import SeqIO

seq1 = SeqIO.parse("PSA_MSA.txt", "demo_fasta")

for i in seq1:
    print(i.id)
    print(i.seq)

ACO71758.1
SRQSFWAELNVARLDHQNVVRVIAASTCSPAGQDSLGTIIMEYVGNSTLHDVIYGTNWVTVKRKDVGLGCGRESLSLAQSLRYSCDIVAGLVFLHSRLIVHLDLKPANIFITEQ
ACO71757.1
SRQSFWAELNVARLDHQNVVRVIAASTCSPAGQDSLGTIIMEYVGNSTLHDVIYGTNWVTVKRKDVXLGCGRESLSLXQSLRYSCDIVAGLVFLHSRLIVHLDLKPANIFITEQ
ACO71756.1
SRQSFWAELNVARLDHQNVVRVIAASTCSPAGQDSLGTIIMEYVGNSTLHDVIYGTNWVTVKRKDVGLGCGRESLSLAQSLRYSCDIVAGLVFLHSRLIVHLDLKPANIFITEQ
ACO71755.1
SRQSFWAELNVARLXHQNVVRVIAASTCSPAGQDSLGTIIMEYVGNSTLHHVIYGTNWVTAKRKDDGLGCGRESLSLAQSLRYSCDIVAGLVFLHSQFIVHLDLKPANIFITEQ
ACO71754.1
SRQSFWAELNVARLDHQNVVRVIAASTCSPAGQDSLGTIIMEYVGNSTLHHVIYGTNWVTAKRKDDGLGCGRESLSLAQSLRYSCDIVAGLVFLHSQLIVHLDLKPANIFITEQ
ACO71752.1
SRQSFWAELNVARLDHQNVVRVIAASTCSPAGQDSLGTIIMEYVGNSTLHHVIYGTNWVTAKRKDDGLGCGRESLSLAQSLRYSCDIVAGLVFLHSQLIVHLDLKPANIFITEQ
ACO71751.1
SRQSFWAELNVARLDHQNVVRVIAASTCSPAGQDSLGTIIMEYVGNSTLHHVIYGTNWVTAKRKDDGLGCGRESLSLAQSLRYSCDIVAGLVFLHSQLIVHLDLKPANIFITEQ
ACO71746.1
LRQSFWAELNVARLDHQNVVRVIAASTCSPAGQDSLGTIIMEYVGNSTLHDVIYRTNWVTAKRKDGGLGCGRESLSLAQSLRYSCDIVAGLVFLHSQLIVHLDLKPA

In [13]:
#  DB file format parsing

#GenBank: id, name, description, features(type, location, qualifiers), seq 

from Bio import SeqIO

seq1 = SeqIO.parse("sequence.gb", "demo_genbank")

for i in seq1:
    print(i.id)
    print(i.name)
    print(i.description)
    print(i.features)
    print(i.seq)

# for features

    for j in i.features:
        print(j.type)
        print(j.location)
        print(j.qualifiers)

EU569685.1
EU569685
Homo sapiens isolate E03_350-12f cytochrome c oxidase subunit II (COX2) gene, partial cds; mitochondrial
[SeqFeature(SimpleLocation(ExactPosition(0), ExactPosition(342), strand=1), type='source', qualifiers=...), SeqFeature(SimpleLocation(BeforePosition(0), ExactPosition(269), strand=1), type='gene', qualifiers=...), SeqFeature(SimpleLocation(BeforePosition(0), ExactPosition(269), strand=1), type='CDS', qualifiers=...)]
ACAATCGAGTAGTACTCCCGATTGAAGCCCCCATTCGTATAATAATTACATCACAAGACGTCTTGCACTCATGAGCTGTCCCCACATTAGGCTTAAAAACAGATGCAATTCCCGGACGTCTAAACCAAACCACTTTCACCGCTACACGACCGGGGGTATACTACGGTCAATGCTCTGAAATCTGTGGAGCAAACCACAGTTTCATGCCCATCGTCCTAGAATTAATTCCCCTAAAAATCTTTGAAATAGGGCCCGTATTTACCCTATAGCACCCCCTCTACCCCCTCTAGAGCCCACTGTGAAGCTAACTTAGCATTAACCTTTTAAGTTAAAGATTAAGAG
source
[0:342](+)
{'organism': ['Homo sapiens'], 'organelle': ['mitochondrion'], 'mol_type': ['genomic DNA'], 'isolate': ['E03_350-12f'], 'isolation_source': ['severe oligoasthenospermic patient'], 'db_xref': ['ta

In [None]:
#  DB file format parsing

#PDB:  has heirarchy:  structure-->model--->chain---->residue--->atom

from Bio.PDB import PDBParser

seq1 = PDBParser().get_structure("demo_PDB",'1a7f.pdb')

for model in seq1:
    for chain in model:
        for residue in chain:
            for atom in residue:
                print(atom)

In [36]:
# Sequence alignment file format parsing


# BLAST: alignments, hits

from Bio import Blast

seq1 = Blast.parse("JR71D09B016-Alignment.xml")

for i in seq1:
    print(i[0]) # first alignment
    for j in i[0]: # to see hits in the first alignment
        print(j)
        print("all the targets in hits")
        print(j.target)

Query: Query_5439191
       sample
  Hit: ref|XP_004627141.1| (length=109)
       insulin [Octodon degus] >sp|P17715.2| RecName: Full=Insulin; Contains:
       RecName: Full=Insulin B chain; Contains: RecName: Full=Insulin A chain;
       Flags: Precursor [Octodon degus] >gb|AAA40590.1| insulin [Octodon degus]
       >gb|UNP61771.1| insulin preproprotein [Octodon degus]
 HSPs: ----  --------  ---------  ------  ---------------  ---------------------
          #   E-value  Bit score    Span      Query range              Hit range
       ----  --------  ---------  ------  ---------------  ---------------------
          0   1.2e-74     227.25     109          [0:109]                [0:109]
Query : Query_5439191 Length: 109 Strand: Plus
        sample
Target: ref|XP_004627141.1| Length: 109 Strand: Plus
        insulin [Octodon degus] >sp|P17715.2| RecName: Full=Insulin; Contains:
        RecName: Full=Insulin B chain; Contains: RecName: Full=Insulin A chain;
        Flags: Precursor [Oct

In [37]:
# Sequence alignment file format parsing


# Clustal-W: id and seq 

from Bio import AlignIO

seq1 = list(AlignIO.parse("insulin.aln","clustal")) # TYPECAST TO LIST!!!!!!!!!!!!!!!

first_align=seq1[0]
#print(first_align)
for i in first_align:
    print(i.seq)
    print(i.id)



MAVWLQAGALLVLLVVSS-VSTNPGTPQHLCGSHLVDALYLVCGPTGFFYNPKRDVEPLLGFLP-PKSAQETEVADFAFKDHAELIRKRGIVEQCCHKPCSIFELQNYCN--
NP_571131.1
MAPWMHLLTVLALLALWGPNSVQAYSSQHLCGSNLVEALYMTCGRSGF-YRPHDRRELEDLQVEQAELG--LEAGGLQPSALEMILQKRGIVDQCCNNICTFNQLQNYCNVP
AAA40590.1
MASLAALLPLLALLVLCRLDPAQAFVNQHLCGSHLVEALYLVCGERGFFYTPKSRREVEELQVGQAELGGGPGAGGLQPSALELALQKRGIVEQCCTSICSLYQLENYCN--
AAA19033.1
MALWTRLLALLALLALGAPTPARAFANQHLCGSHLVEALYLVCGERGFFYTPKARREVEDTQVGGVELGGGPGAGGLQPLGPEGRPQKRGIVEQCCASVCSLYQLENYCN--
KAB1251309.1
MALWTRLVPLLALLALWAPAPAHAFVNQHLCGSHLVEALYLVCGERGFFYTPKARREVEGPQVGALELAGGPGAGGL-----EGPPQKRGIVEQCCAGVCSLYQLENYCN--
AAB60625.1
