# COVID-19 analysis
<hr>

In [1]:
import Bio
from Bio import SeqIO # library used to parse the file
from Bio import Seq

#### First step: open the file and inspect its contents

In [2]:
# Walk over every record stored in the FASTA file and print its summary
# so we can see what the file contains.
fasta_records = SeqIO.parse("sequence.fasta", "fasta")
for info in fasta_records:
    print(info)

ID: NC_045512.2
Name: NC_045512.2
Description: NC_045512.2 Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome
Number of features: 0
Seq('ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGT...AAA', SingleLetterAlphabet())


In [3]:
# Load the record from the FASTA file into a variable so later cells can reuse it,
# then display it (the repr below shows it is a SeqRecord).
ncov = SeqIO.read("sequence.fasta", "fasta")
ncov

SeqRecord(seq=Seq('ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGT...AAA', SingleLetterAlphabet()), id='NC_045512.2', name='NC_045512.2', description='NC_045512.2 Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome', dbxrefs=[])

In [4]:
# Pull the DNA sequence (the `seq` attribute) out of the loaded record.
ncov_dna = ncov.seq

In [5]:
# Display the sequence object (the repr shown below is abbreviated with "...").
ncov_dna

Seq('ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGT...AAA', SingleLetterAlphabet())

In [6]:
# Length of the DNA sequence, i.e. the number of nucleotide letters in it.
len(ncov_dna)

29903

In [7]:
# Protein synthesis: DNA -> RNA -> protein.
# Step 1: transcribe the DNA sequence into RNA.
ncov_rna = ncov_dna.transcribe()

In [8]:
# Step 2: translate the RNA into a single amino-acid sequence, reading codons
# from position 0 onwards. Stop codons are kept and rendered as "*" so their
# positions remain visible in the result.
#
# The sequence length (29903) is not a multiple of three. Biopython warns about a
# trailing partial codon and states this may become an error, so trim the
# sequence to a whole number of codons explicitly. The dropped trailing bases
# could not form a codon, so the translated result is unchanged.
codon_aligned_len = len(ncov_rna) - len(ncov_rna) % 3
ncov_protien = ncov_rna[:codon_aligned_len].translate(stop_symbol="*")

#### Now we can analyze the protein composition of ncov

In [9]:
ncov_protien  # asterisks mark the positions where stop codons occur

Seq('IKGLYLPR*QTNQLSISCRSVL*TNFKICVAVTRLHA*CTHAV*LITNYCR*QD...KKK', HasStopCodon(ExtendedIUPACProtein(), '*'))

In [10]:
# Number of symbols in the translated sequence (amino-acid letters plus "*" stop
# symbols). Note this is NOT the number of proteins.
len(ncov_protien)

9967

In [11]:
# Split the translated sequence at every "*" (stop symbol) to obtain the
# stop-free amino-acid fragments, then display the resulting list.
ncov_aa = ncov_protien.split("*")
ncov_aa

Codon(ExtendedIUPACProtein(), '*')),
 Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('PCGYEVQL', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('TSNTRPC', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('HTRTSFCSNWNCRFRYVCFIKRITAKWYEWTYHIG', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('CFIRR', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('IYTF', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('CC', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('TMLRCYFPKCSEKNNQGYTPLVVTHNFDFTFSFSPEYSMVFVLFFV', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('KCLFTFCYGYYCYVCFCNDVCQT', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('ACISLFVFVTFSCHCSLF', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('YGLYAC', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('LGDAYYDMVGYG', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('Y', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('FVWF', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('AKRLCYVCISCSVTNPYDSKNCV', HasStopCodon(Exte

In [12]:
# Convert every fragment to a plain Python string.
# NOTE(review): this rebinds `ncov`, which until now held the record loaded from
# the FASTA file. Re-running the earlier `ncov.seq` cell after this one will fail;
# a distinct name would be safer, but the cells below read `ncov`.
ncov = list(map(str, ncov_aa))

In [13]:
# Print each fragment on its own line; an empty line is a zero-length fragment.
for fragment in ncov:
    print(fragment)

IKGLYLPR
QTNQLSISCRSVL
TNFKICVAVTRLHA
CTHAV
LITNYCR
QDTSNSSIFCRLLTVSSVLQPIISTSRFRPGVTER
DGEPCPWFQRENTRPTQFACFTGSRRARTWLWRLRGGGLIRGTSTS
RWHLWLSRS
KRRFAST
TALCVHQTFGCSNCTSWSCYG
AGSRTRRHSVRS
W
DTWCPCPSCGRNTSGLPQGSSS
ER

RSWWP
LRRRSKVI
LRRRAWH
SL
RFSRKLEH
T
QWCYP
THA
A
RRGIHSLCR
QLLWP
WLPS
VH
RPSSTCW
SFMHFVRTTGLY
H
EGCILLP
T
A
NCLVHGTF
KEL
IADTF
N
IGKEI
HLQWGMSKFCISLKFHNQDYSTKG
KEKA
WLYG
NSICLSSCVTK
MQPNVPFNSHEV
SLW
NFMADGRFC
SHLRILWH
EFD
RRCHYLWLLTPKCCC
NLLSSMSQFRSRT
A
SCRIP

IWLENHSS
GWSHYCLWRLCVLLCWLP
QVCLLGSTC
R
HRL
PYRCCWRRFRRS

QPS
NTPKRESQHQYCW
L
T

RDRHYFGIFFCFHKCFCGNCERFGL
SIQTNC
ILW
F
SYKRKS
KRCLEYW
TEINTESSLCICIRGCSCCTINFLPHS
NCSKFCACFTEGRYNNTRWNFTVFTETH
CYDVHI
FGY
QSSCNGLHYRWCCSVDFAVAN
HLWHCL
KTQTRP
LA
REV
GRCRVS
RRLGNC
IYLNLCL
NCRWTNCHLCKGN
GECSDIL
ACK
IFGFVC
LYHYWWS
T
SLEFR
NICHALKGIVQKVC
IQRRNWPTHASKSPKRNYLLRGRNTSHRSVNRGSCLENW
FTTIRTTY

SC
SSIGWYTSLY
RAYVARNQRHRKVLCPCT
YDGNKQYLHTQRRCTNKGYFW

HCDRSARLQECEYHF
T

KD

ST

EVLCLYS
TRYRSK
VRLCCGRCCHKNFATSI
ITYTTGH
FR
VEYGYILLI

VW

In [14]:
#store the amino acids into a df
import pandas as pd

In [15]:
# One row per fragment string, in a single column named "Amino Acids".
df = pd.DataFrame(ncov, columns=["Amino Acids"])

In [16]:
# Preview the first rows of the fragment table.
df.head()

Unnamed: 0,Amino Acids
0,IKGLYLPR
1,QTNQLSISCRSVL
2,TNFKICVAVTRLHA
3,CTHAV
4,LITNYCR


In [17]:
# Extend the table with a "count" column holding the length of each fragment
# string (the number of amino-acid letters in it).
fragment_lengths = df["Amino Acids"].str.len()
df["count"] = fragment_lengths

In [18]:
# Display the table, now including the "count" column.
df

Unnamed: 0,Amino Acids,count
0,IKGLYLPR,8
1,QTNQLSISCRSVL,13
2,TNFKICVAVTRLHA,14
3,CTHAV,5
4,LITNYCR,7
...,...,...
770,SHIAIFNQCVTLGRT,15
771,KSHHIFTEATRSTIECTVNNARESCLYGRALMCKINFSSAIPM,43
772,F,1
773,,0


In [19]:
# The five longest fragments, ranked by the "count" column.
df.nlargest(n=5, columns="count")

Unnamed: 0,Amino Acids,count
548,CTIVFKRVCGVSAARLTPCGTGTSTDVVYRAFDIYNDKVAGFAKFL...,2701
694,ASAQRSQITLHINELMDLFMRIFTIGTVTLKQGEIKDATPSDFVRA...,290
719,TNMKIILFLALITLATCELYHYQECVRGTTVLLKEPCSSGTYEGNS...,123
695,AQADEYELMYSFVSEETGTLIVNSVLLFLAFVVFLLVTLAILTALR...,83
718,QQMFHLVDFQVTIAEILLIIMRTFKVSIWNLDYIINLIIKNLSKSL...,63


In [20]:
# The ten shortest fragments, ranked by the "count" column. Empty fragments
# (count 0) are included, so they fill this listing.
df.nsmallest(n=10, columns="count")

Unnamed: 0,Amino Acids,count
14,,0
52,,0
59,,0
64,,0
93,,0
98,,0
101,,0
103,,0
105,,0
112,,0


In [21]:
from collections import Counter

# Tally every symbol of the translated sequence and show the ten most frequent.
# The "*" stop symbol is tallied together with the amino-acid letters.
symbol_counts = Counter(ncov_protien)
symbol_counts.most_common(10)

[('L', 886),
 ('S', 810),
 ('*', 774),
 ('T', 679),
 ('C', 635),
 ('F', 593),
 ('R', 558),
 ('V', 548),
 ('Y', 505),
 ('N', 472)]

In [32]:
from Bio.PDB import PDBParser,MMCIFParser


In [35]:
# Parse the PDB file "6lu7.pdb" into a structure object labelled "6lu7".
parser = PDBParser()
structure = parser.get_structure(id="6lu7", file="6lu7.pdb")

In [36]:
# Display the parsed structure object.
structure

<Structure id=6lu7>

In [37]:
# Take the entry with key 0 from the structure; the chain loop below iterates over it.
model = structure[0]

In [38]:
# List every chain in the selected model together with its identifier.
for chain in model:
    chain_summary = f'chain {chain},chain_ID: {chain.id}'
    print(chain_summary)

chain <Chain id=A>,chain_ID: A
chain <Chain id=C>,chain_ID: C


In [40]:
# Walk the structure hierarchy top-down: print each model, each chain inside it,
# and then every atom of that chain (visited residue by residue).
# Note: this prints one line per atom, so the output is very long.
for model in structure:
    print(model)
    for chain in model:
        print(chain)
        for atom in chain.get_atoms():
            print(atom)

tom CG>
<Atom CD>
<Atom NE>
<Atom CZ>
<Atom NH1>
<Atom NH2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom CD>
<Atom CE>
<Atom NZ>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom OG>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom OD1>
<Atom ND2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom ND1>
<Atom CD2>
<Atom CE1>
<Atom NE2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom OD1>
<Atom ND2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom CD1>
<Atom CD2>
<Atom CE1>
<Atom CE2>
<Atom CZ>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom CD1>
<Atom CD2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG1>
<Atom CG2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom CD>
<Atom OE1>
<Atom NE2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom OD1>
<Atom ND2>
<Atom N>
<Atom CA>
<Atom C

## Visualize Structure

In [None]:
# TODO: structure visualization is not implemented yet (still learning how to do it).
# Coming soon.