날짜: 2021/02/23                     
작성자: 장우혁                      
내용: sequences.csv에 변이를 추가하는 내용. 출력 파일 mutations.csv

In [1]:
from Bio import AlignIO
from Bio import SeqIO
import pandas as pd

## 1. 코로나 S 단백질 아미노산 서열 데이터 

In [2]:
# ref seq
refseq = ()
with open("../../data/before_preprocessing/refseq.fasta", "r", encoding='utf-8') as file:
    refseq = SeqIO.read(file, "fasta").seq
print(refseq)

MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQDVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITGR

In [3]:
# 파일 열기
with open("../../data/before_preprocessing/alignments.fas", "r", encoding='utf-8') as file:
    alignments = AlignIO.read(file, "fasta")

# accession code 저장
accessions = [i.id[:8] for i in alignments]
print(accessions[0])

# 길이 확인
length = len(alignments)
print(length)  # 40939 sequences


QRJ61643
40939


In [4]:
# 돌연변이 찾기
mutations = {}   # {돌연변이 아미노산 위치: 돌연변이가 나타나는 seq의 위치}

# 키 추가, 바이러스 추가

for seq_num, record in enumerate(alignments):
    for idx in range(len(refseq)):
        if record[idx] != refseq[idx]:
            mutation_name = f"{refseq[idx]}{idx+1}{record[idx]}"
            
            if mutation_name in mutations.keys():
                mutations[mutation_name].append(seq_num)
            else:
                mutations[mutation_name] = []

print(mutations['I402V'])


[12635, 12636, 12637, 12638, 12639, 12640, 12641, 12648]


In [5]:
# 특정 기준 이상 출현하는 돌연변이 추출 (20개 이상)
min_count = 20
mutation_count = [ (mutation_name, len(list)) for mutation_name, list in mutations.items() if len(list) > min_count and mutation_name[-1] != 'X']
mutation_list = [i[0] for i in mutation_count]

idx = 0
for mutation, count in mutation_count:
    print(f"{mutation}:\t{count}회")

    idx += 1
    if idx > 10:
        break

D614G:	37098회
T1117I:	42회
V1228L:	112회
G142S:	40회
S939F:	78회
N501T:	117회
L5F:	522회
L18F:	115회
M1229I:	49회
Q677P:	101회
P330S:	26회


In [9]:
print(len(mutation_list))

134


## 2. Sequences.csv 파일

코로나 아미노산 염기서열에 대한 메타데이터를 담고 있는 파일이다.

In [6]:
# csv 파일 읽어오기
df = pd.read_csv("../../data/before_preprocessing/sequences.csv")
print(len(df))

40840


In [7]:
# 열 추가
for mutation in mutation_list:
    names = [alignments[i].id[:8] for i in mutations[mutation]]
    df[mutation] = pd.Series(df["Accession"].isin(names))

df

Unnamed: 0.1,Unnamed: 0,Accession,Species,Length,Nuc_Completeness,Protein,Geo_Location,Isolation_Source,Collection_Date,D614G,...,Q52H,V1164F,A1070S,P1112L,G181R,T632N,A930V,V320I,L822F,A845D
0,100,QRU93230,Severe acute respiratory syndrome-related coro...,1273,complete,surface glycoprotein,USA: Massachusetts,,2021-01-16,True,...,False,False,False,False,False,False,False,False,False,False
1,101,QRU93242,Severe acute respiratory syndrome-related coro...,1273,complete,surface glycoprotein,USA: Massachusetts,,2021-01-17,True,...,False,False,False,False,False,False,False,False,False,False
2,102,QRU93254,Severe acute respiratory syndrome-related coro...,1273,complete,surface glycoprotein,USA: Massachusetts,,2021-01-18,True,...,False,False,False,False,False,False,False,False,False,False
3,103,QRU93266,Severe acute respiratory syndrome-related coro...,1273,complete,surface glycoprotein,USA: Massachusetts,,2021-01-27,True,...,False,False,False,False,False,False,False,False,False,False
4,104,QRU93278,Severe acute respiratory syndrome-related coro...,1273,complete,surface glycoprotein,USA: Massachusetts,,2021-01-16,True,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40835,41802,QHO62877,Severe acute respiratory syndrome-related coro...,1273,complete,surface glycoprotein,USA: Illinois,"lung, oronasopharynx",2020-01-21,False,...,False,False,False,False,False,False,False,False,False,False
40836,41803,QHN73795,Severe acute respiratory syndrome-related coro...,1273,complete,surface glycoprotein,China: Shenzhen,oronasopharynx,2020-01-10,False,...,False,False,False,False,False,False,False,False,False,False
40837,41804,QHN73810,Severe acute respiratory syndrome-related coro...,1273,complete,surface glycoprotein,China,"lung, oronasopharynx",2020-01-11,False,...,False,False,False,False,False,False,False,False,False,False
40838,41805,QHO60594,Severe acute respiratory syndrome-related coro...,1273,complete,surface glycoprotein,USA,oronasopharynx,2020-01-19,False,...,False,False,False,False,False,False,False,False,False,False


In [8]:
# df 파일에 저장
df.to_csv('../../data/after_preprocessing/mutations.csv')